From a7a3021e104f06aca4ce33b308fc9a68a21a55c5 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Tue, 6 Dec 2022 14:30:46 +0100 Subject: [PATCH 001/286] Added integration to generate C code able to execute neural networks on the Gemmini accelerator --- .gitmodules | 3 + 3rdparty/gemmini | 1 + CMakeLists.txt | 2 + apps/microtvm/gemmini/README.md | 3 + .../template_project/crt_config/crt_config.h | 57 ++ .../template_project/microtvm_api_server.py | 386 ++++++++ .../gemmini/template_project/src/Makefile.in | 34 + .../gemmini/template_project/src/Makefrag | 25 + .../gemmini/template_project/src/add.c | 69 ++ .../gemmini/template_project/src/conv2d.c | 67 ++ .../gemmini/template_project/src/dense.c | 67 ++ .../gemmini/template_project/src/dwconv2d.c | 67 ++ .../src/makefiles/add/Makefile | 68 ++ .../src/makefiles/conv2d/Makefile | 68 ++ .../src/makefiles/dense/Makefile | 68 ++ .../src/makefiles/dwconv2d/Makefile | 68 ++ .../src/makefiles/maxpool2d/Makefile | 68 ++ .../src/makefiles/mobilenet/Makefile | 68 ++ .../gemmini/template_project/src/maxpool2d.c | 67 ++ .../gemmini/template_project/src/mobilenet.c | 127 +++ cmake/modules/contrib/Gemmini.cmake | 117 +++ python/tvm/autotvm/measure/measure_methods.py | 12 + python/tvm/contrib/gemmini/__init__.py | 32 + python/tvm/contrib/gemmini/build_module.py | 201 ++++ python/tvm/contrib/gemmini/environment.py | 386 ++++++++ python/tvm/contrib/gemmini/helpers.py | 188 ++++ python/tvm/contrib/gemmini/intrin.py | 873 ++++++++++++++++++ python/tvm/contrib/gemmini/legalize.py | 595 ++++++++++++ python/tvm/contrib/gemmini/pattern_table.py | 469 ++++++++++ python/tvm/contrib/gemmini/transform.py | 816 ++++++++++++++++ .../networks/mobilenet-tutorial.ipynb | 311 +++++++ .../tutorials/networks/mobilenet_utils.py | 138 +++ .../single_operators/add-tutorial.ipynb | 395 ++++++++ .../single_operators/conv2d-tutorial.ipynb | 378 ++++++++ .../single_operators/dense-tutorial.ipynb | 378 ++++++++ .../single_operators/dwconv2d-tutorial.ipynb | 373 ++++++++ .../single_operators/maxpool2d-tutorial.ipynb | 378 ++++++++ python/tvm/contrib/gemmini/utils.py | 142 +++ python/tvm/micro/build.py | 1 + python/tvm/micro/model_library_format.py | 2 +- .../relay/backend/contrib/gemmini/__init__.py | 23 + .../backend/contrib/gemmini/gemmini_add.py | 214 +++++ .../contrib/gemmini/gemmini_conv2d_cisc.py | 244 +++++ .../backend/contrib/gemmini/gemmini_dense.py | 377 ++++++++ .../contrib/gemmini/gemmini_dense_cisc.py | 137 +++ .../gemmini/gemmini_depthwise_conv2d_cisc.py | 227 +++++ .../contrib/gemmini/gemmini_max_pool2d.py | 148 +++ .../tvm/relay/backend/contrib/gemmini/op.py | 286 ++++++ python/tvm/tir/transform/transform.py | 11 + src/relay/op/contrib/gemmini/add.cc | 134 +++ src/relay/op/contrib/gemmini/convolution.cc | 221 +++++ .../contrib/gemmini/depthwise_convolution.cc | 159 ++++ src/relay/op/contrib/gemmini/gemm.cc | 125 +++ src/relay/op/contrib/gemmini/max_pool2d.cc | 121 +++ src/target/metadata_module.cc | 3 +- src/target/source/codegen_c_host.cc | 3 + src/tir/ir/stmt.cc | 3 +- .../inject_gemmini_pointer_correction.cc | 131 +++ 58 files changed, 10132 insertions(+), 3 deletions(-) create mode 160000 3rdparty/gemmini create mode 100644 apps/microtvm/gemmini/README.md create mode 100644 apps/microtvm/gemmini/template_project/crt_config/crt_config.h create mode 100644 apps/microtvm/gemmini/template_project/microtvm_api_server.py create mode 100644 apps/microtvm/gemmini/template_project/src/Makefile.in create mode 100644 apps/microtvm/gemmini/template_project/src/Makefrag 
create mode 100644 apps/microtvm/gemmini/template_project/src/add.c create mode 100644 apps/microtvm/gemmini/template_project/src/conv2d.c create mode 100644 apps/microtvm/gemmini/template_project/src/dense.c create mode 100644 apps/microtvm/gemmini/template_project/src/dwconv2d.c create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile create mode 100644 apps/microtvm/gemmini/template_project/src/maxpool2d.c create mode 100644 apps/microtvm/gemmini/template_project/src/mobilenet.c create mode 100644 cmake/modules/contrib/Gemmini.cmake create mode 100644 python/tvm/contrib/gemmini/__init__.py create mode 100644 python/tvm/contrib/gemmini/build_module.py create mode 100644 python/tvm/contrib/gemmini/environment.py create mode 100644 python/tvm/contrib/gemmini/helpers.py create mode 100644 python/tvm/contrib/gemmini/intrin.py create mode 100644 python/tvm/contrib/gemmini/legalize.py create mode 100644 python/tvm/contrib/gemmini/pattern_table.py create mode 100644 python/tvm/contrib/gemmini/transform.py create mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py create mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb create mode 100644 python/tvm/contrib/gemmini/utils.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/__init__.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_add.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py create mode 100644 python/tvm/relay/backend/contrib/gemmini/op.py create mode 100644 src/relay/op/contrib/gemmini/add.cc create mode 100644 src/relay/op/contrib/gemmini/convolution.cc create mode 100644 src/relay/op/contrib/gemmini/depthwise_convolution.cc create mode 100644 src/relay/op/contrib/gemmini/gemm.cc create mode 100644 src/relay/op/contrib/gemmini/max_pool2d.cc create mode 100644 src/tir/transforms/inject_gemmini_pointer_correction.cc diff --git a/.gitmodules b/.gitmodules index 66fd0390cf35..64c1a30050bc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ [submodule "3rdparty/OpenCL-Headers"] path = 3rdparty/OpenCL-Headers url = https://github.com/KhronosGroup/OpenCL-Headers.git +[submodule "3rdparty/gemmini"] + path = 3rdparty/gemmini + url = 
https://github.com/ucb-bar/gemmini
diff --git a/3rdparty/gemmini b/3rdparty/gemmini
new file mode 160000
index 000000000000..b6bdad59cbd6
--- /dev/null
+++ b/3rdparty/gemmini
@@ -0,0 +1 @@
+Subproject commit b6bdad59cbd6313f1ea4c93d3493db3d59b9e418
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 736d516fa1f6..47499ff90356 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -492,6 +492,7 @@ include(cmake/modules/Micro.cmake)
 include(cmake/modules/contrib/EthosN.cmake)
 include(cmake/modules/contrib/CMSISNN.cmake)
 include(cmake/modules/contrib/EthosU.cmake)
+include(cmake/modules/contrib/Gemmini.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
 include(cmake/modules/contrib/CODEGENC.cmake)
 include(cmake/modules/contrib/DNNL.cmake)
@@ -574,6 +575,7 @@ if(USE_MICRO)
   # Unix Makefiles generator, need to add these explicit target-level dependency)
   add_dependencies(tvm_runtime zephyr)
   add_dependencies(tvm_runtime arduino)
+  add_dependencies(tvm_runtime gemmini)
   if(MSVC)
     target_link_libraries(tvm PRIVATE host_standalone_crt )
     target_link_libraries(tvm_runtime PRIVATE host_standalone_crt)
diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md
new file mode 100644
index 000000000000..11fea3415b70
--- /dev/null
+++ b/apps/microtvm/gemmini/README.md
@@ -0,0 +1,3 @@
+This directory contains the template project used by microTVM to generate C code that executes neural networks on the Gemmini accelerator. The generated example projects are then run on the Spike RISC-V ISA simulator.
+
+To use this integration, the Spike simulator has to be installed first. This can be done by following the steps described in the Chipyard repository.
diff --git a/apps/microtvm/gemmini/template_project/crt_config/crt_config.h b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h
new file mode 100644
index 000000000000..b3126cfac920
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief CRT configuration for the host-linked CRT.
+ */
+#ifndef TVM_RUNTIME_MICRO_CRT_CONFIG_H_
+#define TVM_RUNTIME_MICRO_CRT_CONFIG_H_
+
+/*! Log level of the CRT runtime */
+#define TVM_CRT_LOG_LEVEL TVM_CRT_LOG_LEVEL_DEBUG
+
+/*! Support low-level debugging in MISRA-C runtime */
+#define TVM_CRT_DEBUG 0
+
+/*! Maximum supported dimension in NDArray */
+#define TVM_CRT_MAX_NDIM 6
+/*! Maximum supported arguments in generated functions */
+#define TVM_CRT_MAX_ARGS 10
+/*! Maximum supported string length in dltype, e.g. "int8", "int16", "float32" */
+#define TVM_CRT_MAX_STRLEN_DLTYPE 10
+/*! Maximum supported string length in function names */
+#define TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120
+/*! Maximum supported string length in parameter names */
+#define TVM_CRT_MAX_STRLEN_PARAM_NAME 80
+
+/*!
Maximum number of registered modules. */
+#define TVM_CRT_MAX_REGISTERED_MODULES 2
+
+/*! Size of the global function registry, in bytes. */
+#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512
+
+/*! Maximum packet size, in bytes, including the length header. */
+#define TVM_CRT_MAX_PACKET_SIZE_BYTES 8 * 1024
+
+/*! \brief Maximum length of a PackedFunc function name. */
+#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30
+
+// #define TVM_CRT_FRAMER_ENABLE_LOGS
+
+#endif  // TVM_RUNTIME_MICRO_CRT_CONFIG_H_
diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py
new file mode 100644
index 000000000000..f4d4f7eb5e89
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py
@@ -0,0 +1,386 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+MicroTVM API Server for Gemmini baremetal tests on the Spike simulator
+======================================================================
+**Author**: `Federico Peccia `_
+"""
+
+import json
+import logging
+import os
+import os.path
+import pathlib
+import re
+import shutil
+import subprocess
+import tarfile
+import tempfile
+from string import Template
+from distutils.dir_util import copy_tree
+
+from tvm.micro.project_api import server
+
+_LOG = logging.getLogger(__name__)
+
+MODEL_LIBRARY_FORMAT_RELPATH = pathlib.Path("src") / "model" / "model.tar"
+API_SERVER_DIR = pathlib.Path(os.path.dirname(__file__) or os.getcwd())
+BUILD_DIR = API_SERVER_DIR / "build"
+MODEL_LIBRARY_FORMAT_PATH = API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH
+
+IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists()
+
+PROJECT_TYPES = [
+    "dense_example",
+    "conv2d_example",
+    "dwconv2d_example",
+    "add_example",
+    "maxpool2d_example",
+    "mobilenet_example",
+]
+
+PROJECT_OPTIONS = [
+    server.ProjectOption(
+        "project_type",
+        required=["generate_project"],
+        choices=tuple(PROJECT_TYPES),
+        type="str",
+        help="Type of project to generate.",
+    )
+]
+
+
+class Handler(server.ProjectAPIHandler):
+    def __init__(self):
+        super(Handler, self).__init__()
+        self._proc = None
+        self._port = None
+        self._transport = None
+        self._project_dir = None
+        self._qemu_instance = None
+
+    def server_info_query(self, tvm_version):
+        return server.ServerInfo(
+            platform_name="gemmini",
+            is_template=IS_TEMPLATE,
+            model_library_format_path="" if IS_TEMPLATE else MODEL_LIBRARY_FORMAT_PATH,
+            project_options=PROJECT_OPTIONS,
+        )
+
+    def _copy_project_files(self, api_server_dir, project_dir, project_type):
+        """Copies the files for project_type into project_dir.
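+
+        As an illustrative sketch (the exact file set depends on the chosen
+        template; the mapping below is an example, not a guarantee),
+        ``project_type="dense_example"`` copies roughly::
+
+            src/dense_example/dense.c  -> <project_dir>/src/dense.c
+            src/dense_example/Makefile -> <project_dir>/src/Makefile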
+
+        Notes
+        -----
+        template_dir is NOT a project type, and that directory is never copied
+        in this function. template_dir only holds this file and its unit tests,
+        so this file is copied separately in generate_project.
+
+        """
+        for item in (API_SERVER_DIR / "src" / project_type).iterdir():
+            dest = project_dir / "src" / item.name
+            if item.is_dir():
+                shutil.copytree(item, dest)
+            else:
+                shutil.copy2(item, dest)
+
+    CRT_COPY_ITEMS = ("include", "src")
+
+    def _copy_standalone_crt(self, source_dir, standalone_crt_dir):
+        output_crt_dir = source_dir / "standalone_crt"
+        for item in self.CRT_COPY_ITEMS:
+            src_path = os.path.join(standalone_crt_dir, item)
+            dst_path = output_crt_dir / item
+            if os.path.isdir(src_path):
+                shutil.copytree(src_path, dst_path)
+            else:
+                shutil.copy2(src_path, dst_path)
+
+    # Example project is the "minimum viable project",
+    # and doesn't need a fancy RPC server
+    EXAMPLE_PROJECT_UNUSED_COMPONENTS = []
+
+    def _remove_unused_components(self, source_dir, project_type):
+        unused_components = []
+        if project_type == "example_project":
+            unused_components = self.EXAMPLE_PROJECT_UNUSED_COMPONENTS
+
+        for component in unused_components:
+            shutil.rmtree(source_dir / "standalone_crt" / component)
+
+    def _disassemble_mlf(self, mlf_tar_path, source_dir):
+        with tempfile.TemporaryDirectory() as mlf_unpacking_dir_str:
+            mlf_unpacking_dir = pathlib.Path(mlf_unpacking_dir_str)
+            with tarfile.open(mlf_tar_path, "r:") as tar:
+                tar.extractall(mlf_unpacking_dir)
+
+            model_dir = source_dir / "model"
+            model_dir.mkdir()
+
+            # Copy C files from the model. The filenames and quantity
+            # depend on the target string, so we just copy all C files
+            source_dir = mlf_unpacking_dir / "codegen" / "host" / "src"
+            for file in source_dir.rglob(f"*.c"):
+                shutil.copy(file, model_dir)
+
+            source_dir = mlf_unpacking_dir / "codegen" / "host" / "include"
+            for file in source_dir.rglob(f"*.h"):
+                shutil.copy(file, model_dir)
+
+            # Return metadata.json for use in templating
+            with open(os.path.join(mlf_unpacking_dir, "metadata.json")) as f:
+                metadata = json.load(f)
+            return metadata
+
+    def _template_model_header(self, source_dir, metadata):
+        with open(source_dir / "model.h", "r") as f:
+            model_h_template = Template(f.read())
+
+        assert (
+            metadata["style"] == "full-model"
+        ), "when generating AOT, expect only full-model Model Library Format"
+
+        template_values = {
+            "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][
+                "workspace_size_bytes"
+            ],
+        }
+
+        with open(source_dir / "model.h", "w") as f:
+            f.write(model_h_template.substitute(template_values))
+
+    # Arduino ONLY recognizes .ino, .cpp, .c, .h
+
+    CPP_FILE_EXTENSION_SYNONYMS = ("cc", "cxx")
+
+    def _change_cpp_file_extensions(self, source_dir):
+        for ext in self.CPP_FILE_EXTENSION_SYNONYMS:
+            for filename in source_dir.rglob(f"*.{ext}"):
+                filename.rename(filename.with_suffix(".cpp"))
+
+        for filename in source_dir.rglob(f"*.inc"):
+            filename.rename(filename.with_suffix(".h"))
+
+    def _convert_includes(self, project_dir, source_dir):
+        """Changes all #include statements in project_dir to be relative to their
+        containing file's location.
+
+        Arduino only supports includes relative to a file's location, so this
+        function finds each #include of a file and rewrites the path to be
+        relative to the including file's location. It does not do this for
+        standard C libraries. It also changes angle-bracket syntax to
+        double-quote syntax.
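+
+        For example (hypothetical paths, matching the docstring of
+        ``_find_modified_include_path`` below), a line such as
+        ``#include <tvm/runtime/crt/platform.h>`` inside
+        ``src/standalone_crt/src/runtime/crt/common/ndarray.c`` is rewritten to
+        ``#include "../../../../../../src/standalone_crt/include/tvm/runtime/crt/platform.h"``.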
+
+        See Also
+        --------
+        https://www.arduino.cc/reference/en/language/structure/further-syntax/include/
+
+        """
+        for ext in ("c", "h", "cpp"):
+            for filename in source_dir.rglob(f"*.{ext}"):
+                with filename.open("rb") as src_file:
+                    lines = src_file.readlines()
+                with filename.open("wb") as dst_file:
+                    for i, line in enumerate(lines):
+                        line_str = str(line, "utf-8")
+                        # Check if line has an include
+                        result = re.search(r"#include\s*[<\"]([^>]*)[>\"]", line_str)
+                        if not result:
+                            dst_file.write(line)
+                        else:
+                            new_include = self._find_modified_include_path(
+                                project_dir, filename, result.groups()[0]
+                            )
+                            updated_line = f'#include "{new_include}"\n'
+                            dst_file.write(updated_line.encode("utf-8"))
+
+    # Most of the files we used to be able to point to directly are under "src/standalone_crt/include/".
+    # However, crt_config.h lives under "src/standalone_crt/crt_config/", and more exceptions might
+    # be added in the future.
+    POSSIBLE_BASE_PATHS = ["src/standalone_crt/include/", "src/standalone_crt/crt_config/"]
+
+    def _find_modified_include_path(self, project_dir, file_path, include_path):
+        """Takes a single #include path, and returns the location it should point to.
+
+        Examples
+        --------
+        >>> _find_modified_include_path(
+        ...     "/path/to/project/dir",
+        ...     "/path/to/project/dir/src/standalone_crt/src/runtime/crt/common/ndarray.c",
+        ...     "tvm/runtime/crt/platform.h",
+        ... )
+        "../../../../../../src/standalone_crt/include/tvm/runtime/crt/platform.h"
+
+        """
+        if include_path.endswith(".inc"):
+            include_path = re.sub(r"\.[a-z]+$", ".h", include_path)
+
+        # Change includes referencing .cc and .cxx files to point to the renamed .cpp file
+        if include_path.endswith(self.CPP_FILE_EXTENSION_SYNONYMS):
+            include_path = re.sub(r"\.[a-z]+$", ".cpp", include_path)
+
+        # If the include already works, don't modify it
+        if (file_path.parents[0] / include_path).exists():
+            return include_path
+
+        relative_path = file_path.relative_to(project_dir)
+        up_dirs_path = "../" * str(relative_path).count("/")
+
+        for base_path in self.POSSIBLE_BASE_PATHS:
+            full_potential_path = project_dir / base_path / include_path
+            if full_potential_path.exists():
+                return up_dirs_path + base_path + include_path
+
+        # If we can't find the file, just leave it untouched.
+        # It's probably a standard C/C++ header
+        return include_path
+
+    def _copy_standalone_crt_makefiles(self, api_server_dir, source_dir):
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefile",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefile.in",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefrag",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/build.sh",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/configure.ac",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini_nn.h",
+            source_dir / "include/gemmini_nn.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini_testutils.h",
+            source_dir / "include/gemmini_testutils.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini.h",
+            source_dir / "include/gemmini.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/rocc-software/src/xcustom.h",
+            source_dir / "rocc-software/src/xcustom.h",
+        )
+
+    def _copy_debug_data_files(self, project_dir):
+        if os.path.isdir(str(project_dir / ".." / "include")):
+            copy_tree(str(project_dir / ".."
/ "include"), str(project_dir / "src" / "model")) + + def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): + + # Reference key directories with pathlib + project_dir = pathlib.Path(project_dir) + project_dir.mkdir() + source_dir = project_dir / "src" + source_dir.mkdir() + + # Copies files from the template folder to project_dir + shutil.copy2(API_SERVER_DIR / "microtvm_api_server.py", project_dir) + self._copy_project_files(API_SERVER_DIR, project_dir, options["project_type"]) + + # Copy standalone_crt into src folder + self._copy_standalone_crt(source_dir, standalone_crt_dir) + self._remove_unused_components(source_dir, options["project_type"]) + + # Populate crt-config.h + crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config" + crt_config_dir.mkdir() + shutil.copy2( + API_SERVER_DIR / "crt_config" / "crt_config.h", crt_config_dir / "crt_config.h" + ) + + # Unpack the MLF and copy the relevant files + # extract_path = os.path.splitext(model_library_format_path)[0] + # with tarfile.TarFile(model_library_format_path) as tf: + # os.makedirs(project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + # tf.extractall(path=project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + metadata = self._disassemble_mlf(model_library_format_path, source_dir) + shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + + self._copy_debug_data_files(project_dir) + # For AOT, template model.h with metadata to minimize space usage + # if options["project_type"] == "example_project": + # self._template_model_header(source_dir, metadata) + + # Copy makefiles to treat standalone crt code as RIOT modules + # self._copy_standalone_crt_makefiles(API_SERVER_DIR, source_dir) + + self._change_cpp_file_extensions(source_dir) + + # Recursively change includes + self._convert_includes(project_dir, source_dir) + + def build(self, options): + subprocess.call( + "source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",), + shell=True, + executable="/bin/bash", + ) + # os.system("source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",)) + + def flash(self, options): + test_name = options["project_type"].split("_")[0] + subprocess.call( + "source %s && cd src/build && spike --extension=gemmini %s" + % (os.environ["CHIPYARD_HOME"] + "/env.sh", test_name + "-baremetal"), + shell=True, + executable="/bin/bash", + ) + # os.system("source %s && cd src/build && spike --extension=gemmini %s" % (os.environ["CHIPYARD_HOME"] + "/env.sh",test_name + "-baremetal",)) + # if logging.root.level == logging.DEBUG: + # os.system("cd src/build && spike --extension=gemmini ") + # else: + # os.system("cd src && make flash -s > /dev/null") + + def open_transport(self, options): + pass + + def close_transport(self): + pass + + def read_transport(self, n, timeout_sec): + pass + + def write_transport(self, data, timeout_sec): + pass + + +if __name__ == "__main__": + server.main(Handler()) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile.in b/apps/microtvm/gemmini/template_project/src/Makefile.in new file mode 100644 index 000000000000..ed017cc918ce --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefile.in @@ -0,0 +1,34 @@ +prefix := @prefix@ +abs_top_srcdir := @abs_top_srcdir@ +XLEN := @XLEN@ +RISCVTOOLS := @RISCVTOOLS@ +ROCC = examples +RUNNER := "spike --extension=gemmini " + +.PHONY: all bareMetalC clean +all: bareMetalC + +vars = \ + abs_top_srcdir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-$@ 
\ + src_dir=$(abs_top_srcdir) \ + RISCVTOOLS=$(RISCVTOOLS) + +bareMetalC: + $(MAKE) -f $(abs_top_srcdir)/Makefile $(vars) + +clean: + $(MAKE) -f $(abs_top_srcdir)/Makefile abs_top_srcdir=$(abs_top_srcdir) PREFIX=$(ROCC)-bareMetalC clean + +test-baremetal-bareMetalC: + make \ + -f $(abs_top_srcdir)/Makefile \ + TARGET_MAKEFILE=$(abs_top_srcdir)/Makefile \ + abs_top_srcdir=$(abs_top_srcdir) \ + src_dir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-bareMetalC \ + RISCVTOOLS=$(RISCVTOOLS) \ + RUNNER=$(RUNNER) \ + run-baremetal diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag b/apps/microtvm/gemmini/template_project/src/Makefrag new file mode 100644 index 000000000000..a60184526081 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefrag @@ -0,0 +1,25 @@ +XLEN ?= 64 + +CC_BAREMETAL := riscv$(XLEN)-unknown-elf-gcc + +CC_LINUX_PRESENT := $(shell command -v riscv$(XLEN)-unknown-linux-gnu-gcc 2> /dev/null) + +# Support Linux gcc from riscv-gnu-toolchain and from system packages +# riscv64-unknown-linux-gnu-gcc is built from riscv-gnu-toolchain, comes with Firesim's tools +# riscv64-linux-gnu-gcc comes from a system package +ifdef CC_LINUX_PRESENT + CC_LINUX := riscv$(XLEN)-unknown-linux-gnu-gcc +else + CC_LINUX := riscv$(XLEN)-linux-gnu-gcc +endif + +ENV_P = $(abs_top_srcdir)/riscv-tests/env/p +ENV_V = $(abs_top_srcdir)/riscv-tests/env/v + +.PHONY: all clean default + +default: all +src_dir = . + +clean: + rm -rf $(junk) diff --git a/apps/microtvm/gemmini/template_project/src/add.c b/apps/microtvm/gemmini/template_project/src/add.c new file mode 100644 index 000000000000..13aeb1a80e3f --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/add.c @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_add[output_len]; + +int main() { + printf("Starting add test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input_1; + inputs.serving_default_y_0 = input_2; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_add; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_add[i] != output[i]) { + error_counter += 1; + printf("ERROR IN ADD EXAMPLE! 
output_add[%d] (%d) != output[%d] (%d)\r\n", i, output_add[i],
+             i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow a very small percentage of errors, which could be caused by rounding
+  float error_perc = ((float)error_counter / output_len) * 100;
+  if (error_perc < 1)
+    printf("SUCCESS! (error_counter = %d)\r\n", error_counter);
+  else
+    printf("FAIL! (error_counter = %d)\r\n", error_counter);
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d.c
new file mode 100644
index 000000000000..22f1bcb1d281
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/conv2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_conv[output_len];
+
+int main() {
+  printf("Starting conv2d test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_conv2d_input_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_conv;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_conv[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_conv[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/dense.c b/apps/microtvm/gemmini/template_project/src/dense.c
new file mode 100644
index 000000000000..414eeac88020
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/dense.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_gemm[output_len];
+
+int main() {
+  printf("Starting dense test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_x_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_gemm;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_gemm[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN DENSE EXAMPLE! output_gemm[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_gemm[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d.c
new file mode 100644
index 000000000000..ee125e2fdc25
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/dwconv2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_conv[output_len];
+
+int main() {
+  printf("Starting dw conv2d test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_depthwise_conv2d_input_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_conv;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
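+  // (Note: the pass/fail check below tolerates a mismatch rate under 1%; e.g.
+  // with error_counter = 3 and output_len = 1024 the rate is ~0.3%, which
+  // still counts as SUCCESS. The rate must be computed with floating-point
+  // division, since integer division would truncate any rate below 100% to zero.)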
+  for (int i = 0; i < output_len; i++) {
+    if (output_conv[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN DW CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_conv[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile
new file mode 100644
index 000000000000..2c997cea1a80
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile
@@ -0,0 +1,68 @@
+include $(abs_top_srcdir)/Makefrag
+
+tests = \
+	add \
+
+tests_baremetal = $(tests:=-baremetal)
+
+ifeq ($(findstring spike,$(RUNNER)),spike)
+# Currently don't support conv or conv-with-pool on spike
+runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal)))
+else
+# Don't run very long benchmarks for RTL sim
+runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal)))
+endif
+
+RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests
+BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common
+GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h
+STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt
+
+CFLAGS := $(CFLAGS) \
+	-DPREALLOCATE=1 \
+	-DMULTITHREAD=1 \
+	-mcmodel=medany \
+	-std=gnu99 \
+	-O2 \
+	-ffast-math \
+	-fno-common \
+	-fno-builtin-printf \
+	-march=rv64gc -Wa,-march=rv64gcxhwacha \
+	-lm \
+	-lgcc \
+	-I${RISCV_TESTS} \
+	-I${RISCV_TESTS}/env \
+	-I$(abs_top_srcdir) \
+	-I$(abs_top_srcdir)/include \
+	-I$(BENCH_COMMON) \
+	-DID_STRING=$(ID_STRING) \
+	-DPRINT_TILE=0 \
+
+CFLAGS_BAREMETAL := \
+	$(CFLAGS) \
+	-nostdlib \
+	-nostartfiles \
+	-static \
+	-T $(BENCH_COMMON)/test.ld \
+	-DBAREMETAL=1 \
+
+all: $(tests_baremetal)
+
+vpath %.c $(src_dir)
+
+%-baremetal: %.c $(GEMMINI_HEADERS)
+	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
+		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
+#		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
+		$(LIBS)
+
+run-baremetal: $(runs_baremetal)
+
+%-baremetal.run: %-baremetal
+	$(RUNNER)$(abs_top_srcdir)/build/$^
+
+junk += $(tests_baremetal)
+
diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile
new file mode 100644
index 000000000000..f80da67c3f98
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile
@@ -0,0 +1,68 @@
+include $(abs_top_srcdir)/Makefrag
+
+tests = \
+	conv2d \
+
+tests_baremetal = $(tests:=-baremetal)
+
+ifeq ($(findstring spike,$(RUNNER)),spike)
+# Currently don't support conv or conv-with-pool on spike
+runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal)))
+else
+# Don't run very long benchmarks for RTL sim
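+# (Note: the filter-out lists above and below are inherited from the upstream
+# gemmini-rocc-tests build system; the only test in this Makefile is conv2d,
+# so they filter nothing here. A hypothetical manual invocation, assuming a
+# configured tree, would be:
+#   make -f Makefile abs_top_srcdir=$(PWD) XLEN=64 run-baremetal
+# which builds conv2d-baremetal and runs it with $(RUNNER).)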
+runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile new file mode 100644 index 000000000000..0b1932ceef91 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + dense \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + 
$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile new file mode 100644 index 000000000000..fa89e5be162d --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + dwconv2d \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile new file mode 100644 index 000000000000..1218e9e67a96 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + maxpool2d \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = 
$(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile new file mode 100644 index 000000000000..b6d977550097 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + mobilenet \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ 
+
+all: $(tests_baremetal)
+
+vpath %.c $(src_dir)
+
+%-baremetal: %.c $(GEMMINI_HEADERS)
+	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
+		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
+#		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
+		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
+		$(LIBS)
+
+run-baremetal: $(runs_baremetal)
+
+%-baremetal.run: %-baremetal
+	$(RUNNER)$(abs_top_srcdir)/build/$^
+
+junk += $(tests_baremetal)
+
diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d.c
new file mode 100644
index 000000000000..8f508333c492
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/maxpool2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_maxpool2d[output_len];
+
+int main() {
+  printf("Starting max pooling 2D test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_x_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.PartitionedCall_0 = output_maxpool2d;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_maxpool2d[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN MAX POOL 2D EXAMPLE! output_maxpool2d[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_maxpool2d[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet.c
new file mode 100644
index 000000000000..45b606004653
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/mobilenet.c
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +uint8_t output_pred[1001]; + +int argmax(uint8_t* vec) { + int idx = 0; + uint8_t max_value = 0; + for (int i = 0; i < 1001; i++) { + if (vec[i] > max_value) { + idx = i; + max_value = vec[i]; + } + } + return idx; +} + +void get_top_5_labels(int* top_5, uint8_t* predicted_output) { + uint8_t prev_max_value = (uint8_t)255; + uint8_t current_max_value = 0; + int idx = 0; + for (int i = 0; i < 5; i++) { + current_max_value = 0; + idx = 0; + for (int j = 0; j < 1001; j++) { + if ((predicted_output[j] > current_max_value) && (predicted_output[j] < prev_max_value)) { + current_max_value = predicted_output[j]; + idx = j; + } + } + top_5[i] = idx; + prev_max_value = current_max_value; + } +} + +int main() { + printf("Starting MobileNet test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + int top_5_labels[5]; + + struct tvmgen_default_inputs inputs; + inputs.input = input; + struct tvmgen_default_outputs outputs; + outputs.MobilenetV2_Predictions_Reshape = output_pred; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + /*for(int i = 0; i < output_len; i++) + { + if(output_pred[i] != output[i]) +{ +error_counter += 1; +printf("ERROR IN MOBILENET EXAMPLE! output_pred[%d] (%d) != output[%d] +(%d)\r\n",i,(int)output_pred[i],i,(int)output[i]); +//exit(1); +} + }*/ + + get_top_5_labels(top_5_labels, output_pred); + + printf("Real Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)top_5_labels[i]); + printf("]\r\n"); + + printf("Expected Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)output[i]); + printf("]\r\n"); + + /*for(int i = 0; i < 5; i++) + { + if(top_5_labels[i] != output[i]) + { + error_counter += 1; + printf("ERROR IN MOBILENET EXAMPLE! 
top_5_labels[%d] (%d) != output[%d] + (%d)\r\n",i,(int)top_5_labels[i],i,(int)output[i]); + //exit(1); + } + }*/ + + // printf("SUCCESS!\r\n"); + exit(0); + + // Take the argmax to get the predicted label, and the expected label + /*int predicted_label = argmax(output_pred); + int expected_label = argmax(output); + printf("Expected label = %d\r\n",expected_label); + printf("Predicted label = %d\r\n",predicted_label); + if(expected_label == predicted_label) printf("SUCCESS!\r\n"); + else printf("FAILED!\r\n"); + exit(0);*/ +} diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake new file mode 100644 index 000000000000..4b73d183ddc1 --- /dev/null +++ b/cmake/modules/contrib/Gemmini.cmake @@ -0,0 +1,117 @@ +if(USE_MICRO) + message(STATUS "Add Gemmini for microTVM") + + function(microtvm_add_gemmini) + list( + APPEND + GEMMINI_FILE_COPY_JOBS + "apps/microtvm/gemmini/template_project microtvm_api_server.py -> gemmini" + "apps/microtvm/gemmini/template_project/crt_config *.h -> gemmini/crt_config" + + # Dense example project generation + "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src/makefiles/dense Makefile -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dense_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dense_example/rocc-software/src" + + # CONV2D example project generation + "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/conv2d Makefile -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/conv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/conv2d_example/rocc-software/src" + + # DW CONV2D example project generation + "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d Makefile -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dwconv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dwconv2d_example/rocc-software/src" + + # ADD example project generation + "apps/microtvm/gemmini/template_project/src add.c -> 
gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src/makefiles/add Makefile -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/add_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/add_example/rocc-software/src" + + # Max pooling 2d example project generation + "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d Makefile -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/maxpool2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/maxpool2d_example/rocc-software/src" + + # Mobilenet example project generation + "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src/makefiles/mobilenet Makefile -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/mobilenet_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/mobilenet_example/rocc-software/src" + ) + + foreach(job_spec IN LISTS GEMMINI_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message( + FATAL_ERROR + "Gemmini copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" + ) + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + file( + GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message( + FATAL_ERROR + "Gemmini copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" + ) + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component( + dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" + 
ABSOLUTE) + tvm_micro_add_copy_file(gemmini_template_deps + ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(gemmini DEPENDS ${gemmini_template_deps}) + endfunction() + + microtvm_add_gemmini() + +endif(USE_MICRO) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f1c14c3cd914..413daf430ed5 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -500,9 +500,21 @@ def _build_func_common(measure_input, runtime=None, checks=None, build_option=No target, task, config = measure_input target, task.target_host = Target.canon_target_and_host(target, task.target_host) checks = checks or {} + with target: s, args = task.instantiate(config) + # if target is gemmini, we need to use gemmini build + if ( + hasattr(measure_input.target, "device_name") + and measure_input.target.device_name == "gemmini" + ): + # pylint: disable=import-outside-toplevel + import tvm.contrib.gemmini as gemmini + + func = gemmini.build(s, args, target=measure_input.target, runtime=runtime) + return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) + # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..9515769fd641 --- /dev/null +++ b/python/tvm/contrib/gemmini/__init__.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini package is a TVM backend extension to support the Gemmini hardware accelerator +===================== +**Author**: `Federico Peccia `_ +""" + +import sys +import tvm._ffi.base + +from .environment import Environment +from .build_module import build_config, lower, build, preprocess_pass +from tvm.relay.backend.contrib.gemmini import * +from .helpers import create_header_file +from .utils import * + +__version__ = "0.1.0" diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py new file mode 100644 index 000000000000..a094147b7a14 --- /dev/null +++ b/python/tvm/contrib/gemmini/build_module.py @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Helpers and functions related to the build process to generate code for the Gemmini accelerator
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import tvm
+
+from .environment import Environment
+from .transform import *
+from tvm import relay
+from .legalize import LegalizeGemmini
+
+
+def preprocess_pass(mod):
+    """Preprocessing pass required to target the Gemmini accelerator: it merges the
+    supported operator patterns into composite functions and legalizes them into
+    Gemmini operators.
+
+    Args:
+        mod (tvm.ir.IRModule): IRModule to preprocess
+
+    Returns:
+        tvm.ir.IRModule: preprocessed IRModule
+    """
+
+    # First, merge all depthwise and regular convolutions that can be merged!
+    pattern = relay.op.contrib.get_pattern_table("gemmini")
+
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.ConvertLayout({"qnn.conv2d": ["NHWC", "HWIO"]})(mod)
+    mod = relay.transform.SimplifyExpr()(mod)
+    mod = relay.transform.MergeComposite(pattern)(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.SimplifyExpr()(mod)
+    mod = LegalizeGemmini()(mod)
+    mod = relay.transform.InferType()(mod)
+    return mod
+
+
+def internal_build_configs(usmp_alg=""):
+    """Builds the internal configurations for the build process
+
+    Args:
+        usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "".
+
+    Returns:
+        dict: configurations
+    """
+    enable_usmp = usmp_alg != ""
+    pass_list = [
+        (0, tvm.tir.transform.StorageFlatten(16)),
+        (1, InjectAMVINIntrin()),
+        (1, InjectAMVINIntrinTransposed()),
+        (1, InjectBMVINIntrin()),
+        (1, InjectBMVINIntrinTransposed()),
+        (1, InjectCMVOUTIntrin()),
+        (1, InjectCMVOUTIntrinTransposed()),
+        (1, InjectDMVINIntrin()),
+        (1, InjectDMVINIntrinTransposed()),
+        (1, InjectCMVINIntrin()),
+        (1, InjectCMVINIntrinTransposed()),
+        (1, InjectCMVINAccumIntrin()),
+        (1, InjectCMVINAccumIntrinTransposed()),
+        (1, tvm.tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers()),
+        (2, tvm.tir.transform.LowerDeviceStorageAccessInfo()),
+        (4, InsertGemminiHeaderOperators()),
+        (5, InsertGemminiFenceOperator()),
+    ]
+
+    return {
+        "tir.add_lower_pass": pass_list,
+        "tir.disable_vectorize": True,
+        # "tir.CorrectGemminisScratchpadAndAccumulatorPointers": {"dim": env.DIM}
+        "tir.usmp.enable": enable_usmp,
+        "tir.usmp.algorithm": usmp_alg,
+    }
+
+
+def build_config(usmp_alg="", **kwargs):
+    """Creates the PassContext needed by the build process to correctly build the Gemmini operators
+
+    Args:
+        usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "".
+
+    Returns:
+        tvm.transform.PassContext: PassContext with specific configurations
+    """
+
+    config = internal_build_configs(usmp_alg)
+    if kwargs.get("config"):
+        config.update(kwargs["config"])
+        del kwargs["config"]
+
+    return tvm.transform.PassContext(config=config, **kwargs)
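+
+
+# For instance (sketch): build_config(usmp_alg="greedy_by_size") returns a
+# PassContext whose config enables USMP ("tir.usmp.enable": True) and registers
+# the custom Gemmini lowering passes above at phases 0 through 5; with the
+# default usmp_alg="" USMP stays disabled. "greedy_by_size" is one of TVM's
+# standard USMP algorithms, named here only as an example.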
+
+
+def lower(*args, **kwargs):
+    """Thin wrapper of tvm.lower
+
+    This wrapper automatically applies Gemmini's build_config
+    if there is no user specified build_config in context.
+
+    See Also
+    --------
+    tvm.lower : The original TVM's lower function
+    """
+    pass_ctx = tvm.transform.PassContext.current()
+    if not pass_ctx.config.get("tir.add_lower_pass"):
+        with build_config():
+            return tvm.lower(*args, **kwargs)
+    return tvm.lower(*args, **kwargs)
+
+
+def build(*args, **kwargs):
+    """Thin wrapper of tvm.build
+
+    This wrapper automatically applies Gemmini's build_config
+    if there is no user specified build_config in context.
+
+    See Also
+    --------
+    tvm.build : The original TVM's build function
+    """
+    pass_ctx = tvm.transform.PassContext.current()
+    if not pass_ctx.config.get("tir.add_lower_pass"):
+        with build_config():
+            return tvm.build(*args, **kwargs)
+    return tvm.build(*args, **kwargs)
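+
+
+# A minimal end-to-end usage sketch (illustrative only: `sch` and `args` stand
+# for an already scheduled TE workload, and the target string is just an
+# example, not a value mandated by this module):
+#
+#   from tvm.contrib import gemmini
+#   gemmini.Environment.init_overwrite(dim=16)
+#   lib = gemmini.build(sch, args, target="c")  # picks up build_config() automatically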
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_scope)
+def mem_info_inp_buffer():
+    """Creates the information about the local.scratchpad memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.inp_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.INP_SCR_ROWS * spec.DIM * spec.inp_bits),
+        # head_address=tvm.runtime.const(spec.INP_SCR_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_wgt_scope)
+def mem_info_wgt_buffer():
+    """Creates the information about the local.scratchpad_weight memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.wgt_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.WGT_SCR_ROWS * spec.DIM * spec.wgt_bits),
+        # head_address=tvm.runtime.const(spec.WGT_SCR_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().acc_scope)
+def mem_info_acc_buffer():
+    """Creates the information about the local.accumulator memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.inp_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.ACC_ROWS * spec.DIM * spec.inp_bits),
+        # head_address=tvm.runtime.const(spec.OUT_ACC_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py
new file mode 100644
index 000000000000..7d6350d1ebb9
--- /dev/null
+++ b/python/tvm/contrib/gemmini/environment.py
@@ -0,0 +1,386 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, exec-used
+"""
+Environment declaration. Contains Gemmini's hardware parameters.
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+from __future__ import absolute_import as _abs
+
+import re
+from typing import List, Tuple, Dict, Callable
+
+from .intrin import (
+    gemm,
+    gemm_cisc,
+    conv2d_cisc,
+    dw_conv2d_cisc,
+    add_tensorize,
+    add_mvout_tensorize,
+)
+from .utils import counters
+
+
+class Environment(object):
+    """Hardware configuration object.
+
+    This object contains all the information
+    needed for compiling to a specific Gemmini backend.
+    """
+
+    _instance = None
+
+    @classmethod
+    def init_overwrite(
+        cls,
+        batch=1,
+        dim=32,
+        max_bytes=64,
+        inp_dtype="int8",
+        wgt_dtype="int8",
+        acc_dtype="int32",
+        acc_rows=4096,
+        bank_rows=8192,
+        bank_num=4,
+        debug=False,
+        enabled_counters: Dict = {},
+        supports_non_zero_padding: bool = False,
+        use_experimental_qnn_add: bool = False,
+    ):
+        """Re-initializes the singleton Environment with the given hardware parameters
+
+        Args:
+            batch (int, optional): Batch size. Defaults to 1.
+            dim (int, optional): Gemmini's systolic array dimension (DIM). Defaults to 32.
+            max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can move in. Defaults to 64.
+            inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8".
+            wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8".
+            acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32".
+            acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096.
+            bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192.
+            bank_num (int, optional): Amount of banks in the scratchpad. Defaults to 4.
+            debug (bool, optional): Adds debug output of Gemmini counters to the generated code. Defaults to False.
+            enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty.
+            supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False.
+        """
+        inst = Environment.instance()
+        inst.init(
+            batch=batch,
+            dim=dim,
+            max_bytes=max_bytes,
+            inp_dtype=inp_dtype,
+            wgt_dtype=wgt_dtype,
+            acc_dtype=acc_dtype,
+            acc_rows=acc_rows,
+            bank_rows=bank_rows,
+            bank_num=bank_num,
+            debug=debug,
+            enabled_counters=enabled_counters,
+            supports_non_zero_padding=supports_non_zero_padding,
+            use_experimental_qnn_add=use_experimental_qnn_add,
+        )
+
+    @classmethod
+    def instance(cls):
+        """Returns the singleton Environment, creating it with default parameters on first use
+
+        Returns:
+            Environment: the current instance
+        """
+        if cls._instance is None:
+            cls._instance = cls.__new__(cls)
+            cls._instance.init()
+        return cls._instance
+
+    def init(
+        self,
+        batch=1,
+        dim=16,
+        max_bytes=64,
+        inp_dtype="int8",
+        wgt_dtype="int8",
+        acc_dtype="int32",
+        acc_rows=1024,
+        bank_rows=4096,
+        bank_num=4,
+        debug=False,
+        enabled_counters: Dict = {},
+        supports_non_zero_padding: bool = False,
+        use_experimental_qnn_add: bool = False,
+    ):
+        """Initializes the environment with the given hardware parameters
+
+        Args:
+            batch (int, optional): Batch size. Defaults to 1.
+            dim (int, optional): Gemmini's systolic array dimension (DIM). Defaults to 16.
+            max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can move in. Defaults to 64.
+            inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8".
+            wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8".
+            acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32".
+            acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 1024.
+            bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 4096.
+            bank_num (int, optional): Amount of banks in the scratchpad. Defaults to 4.
+            debug (bool, optional): Adds debug output of Gemmini counters to the generated code. Defaults to False.
+            enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty.
+            supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False.
+        """
+
+        assert batch == 1, "Only batch size of 1 is currently supported"
+        self.debug = debug
+
+        self.BATCH = batch
+        self.DIM = dim
+        self.MAX_BYTES = max_bytes
+
+        self.inp_dtype = inp_dtype
+        self.wgt_dtype = wgt_dtype
+        self.acc_dtype = acc_dtype
+
+        self.inp_bits = int(
+            re.match(r"((float)|(int)|(uint))(?P<width_bits>[0-9]+)", self.inp_dtype).group(
+                "width_bits"
+            )
+        )
+        self.wgt_bits = int(
+            re.match(r"((float)|(int)|(uint))(?P<width_bits>[0-9]+)", self.wgt_dtype).group(
+                "width_bits"
+            )
+        )
+        self.acc_bits = int(
+            re.match(r"((float)|(int)|(uint))(?P<width_bits>[0-9]+)", self.acc_dtype).group(
+                "width_bits"
+            )
+        )
+
+        self.size_elem = int(self.inp_bits / 8)
+        self.size_acc = int(self.acc_bits / 8)
+
+        self.ACC_ROWS = acc_rows
+        self.BANK_ROWS = bank_rows
+        self.BANK_NUM = bank_num
+
+        self.WGT_SCR_BASE_ADDRESS = int(self.BANK_ROWS * self.BANK_NUM * 2 / 4)
+        self.WGT_SCR_ROWS = self.BANK_ROWS * self.BANK_NUM - self.WGT_SCR_BASE_ADDRESS
+        self.INP_SCR_BASE_ADDRESS = 0
+        self.INP_SCR_ROWS = self.WGT_SCR_BASE_ADDRESS
+        self.OUT_ACC_BASE_ADDRESS = 0xC0000000
+
+        self.MAX_BLOCK_LEN = int(self.MAX_BYTES / self.DIM)
+        if self.DIM * self.size_acc <= self.MAX_BYTES:
+            self.MAX_BLOCK_LEN_ACC = int(self.MAX_BYTES / (self.DIM * self.size_acc))
+        else:
+            self.MAX_BLOCK_LEN_ACC = 1
+
+        self.scr_scope = "local.scratchpad"
+        self.acc_scope = "local.accumulator"
+        # TODO (FP): check this scratchpad_weight. Actually, only one scratchpad exists,
+        # but we keep this logical partition to correctly manage the pointers to the
+        # buffers stored in these memories. We should revisit how to fix this in the future.
+        self.scr_wgt_scope = "local.scratchpad_weight"
+
+        self.A_mvin = "A_mvin"
+        self.B_mvin = "B_mvin"
+        self.D_mvin = "D_mvin"
+        self.C_mvin = "C_mvin"
+        self.C_mvin_accum = "C_mvin_accum"
+        self.C_mvout = "C_mvout"
+        self.C_mvout_acc_dtype = "C_mvout_acc_dtype"
+
+        self.WEIGHT_STATIONARY = 1
+        self.OUTPUT_STATIONARY = 0
+
+        self.mvin_scale_identity = 1.0
+        self.max_matrix = 64
+
+        self.supports_non_zero_padding = supports_non_zero_padding
+        self.use_experimental_qnn_add = use_experimental_qnn_add
+
+        self.enabled_counters = enabled_counters if bool(enabled_counters) else counters
+        # Check that all enabled counters exist in the actual counters from Gemmini
+        for key, value in self.enabled_counters.items():
+            assert (
+                key in counters and value == counters[key]
+            ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!"
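+
+    # A minimal configuration sketch (the values are hypothetical and shown only
+    # for illustration; they mirror init()'s defaults):
+    #
+    #   Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+    #   env = Environment.instance()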
+ + def gemm( + self, + I: int, + K: int, + J: int, + stride: int = 1, + is_depthwise_conv2d: bool = False, + mode: int = 1, + accum_patch=None, + ) -> Callable: + """Wrapper to expose the gemm intrinsic + + Args: + I (int): output first axis dimension + K (int): reduction axis dimension + J (int): output second axis dimension + stride (int, optional): Stride, useful for convolutions. Defaults to 1. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. + accum_patch (_type_, optional): Var of the reduction axis loop. Defaults to None. + + Returns: + Callable: gemm instrinsic + """ + return gemm(self, I, K, J, stride, is_depthwise_conv2d, mode, accum_patch) + + def gemm_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + scale: float, + matmul_type: int, + ) -> Callable: + """Wrapper to expose the gemm_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + scale (float): Output scaling factor + matmul_type (int): Systolic array mode (WS=1,OS=0) + + Returns: + Callable: gemm cisc intrinsic + """ + return gemm_cisc(self, inp_shape, wgt_shape, bias_shape, scale, matmul_type) + + def conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], + ) -> Callable: + """Wrapper to expose the conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window + pool_padding (List[int]): Padding for the output pooling + + Returns: + Callable: conv2d cisc intrinsic + """ + return conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + ) + + def dw_conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + ) -> Callable: + """Wrapper to expose the dw_conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? 
+ scale (float): Output scaling factor + + Returns: + Callable: dw conv2d cisc intrinsic + """ + return dw_conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + ) + + def add_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add intrinsic + """ + return add_tensorize(self, oshape) + + def add_mvout_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_mvout_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add mvout intrinsic + """ + return add_mvout_tensorize(self, oshape) diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py new file mode 100644 index 000000000000..84c028b3d33c --- /dev/null +++ b/python/tvm/contrib/gemmini/helpers.py @@ -0,0 +1,188 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Miscellaneous helpers +===================== +**Author**: `Federico Peccia `_ +""" + +import numpy as np +import pathlib +from .environment import Environment + +import abc +import collections +import matplotlib +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import PIL.Image as Image +import PIL.ImageColor as ImageColor +import PIL.ImageDraw as ImageDraw +import PIL.ImageFont as ImageFont +import six +from six.moves import range +from six.moves import zip +import tensorflow.compat.v1 as tf +from typing import List, Tuple + + +env = Environment.instance() + + +def create_header_file( + name: str, + section: str, + tensor_name: str, + tensor_data: np.ndarray, + output_path: str, + debug: bool = False, + weights: bool = None, +): + """This function generates a header file containing the data from the numpy array provided. + + Args: + name (str): Header file name + section (str): section to assign the generated variable + tensor_name (str): name for the generated variable + tensor_data (np.ndarray): data to fill the variable with + output_path (str): output path where the header file will be generated + debug (bool, optional): enable debug. Defaults to False. + weights (bool, optional): For debug purposes. Defaults to None. 
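+
+    Example (illustrative only; the names and paths are hypothetical):
+        create_header_file("inputs", ".data", "input", np.zeros((1, 8), dtype="int8"), "./include")
+        appends to ./include/inputs.h and ./include/inputs.c, declaring int8_t input[8].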
+    """
+    file_path = pathlib.Path(f"{output_path}/" + name).resolve()
+    # Create header file with npy_data as a C array
+    raw_header_path = file_path.with_suffix(".h").resolve()
+    raw_source_path = file_path.with_suffix(".c").resolve()
+
+    if tensor_data.dtype == np.float32:
+        c_type = "float"
+        align = 32
+    elif tensor_data.dtype == np.int8:
+        c_type = "int8_t"
+        align = 16
+    elif tensor_data.dtype == np.uint8:
+        c_type = "uint8_t"
+        align = 16
+    elif tensor_data.dtype == np.uint32:
+        c_type = "uint32_t"
+        align = 16
+    else:
+        assert False, "Type %s is not supported!" % tensor_data.dtype
+
+    with open(raw_header_path, "a+") as header_file:
+        header_file.write(
+            f"#define {tensor_name}_len {tensor_data.size}\n"
+            + f"extern {c_type} {tensor_name}[{tensor_name}_len];\n"
+        )
+
+    if not raw_source_path.is_file():
+        with open(raw_source_path, "a+") as source_file:
+            source_file.write("#include <stdint.h>\n")
+    with open(raw_source_path, "a+") as source_file:
+        source_file.write(
+            f'{c_type} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{'
+            if section
+            else f"{c_type} {tensor_name}[] __attribute__((aligned({align}))) = {{"
+        )
+        data_hexstr = tensor_data.tobytes().hex()
+        flatten = tensor_data.flatten()
+
+        if tensor_data.dtype == np.float32 or tensor_data.dtype == np.uint32:
+            for i in range(0, len(flatten)):
+                source_file.write(f"{flatten[i]},")
+            source_file.write("};\n\n")
+        else:
+            for i in range(0, len(data_hexstr), 2):
+                if flatten[int(i / 2)] < 0:
+                    # Special treatment to generate negative numbers correctly!
+                    data_hexstr_2comp = (
+                        (~int(flatten[int(i / 2)]) + 1).to_bytes(length=1, byteorder="big").hex()
+                    )
+                    source_file.write(f"-0x{data_hexstr_2comp}")
+                else:
+                    source_file.write(f"+0x{data_hexstr[i:i+2]}")
+                if i != (len(flatten) - 1) * 2:
+                    source_file.write(",")
+            source_file.write("};\n\n")
+
+        if debug:
+            source_file.write("/*\n")
+            for n in range(tensor_data.shape[0]):
+                for ch in range(tensor_data.shape[3]):
+                    source_file.write("Channel %i:\n" % ch)
+                    for row in range(tensor_data.shape[1]):
+                        for col in range(tensor_data.shape[2]):
+                            source_file.write(f"{tensor_data[n][row][col][ch]}\t")
+                        source_file.write("\n")
+            source_file.write("*/\n")
+
+        if weights is not None:
+            source_file.write("/*\n")
+            for o_ch in range(weights.shape[3]):
+                source_file.write("Output channel %i:\n" % o_ch)
+                for i_ch in range(weights.shape[2]):
+                    source_file.write("Input channel %i:\n" % i_ch)
+                    for row in range(weights.shape[0]):
+                        for col in range(weights.shape[1]):
+                            source_file.write(f"{weights[row][col][i_ch][o_ch]}\t")
+                        source_file.write("\n")
+            source_file.write("*/\n")
+
+
+def get_divisors(x: int) -> List[int]:
+    """Gets all the numbers that perfectly divide x
+
+    Args:
+        x (int): Number to divide
+
+    Returns:
+        List[int]: list of divisors
+    """
+    divs = []
+    for i in range(1, x + 1):
+        if x % i == 0:
+            divs.append(i)
+    return divs
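+
+
+# Worked examples for the divisor helpers (get_greater_div is defined below):
+#   get_divisors(12)             -> [1, 2, 3, 4, 6, 12]
+#   get_greater_div([12, 8], 16) -> 4  (largest divisor common to 12 and 8 that is <= 16)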
+
+
+def get_greater_div(x, limit: int = None):
+    """Gets the largest common divisor of all elements of x that does not exceed limit
+
+    Args:
+        x: int or list of ints to find common divisors for
+        limit (int, optional): Upper bound for the returned divisor. Defaults to env.DIM.
+
+    Returns:
+        int: Largest common divisor not exceeding limit
+    """
+
+    limit = env.DIM if limit is None else limit
+
+    if isinstance(x, int):
+        elements = [x]
+    elif isinstance(x, list):
+        elements = x
+    else:
+        assert False, "type of x not supported!"
+
+    divisors = []
+    for element in elements:
+        divs = get_divisors(element)
+        filtered = filter(lambda d: d <= limit, divs)
+        divisors.append(filtered)
+
+    return max(set.intersection(*map(set, divisors)))
diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py
new file mode 100644
index 000000000000..0909e58a890d
--- /dev/null
+++ b/python/tvm/contrib/gemmini/intrin.py
@@ -0,0 +1,873 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Gemmini related intrinsics
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+from __future__ import absolute_import as _abs
+
+import tvm
+from tvm import te
+from typing import List, Tuple
+
+
+def gemm(
+    env,
+    I: int,
+    K: int,
+    J: int,
+    stride: int = 1,
+    is_depthwise_conv2d: bool = False,
+    mode: int = 1,
+    accum_patch: tvm.tir.Var = None,
+):
+    """Matrix-matrix multiply intrinsic, inserts the most basic Gemmini instructions
+
+    Args:
+        env (Environment): Environment with configurations
+        I (int): output first axis dimension
+        K (int): reduction axis dimension
+        J (int): output second axis dimension
+        stride (int, optional): Stride, useful for convolutions. Defaults to 1.
+        is_depthwise_conv2d (bool, optional): Flag indicating whether this GEMM implements a depthwise convolution. Defaults to False.
+        mode (int, optional): Systolic array mode (WS=1, OS=0). Defaults to 1.
+        accum_patch (tvm.tir.Var, optional): Var of the reduction axis loop. Defaults to None.
+
+    Returns:
+        TensorIntrin: gemm tensor intrinsic
+    """
+
+    # TODO (FP): add assertions here for I, K and J?
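+    # (Assumption, not enforced by the original code: I, K and J are tile sizes
+    # chosen by the schedule and are expected to line up with env.DIM, e.g.
+    # I % env.DIM == 0, which is what the TODO above would assert.)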
+ + wgt_shape = (K, J) + + inp_shape = (I, K) + + out_shape = (I, J) + + wgt = te.placeholder(wgt_shape, dtype=env.wgt_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + + bias = te.placeholder(out_shape, dtype=env.inp_dtype, name=env.scr_scope) + + k = te.reduce_axis((0, wgt_shape[0]), name="k") + + out_dtype = env.inp_dtype + + if is_depthwise_conv2d: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride + k, j].astype(env.inp_dtype) * wgt[0, k].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + else: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride, k].astype(env.inp_dtype) * wgt[k, j].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + wgt_layout = tvm.tir.decl_buffer( + wgt.shape, + wgt.dtype, + "wgt_buff", + scope=env.scr_wgt_scope, + strides=[te.var("wgt_k"), te.var("wgt_y")], + offset_factor=env.DIM, + ) + inp_layout = tvm.tir.decl_buffer( + inp.shape, + inp.dtype, + "inp_buff", + scope=env.scr_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + bias_layout = tvm.tir.decl_buffer( + bias.shape, + bias.dtype, + "bias_buff", + scope=env.acc_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + out.shape, + out_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_x"), te.var("out_y")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Matrix-matrix multiply intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + inp_base_address = tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint32") + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + wgt_access_ptr = dwgt.access_ptr("r", "uint32") + out_base_address = tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + out_access_ptr = dout.access_ptr("w", "uint32") + + garbage = tvm.runtime.const(0xFFFFFFFF, "uint32") + + def _body(): + """Generate matrix-matrix multiply Gemmini instruction, without accumulate (garbage address in compute_preloaded)""" + irb = tvm.tir.ir_builder.create() + + inp_access_ptr = dinp.access_ptr("r", "uint32") + + A_access_ptr = inp_base_address + inp_access_ptr + BD_access_ptr = ( + wgt_base_address + wgt_access_ptr if mode == env.WEIGHT_STATIONARY else garbage + ) + C_access_ptr = out_base_address + out_access_ptr + DB_access_ptr = ( + garbage if mode == env.WEIGHT_STATIONARY else wgt_base_address + wgt_access_ptr + ) + + A_cols = dinp.shape[1] + A_rows = dinp.shape[0] + BD_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] + BD_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] + C_cols = dout.shape[1] + C_rows = dout.shape[0] + DB_cols = C_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] + DB_rows = C_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] + + with irb.if_scope(accum_patch == 0): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + BD_access_ptr, + C_access_ptr, + BD_cols, + BD_rows, + C_cols, + C_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_compute_preloaded", + A_access_ptr, + DB_access_ptr, + A_cols, + A_rows, + DB_cols, + DB_rows, + ) + ) + with irb.else_scope(): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + garbage, + C_access_ptr, + BD_cols, + BD_rows, + C_cols, + C_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( 
+                        "",
+                        "gemmini_extended_compute_accumulated",
+                        A_access_ptr,
+                        DB_access_ptr,
+                        A_cols,
+                        A_rows,
+                        DB_cols,
+                        DB_rows,
+                    )
+                )
+            return irb.get()
+
+        def _reduce_reset():
+            irb = tvm.tir.ir_builder.create()
+            return irb.get()
+
+        def _reduce_update():
+            return _body()
+
+        # return a triple of normal-set, reset, update
+        return (_body(), _reduce_reset(), _reduce_update())
+
+    return te.decl_tensor_intrin(
+        out.op,
+        intrin_func,
+        name="GEMM",
+        binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout},
+    )
+
+
+def gemm_cisc(
+    env,
+    inp_shape: Tuple[int, ...],
+    wgt_shape: Tuple[int, ...],
+    bias_shape: Tuple[int, ...],
+    scale: float,
+    matmul_type: int,
+):
+    """Matrix-matrix multiply intrinsic; inserts the call to the tiled_matmul_auto function
+    provided by the Gemmini developers, which runs the matrix multiplication using the loop
+    instructions
+
+    Args:
+        env (Environment): Environment with configurations
+        inp_shape (Tuple[int,...]): Input feature map shape
+        wgt_shape (Tuple[int,...]): Weights shape
+        bias_shape (Tuple[int,...]): Bias shape
+        scale (float): Output scaling factor
+        matmul_type (int): Systolic array mode (WS=1, OS=0)
+
+    Returns:
+        TensorIntrin: GEMM CISC tensor intrinsic
+    """
+
+    # TODO (FP): add assertions here for inp_shape, wgt_shape and bias_shape?
+
+    wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope)
+    inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope)
+    bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope)
+
+    K = wgt.shape[0]
+    J = wgt.shape[1]
+    I = inp.shape[0]
+
+    k_ = te.reduce_axis((0, K), name="K")
+
+    output_shape = (I, J)
+
+    out = te.compute(
+        output_shape,
+        lambda x_, y_: te.sum(
+            inp[x_, k_].astype(env.inp_dtype) * wgt[k_, y_].astype(env.inp_dtype)
+            + bias[y_].astype(env.inp_dtype),
+            axis=[k_],
+        ),
+    )
+
+    wgt_layout = tvm.tir.decl_buffer(
+        wgt_shape,
+        env.inp_dtype,
+        "wgt_buff",
+    )
+    inp_layout = tvm.tir.decl_buffer(
+        inp_shape,
+        env.inp_dtype,
+        "inp_buff",
+        strides=[te.var("inp_x"), te.var("inp_y")],
+    )
+    bias_layout = tvm.tir.decl_buffer(
+        bias_shape,
+        env.acc_dtype,
+        "bias_buff",
+    )
+    out_layout = tvm.tir.decl_buffer(
+        output_shape,
+        env.inp_dtype,
+        "out_buff",
+    )
+
+    def intrin_func(ins, outs):
+        """Matrix-matrix multiply intrinsic function"""
+        dinp, dwgt, dbias = ins
+        dout = outs[0]
+
+        def _body():
+            irb = tvm.tir.ir_builder.create()
+            irb.emit(
+                tvm.tir.call_extern(
+                    "",
+                    "tiled_matmul_auto",
+                    dinp.shape[0],  # dim_I,
+                    dwgt.shape[1],  # dim_J,
+                    dinp.shape[1],  # dim_K,
+                    dinp.access_ptr("r"),
+                    dwgt.access_ptr("r"),
+                    dbias.access_ptr("r"),
+                    dout.access_ptr("w"),
+                    dinp.shape[0],  # stride_A
+                    dwgt.shape[1],  # stride_B
+                    dwgt.shape[1],  # stride_C
+                    dwgt.shape[1],  # stride_D
+                    1.0,  # A_scale_factor
+                    1.0,  # B_scale_factor
+                    1.0,  # D_scale_factor
+                    0,  # act
+                    scale,
+                    0,  # relu6_shift
+                    1,  # repeating_bias
+                    0,  # transpose_A
+                    0,  # transpose_B
+                    0,  # full_C
+                    0,  # low_D
+                    0,  # weightA
+                    matmul_type,
+                )
+            )
+            return irb.get()
+
+        def _reduce_reset():
+            irb = tvm.tir.ir_builder.create()
+            return irb.get()
+
+        def _reduce_update():
+            return _body()
+
+        # return a triple of normal-set, reset, update
+        return (_body(), _reduce_reset(), _reduce_update())
+
+    return te.decl_tensor_intrin(
+        out.op,
+        intrin_func,
+        name="GEMM_CISC",
+        binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout},
+    )
+
+
+def conv2d_cisc(
+    env,
+    inp_shape: Tuple[int, ...],
+    wgt_shape: Tuple[int, ...],
+    bias_shape: Tuple[int, ...],
+    out_shape: 
Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], +): + """2D convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window. Not used for now. + pool_padding (List[int]): Padding for the output pooling + + Returns: + TensorIntrin: CONV2D CISC tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + OC = wgt.shape[3] + KH = wgt.shape[0] + KW = wgt.shape[1] + + N = inp.shape[0] + IH = inp.shape[1] + IW = inp.shape[2] + IC = inp.shape[3] + + ric = te.reduce_axis((0, IC), name="ric") + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + HSTR = strides[0] + WSTR = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + * wgt[rkh, rkw, ric, c_o].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw, ric], + ), + ) + + wgt_layout = tvm.tir.decl_buffer(wgt_shape, env.inp_dtype, "wgt_buff") + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + padding_value, + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + 
dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="CONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def dw_conv2d_cisc( + env, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, +): + """2D depthwise convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + + Returns: + TensorIntrin: depthwise convolution 2d tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + OC = wgt.shape[0] + KH = wgt.shape[1] + KW = wgt.shape[2] + + N = inp.shape[0] + IH = inp.shape[1] + IW = inp.shape[2] + IC = inp.shape[3] + + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + HSTR = strides[0] + WSTR = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) + * wgt[c_o, rkh, rkw].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw], + ), + ) + + wgt_layout = tvm.tir.decl_buffer( + wgt_shape, + env.inp_dtype, + "wgt_buff", + # strides=[te.var("wgt_i"),te.var("wgt_j")] + ) + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D depthwise convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + padding_value, + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # 
BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="DWCONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def add_tensorize(env, oshape: Tuple[int, ...]): + """Add intrinsic, inserts the most basic Gemmini instructions to support the qnn.add operator + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add intrinsic function""" + difm1, difm2 = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + difm1.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) + + +def add_mvout_tensorize(env, oshape: Tuple[int, ...]): + """Helper for the add intrinsic + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add mvout tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? 
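+    # Note on the accumulator addressing used below (stated as an assumption,
+    # following the usual Gemmini local-address convention rather than anything
+    # asserted by this file): bit 31 of a local address selects the accumulator
+    # and bit 30 selects accumulate-vs-overwrite, so OUT_ACC_BASE_ADDRESS
+    # (0xC0000000) writes in accumulate mode, while subtracting 0x40000000
+    # yields the plain overwrite/read view of the same accumulator rows.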
+ + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add mvout intrinsic function""" + difm1, difm2 = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dout.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD_MVOUT", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py new file mode 100644 index 000000000000..6f279bb512b3 --- /dev/null +++ b/python/tvm/contrib/gemmini/legalize.py @@ -0,0 +1,595 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +A set of passes to legalize the Gemmini operators +===================== +**Author**: `Federico Peccia `_ +""" + +from typing import List, Type, Callable +import math + +import numpy as np # type: ignore + +import tvm # type: ignore +from tvm import te +from tvm import relay +from tvm import ir +from tvm.relay.dataflow_pattern import DFPatternCallback # type: ignore +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.dataflow_pattern import CallPattern +from tvm.relay.frontend.common import infer_shape as _infer_shape +from tvm.relay.frontend.common import infer_type as _infer_type +from tvm.relay.expr_functor import ExprMutator, ExprVisitor + +from tvm.relay.op import _make # type: ignore + +from .pattern_table import * # type: ignore + +from .environment import Environment + +env = Environment.instance() + + +def gemmini_gemm( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + bias: tvm.relay.Expr, + ifm_scale: float, + ifm_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.gemm operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 (weights) + bias (tvm.relay.Expr): Biases + ifm_scale (float): Input feature map scaling factor + ifm_offset (float): Input feature map offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.gemm operator + """ + return _make.gemmini_gemm( + ifm1, ifm2, bias, ifm_scale, ifm_offset, bias_scale, bias_offset, ofm_scale, ofm_offset + ) + + +def gemmini_add( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + ifm1_scale: float, + ifm1_offset: float, + ifm2_scale: float, + ifm2_offset: float, + ofm_scale: float, + ofm_offset: float, + shape: Tuple[int, ...], +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.add operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 + ifm1_scale (float): Input feature map 1 scaling factor + ifm1_offset (float): Input feature map 1 offset + ifm2_scale (float): Input feature map 2 scaling factor + ifm2_offset (float): Input feature map 2 offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + shape (Tuple[int,...]): Shape of the input feature maps and the output feature map + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.add operator + """ + return _make.gemmini_add( + ifm1, + ifm2, + ifm1_scale, + ifm1_offset, + ifm2_scale, + ifm2_offset, + ofm_scale, + ofm_offset, + shape, + ) + + +def gemmini_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, + has_pool: bool, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + input_req_offset_out: tvm.relay.Expr, + has_activation: bool, + activation_scale_in: tvm.relay.Expr, + activation_offset_in: tvm.relay.Expr, + activation_scale_out: tvm.relay.Expr, + 
activation_offset_out: tvm.relay.Expr, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.conv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling factor + ifm_offset (float): Input feature map offset + weights_scale (float): Weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases weights offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + activation (bool): TODO (FP): see if this can be deleted! Has activation? + has_pool (bool): Has pooling layer after the output of the convolution? + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + input_req_offset_out (tvm.relay.Expr): Requantize layer output offset + has_activation (bool): Has activation? + activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input scaling factor + activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input offset + activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output scaling factor + activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.conv2d operator + """ + return _make.gemmini_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + has_pool, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + input_req_offset_out, + has_activation, + activation_scale_in, + activation_offset_in, + activation_scale_out, + activation_offset_out, + ) + + +def gemmini_depthwise_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.depthwiseconv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling + ifm_offset (float): Input feature map offset + weights_scale (float): Convolution weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Convolution biases scaling factor + bias_offset (float): Convolution biases offset + ofm_scale (float): Output feature map scaling + ofm_offset (float): Output feature map offset + activation (bool): Has activation? 
+ + Returns: + tvm.relay.Call: Call to the contrib.gemmini.depthwiseconv2d operator + """ + return _make.gemmini_depthwise_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + ) + + +def gemmini_max_pool2d( + ifm: tvm.relay.Expr, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + shape: tuple, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.max_pool2d operator + + Args: + ifm (tvm.relay.Expr): Input feature map + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + shape (tuple): Input shape + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.max_pool2d operator + """ + return _make.gemmini_max_pool2d( + ifm, pool_size, pool_strides, pool_dilation, pool_padding, shape + ) + + +class AddRewriter(DFPatternCallback): + """Convert add related composite functions into contrib.gemmini.add operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": AddParams.composite_name}))( + wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = AddParams(post.op.body) + gemmini_add_op = gemmini_add( + post.args[0], + post.args[1], + params.ifm1_scale, + params.ifm1_offset, + params.ifm2_scale, + params.ifm2_offset, + params.ofm_scale, + params.ofm_offset, + params.output_shape, + ) + return gemmini_add_op + + +class GEMMRewriter(DFPatternCallback): + """Convert gemm related composite functions into contrib.gemmini.gemm operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": GEMMParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = GEMMParams(post.op.body) + gemmini_gemm_op = gemmini_gemm( + post.args[0], + post.args[1], + post.args[2], + params.ifm_scale, + params.ifm_offset, + params.bias_scale, + params.bias_offset, + params.ofm_scale, + params.ofm_offset, + ) + return gemmini_gemm_op + + +class CONV2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into contrib.gemmini.conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.weights_index = 1 + self.bias_index = 2 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = CONV2DParams(post.op.body) + if params.has_external_pad: + self.weights_index = 2 + self.bias_index = 3 + else: + self.weights_index = 1 + self.bias_index = 2 + + bias = post.args[self.bias_index] + + if params.has_input_requantize: + data = relay.cast(post.args[self.data_index], "int8") + else: + data = post.args[self.data_index] + + if params.is_depthwise: + reshaped_weights = relay.squeeze( + relay.transpose(post.args[self.weights_index], [3, 0, 1, 2]), axis=[3] + ) + gemmini_depthwise_conv2d_op = 
gemmini_depthwise_conv2d( + data=data, + weights=reshaped_weights, + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + ) + return gemmini_depthwise_conv2d_op + else: + gemmini_conv2d_op = gemmini_conv2d( + data=data, + weights=post.args[self.weights_index], + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + has_pool=params.has_pool, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + input_req_offset_out=params.input_offset_out, + has_activation=params.has_activation, + activation_scale_in=params.activation_scale_in, + activation_offset_in=params.activation_offset_in, + activation_scale_out=params.activation_scale_out, + activation_offset_out=params.activation_offset_out, + ) + return gemmini_conv2d_op + + +class CONV2DExternalPadRewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + + +class CONV2DExternalPadAndRelu6Rewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.min_index = 4 + + +class MAXPOOL2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into gemmini_max_pool2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": MaxPoolParams.composite_name}))( + wildcard() + ) + self.data_index = 0 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = MaxPoolParams(post.op.body) + + data = post.args[self.data_index] + + gemmini_max_pool2d_op = gemmini_max_pool2d( + ifm=data, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + shape=params.shape, + ) + return gemmini_max_pool2d_op + + +@ir.transform.module_pass(opt_level=1) +class LegalizeAdd: + """This is the pass that wraps the AddRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(AddRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeMaxPool2D: + """This is the pass that wraps the MAXPOOL2DRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(MAXPOOL2DRewriter(), func) + mod.update_func(global_var, func) + 
return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeGEMM: + """This is the pass that wraps the GEMMRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(GEMMRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2D: + """This is the pass that wraps the CONV2DRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2DExternalPad: + """This is the pass that wraps the CONV2DExternalPadRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DExternalPadRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeCONV2DExternalPadAndRelu6: + """This is the pass that wraps the CONV2DExternalPadAndRelu6Rewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(CONV2DExternalPadAndRelu6Rewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeGemmini: + """This is the pass to call graph-rewrites to perform graph transformation + in a way such that the operations are replaced with hardware/codegen supported + operations. + """ + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + """This is the method that replaces the operations with hardware/codegen supported + operations. + """ + mod = LegalizeCONV2DExternalPadAndRelu6()(mod) + mod = LegalizeCONV2DExternalPad()(mod) + mod = LegalizeAdd()(mod) + mod = LegalizeCONV2D()(mod) + mod = LegalizeGEMM()(mod) + mod = LegalizeMaxPool2D()(mod) + return mod + + def __call__(self, *args, **kwargs): + # pylint is unable figure out the decorated + # class is callable, thus adding this to + # suppress the warning. + pass diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py new file mode 100644 index 000000000000..a43f10699c75 --- /dev/null +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -0,0 +1,469 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Pattern table declaring the supported Gemmini operators +===================== +**Author**: `Federico Peccia `_ +""" + +from typing import Dict, List, Tuple, Callable, Optional + +import tvm # type: ignore +from tvm import relay +from tvm.target import Target +from tvm.relay.build_module import bind_params_by_name # type: ignore +from tvm.relay.op.contrib.register import register_pattern_table # type: ignore +from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op +from .utils import * + +from tvm.topi.utils import const_vector, get_const_int, get_const_float +from tvm.relay.frontend.common import infer_shape as _infer_shape +from tvm.relay.frontend.common import infer_type as _infer_type + +from .environment import Environment + +env = Environment.instance() + + +class GEMMParams: + """ + This class will parse a Call to a gemmini.gemm composite function + """ + + composite_name = "gemmini.gemm" + + def __init__(self, func_body: tvm.relay.Function): + + dense_op = func_body.args[0] + self.weights = func_body.args[1] + requantize_op = func_body + + bias_add = requantize_op.args[0] + self.bias = bias_add.args[1] + dense_op = bias_add.args[0] + self.ifm_scale = dense_op.args[QDenseArgs.IFM_SCALE.value] + self.ifm_offset = dense_op.args[QDenseArgs.IFM_ZERO_POINT.value] + + if requantize_op.op.name == "qnn.requantize": + self.merge_requantize = True + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = requantize_op.args[RequantArgs.OFM_SCALE.value] + self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value] + else: + self.merge_requantize = False + self.bias_scale = tvm.relay.const([1.0], "float") + self.bias_offset = tvm.relay.const(0, "int32") + self.ofm_scale = tvm.relay.const(1.0, "float") + self.ofm_offset = tvm.relay.const(0, "int32") + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.gemm has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + return True + + +class AddParams: + """ + This class will parse a Call to a gemmini.add composite function + """ + + composite_name = "gemmini.add" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + if str(func_body.op) in self.activation_map.keys(): + activation = func_body + add_op = func_body.args[0] + else: + add_op = func_body + + self.ifm1_scale = add_op.args[BinaryElementwiseArgs.IFM1_SCALE.value] + self.ifm1_offset = add_op.args[BinaryElementwiseArgs.IFM1_ZERO_POINT.value] + self.ifm2_scale = add_op.args[BinaryElementwiseArgs.IFM2_SCALE.value] + self.ifm2_offset = add_op.args[BinaryElementwiseArgs.IFM2_ZERO_POINT.value] + self.ofm_scale = add_op.args[BinaryElementwiseArgs.OFM_SCALE.value] + self.ofm_offset = add_op.args[BinaryElementwiseArgs.OFM_ZERO_POINT.value] + self.output_shape = _infer_shape(add_op) + self.ifm1_shape = _infer_shape(add_op.args[0]) + self.ifm2_shape = _infer_shape(add_op.args[1]) + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.add has compatible 
attributes with the Gemmini + """ + # TODO (FP): complete this validation + # We only support 4 dimensions add operators... for now + if len(self.output_shape) != 4: + return False + if self.ifm1_shape != self.ifm2_shape: + return False + return True + + +class CONV2DParams: + """ + This class will parse a Call to a gemmini.conv2d composite function + """ + + composite_name = "gemmini.conv2d" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + activation = None + self.pool_size = [0, 0] + self.pool_strides = [0, 0] + self.pool_padding = [0, 0, 0, 0] + self.pool_dilation = [0, 0] + self.has_pool = False + self.has_activation = False + self.a_min = None + self.a_max = None + self.has_external_pad = False + self.activation_scale_in = tvm.relay.const(1.0, "float") + self.activation_offset_in = tvm.relay.const(0, "int32") + self.activation_scale_out = tvm.relay.const(1.0, "float") + self.activation_offset_out = tvm.relay.const(0, "int32") + + _op = func_body + + if _op.args[0].op.name != "nn.bias_add": + + if _op.op.name == "clip": + _op = _op.args[0] + else: + + if _op.op.name == "nn.max_pool2d": + max_pool = _op + self.pool_size = max_pool.attrs.pool_size + self.pool_strides = max_pool.attrs.strides + self.pool_padding = max_pool.attrs.padding + self.pool_dilation = max_pool.attrs.dilation + self.has_pool = True + _op = max_pool.args[0] + + if _op.op.name == "clip": + _op = _op.args[0] + elif _op.args[0].op.name == "clip": + self.activation_scale_in = _op.args[RequantArgs.IFM_SCALE.value] + self.activation_offset_in = _op.args[RequantArgs.IFM_ZERO_POINT.value] + self.activation_scale_out = _op.args[RequantArgs.OFM_SCALE.value] + self.activation_offset_out = _op.args[RequantArgs.OFM_ZERO_POINT.value] + clip = _op.args[0] + self.has_activation = True + _min = clip.args[0] + self.a_min = clip.attrs.a_min + self.a_max = clip.attrs.a_max + _op = _min.args[0] + + requantize_op = _op + + bias_add = requantize_op.args[0] + + conv2d_op = bias_add.args[0] + + self.has_input_requantize = False + self.input_scale_in = tvm.relay.const(1.0, "float") + self.input_offset_in = tvm.relay.const(0, "int32") + self.input_scale_out = tvm.relay.const(1.0, "float") + self.input_offset_out = tvm.relay.const(0, "int32") + + self.output_shape = _infer_shape(conv2d_op) + self.strides = conv2d_op.attrs.strides + self.padding = conv2d_op.attrs.padding + self.groups = conv2d_op.attrs.groups + self.is_depthwise = self.groups == conv2d_op.attrs.channels and self.groups != 1 + self.data = conv2d_op.args[0] + self.input_shape = _infer_shape(self.data) + if ( + not isinstance(self.data, relay.expr.Var) + and not isinstance(self.data.op, relay.function.Function) + and self.data.op.name == "nn.pad" + ): + padding = self.data.attrs.pad_width + self.padding = [padding[1][0], padding[1][1], padding[2][0], padding[2][1]] + self.has_external_pad = True + self.weights = conv2d_op.args[1] + self.weights_shape = _infer_shape(self.weights) + self.bias = bias_add.args[1] + self.ifm_scale = float(conv2d_op.args[QConv2DArgs.IFM_SCALE.value].data.numpy()) + self.ifm_offset = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.ifm_offset_const = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.weights_scale = 1.0 + self.weights_offset = 0.0 + + if requantize_op.op.name == "qnn.requantize": + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = 
requantize_op.args[RequantArgs.OFM_SCALE.value] + self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value] + else: + self.bias_scale = tvm.relay.const([1.0], "float") + self.bias_offset = tvm.relay.const(0, "int32") + self.ofm_scale = tvm.relay.const(1.0, "float") + self.ofm_offset = tvm.relay.const(0, "int32") + + if activation is not None: + self.activation = False + else: + self.activation = False + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.conv2d has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + if len(set(self.pool_padding)) != 1 or len(set(self.pool_strides)) != 1: + return False + + if self.has_input_requantize: + if ( + self.input_scale_in.data.numpy() != self.input_scale_out.data.numpy() + or self.input_offset_in.data.numpy() != 0 + ): + # Only this specific cases are supported, for now... + return False + + if self.a_max is not None and self.a_max != 127: + return False + + return True + + +class DepthwiseCONV2DParams(CONV2DParams): + """ + This class will parse a Call to a gemmini.depthwiseconv2d composite function + """ + + composite_name = "gemmini.depthwiseconv2d" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + super().__init__(func_body) + + +class MaxPoolParams: + """ + This class will parse a Call to a gemmini.max_pool2d composite function + """ + + composite_name = "gemmini.max_pool2d" + + def __init__(self, func_body: tvm.relay.Function): + self.pool_size = func_body.attrs.pool_size + self.pool_strides = func_body.attrs.strides + self.pool_padding = func_body.attrs.padding + self.pool_dilation = func_body.attrs.dilation + self.shape = _infer_shape(func_body) + + def is_valid(self) -> bool: + """ + This function checks whether max_pool2d has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation? + if len(set(self.pool_padding)) != 1: + return False + if (self.shape[1] != self.shape[2]) or self.shape[1] == 1: + return False + return True + + +def make_dense_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.dense. + + Parameters + ---------- + + Returns + ------- + dense_out : CallPattern + Call node sequence. + """ + data = wildcard() + weight = wildcard() + bias = wildcard() + dense = is_op("qnn.dense")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ) + bias_add = is_op("nn.bias_add")( + dense, + bias, + ) + req = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + return req + + +def make_add_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.add. + + Parameters + ---------- + + Returns + ------- + add_out : CallPattern + Call node sequence. + """ + ifm1 = wildcard() + ifm2 = wildcard() + add_out = is_op("qnn.add")( + ifm1, + ifm2, + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + ) + clip_or_req = add_out.optional(is_op("clip")) + return clip_or_req + + +def make_conv2d_pattern( + with_padded_input: bool = False, with_maxpool: bool = False, with_relu_6: bool = False +) -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.conv2d. + + Parameters + ---------- + + Returns + ------- + conv2d_out : CallPattern + Call node sequence. 
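+
+    Example
+    -------
+    A minimal sketch of how this factory is combined with a validity check
+    when registering patterns (mirrors ``pattern_table()`` below)::
+
+        entry = (
+            CONV2DParams.composite_name,
+            make_conv2d_pattern(with_padded_input=True, with_relu_6=True),
+            lambda pat: CONV2DParams(pat).is_valid(),
+        )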
+ """ + data = wildcard() + if with_padded_input: + data = is_op("nn.pad")(data, wildcard()) + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + if with_relu_6: + output = is_op("minimum")(output, wildcard()) + output = is_op("clip")(output) + output = is_op("qnn.requantize")( + output, is_constant(), is_constant(), is_constant(), is_constant() + ) + else: + output = output.optional(is_op("clip")) + if with_maxpool: + output = output.optional(is_op("nn.max_pool2d")) + return output + else: + return output + + +def make_depthwiseconv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.conv2d, but only if it is a depthwise convolution. + + Parameters + ---------- + + Returns + ------- + conv2d_out : CallPattern + Call node sequence. + """ + data = wildcard() + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ).has_attr({"kernel_layout": "HWOI"}) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + clip_or_req = output.optional(is_op("clip")) + return clip_or_req + + +def make_maxpool_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to nn.max_pool2d. + + Parameters + ---------- + + Returns + ------- + max_pool2d : CallPattern + Call node sequence. + """ + max_pool2d = is_op("nn.max_pool2d")(wildcard()) + return max_pool2d + + +@register_pattern_table("gemmini") +def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + + pattern_table_filters = [] + pattern_table_filters.append( + ( + GEMMParams.composite_name, + make_dense_pattern(), + lambda pat: GEMMParams(pat).is_valid(), + ) + ) + + for pad in [True, False]: + for max_pool in [True, False]: + for relu6 in [True, False]: + pattern_table_filters.append( + ( + CONV2DParams.composite_name, + make_conv2d_pattern( + with_padded_input=pad, with_maxpool=max_pool, with_relu_6=relu6 + ), + lambda pat: CONV2DParams(pat).is_valid(), + ) + ) + + pattern_table_filters.append( + ( + MaxPoolParams.composite_name, + make_maxpool_pattern(), + lambda pat: MaxPoolParams(pat).is_valid(), + ) + ) + + if env.use_experimental_qnn_add: + pattern_table_filters.append( + ( + AddParams.composite_name, + make_add_pattern(), + lambda pat: AddParams(pat).is_valid(), + ) + ) + + return pattern_table_filters diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py new file mode 100644 index 000000000000..312217cc8210 --- /dev/null +++ b/python/tvm/contrib/gemmini/transform.py @@ -0,0 +1,816 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name
+"""
+Transformation passes for Gemmini
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import ast
+from typing import Dict
+
+import tvm
+from tvm.tir.ir_builder import IRBuilder
+
+from .environment import Environment
+
+env = Environment.instance()
+
+
+def _get_counters(irb: IRBuilder):
+    """Generates calls to print the values of the configured performance counters
+
+    Args:
+        irb (IRBuilder): IRBuilder
+    """
+    irb.emit(tvm.tir.call_extern("", "counter_snapshot_take"))
+    irb.emit(tvm.tir.call_extern("", "printf", "Counter values:\\r\\n"))
+    counter_vars = []
+    for i, (_, value) in enumerate(env.enabled_counters.items()):
+        counter_var = irb.let(
+            value.lower() + "_var", tvm.tir.call_extern("uint32", "counter_read", i)
+        )
+        counter_vars.append(counter_var)
+        irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%s," % value)))
+    irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n"))
+    for c in counter_vars:
+        irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%lu,"), c))
+    irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n"))
+
+
+def _configure_timers(irb: IRBuilder):
+    """Generates calls to configure the enabled counters
+
+    Args:
+        irb (IRBuilder): IRBuilder
+    """
+    for i, (key, _) in enumerate(env.enabled_counters.items()):
+        irb.emit(tvm.tir.call_extern("", "counter_configure", i, key))
+
+
+def _reset_counters(irb: IRBuilder):
+    """Generates calls to reset all Gemmini counters
+
+    Args:
+        irb (IRBuilder): IRBuilder
+    """
+    irb.emit(tvm.tir.call_extern("", "counter_reset"))
+    irb.emit(tvm.tir.call_extern("", "counter_snapshot_reset"))
+
+
+def _match_pragma(stmt, key):
+    """Internal helper to match stmt to pragma stmt.
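+
+    For example, ``_match_pragma(stmt, "gemm_start")`` matches both an
+    AttrStmt whose attr_key is "pragma_gemm_start" and the generic
+    "pragma_scope" form whose value is "gemm_start".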
+ + Parameters + ---------- + stmt : Stmt + The AttrStmt + + key : str + The pragma key + """ + return (stmt.attr_key == "pragma_" + key) or ( + stmt.attr_key == "pragma_scope" and stmt.value.value == key + ) + + +def _get_config_dict_from_str(str_value: str) -> Dict: + """Returns a configuration dictionary from its string representation + + Args: + str_value (str): Dictionary encoded in a string + + Returns: + Dict: Configuration dictionary + """ + return ast.literal_eval(str(str_value).replace("'", '"').replace('"{', "{").replace('}"', "}")) + + +def _gen_debug_header(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug headers for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _configure_timers(irb) + _reset_counters(irb) + + +def _gen_debug_tail(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug tails for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _get_counters(irb) + + +def InsertGemminiHeaderOperators(): + """Pass to generate the calls to the Gemmini configuration instructions""" + + def _do_fold(stmt): + if _match_pragma(stmt, "add_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + ifm1_scale = config_dict["ifm1_scale"] + ifm2_scale = config_dict["ifm2_scale"] + scale = config_dict["scale"] + act = 1 if activation else 0 + + shrunk = 1 + irb.emit(tvm.tir.call_extern("", "gemmini_config_ex", mode, act, 0)) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + A_size, + ifm1_scale, + shrunk, + A_private_stride, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + B_size, + ifm2_scale, + shrunk, + B_private_stride, + 1, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended4_config_ld", C_size * 4, scale, 0, env.DIM, 2 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + scale = config_dict["scale"] + padding_value = config_dict["padding_value"] + act = 1 if activation else 0 + + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended_config_ex", mode, act, 0, execution_stride, 0, 0 + ) + ) + if padding_value == 0: + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended5_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + 0, + ) + ) + else: + 
irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended6_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + padding_value, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended5_config_ld", B_size, 1.0, 0, B_private_stride, 1, 1 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended4_config_ld", 0, 1.0, 0, env.DIM, 2)) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_cisc_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "conv2d_cisc_start") or _match_pragma( + stmt, "dw_conv2d_cisc_start" + ): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + return None + + def _ftransform(f, mod, ctx): + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_header_operators" + ) + + +def InsertGemminiFenceOperator(): + """Pass to generate the call to the fence instruction at the end of the operator""" + + func_name = "" + + def _do_fold(stmt): + if _match_pragma(stmt, "gemm_end"): + irb = tvm.tir.ir_builder.create() + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + _gen_debug_tail(irb) + + return tvm.tir.SeqStmt([stmt, irb.get()]) + return None + + def _ftransform(f, mod, ctx): + func_name = f.attrs["global_symbol"] + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_fence_operators" + ) + + +def InjectAMVINIntrin(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin, _inject_copy) + + +def InjectAMVINIntrinTransposed(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! 
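+            # Transposed load: shape[0] supplies the column count and shape[1]
+            # the row count (swapped with respect to InjectAMVINIntrin above),
+            # so the mvin streams the buffer into the scratchpad in transposed
+            # order.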
+ if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin + "_t", _inject_copy) + + +def InjectBMVINIntrin(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "int32") + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("r", "int32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + wgt_base_address + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin, _inject_copy) + + +def InjectBMVINIntrinTransposed(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin + "_t", _inject_copy) + + +def InjectDMVINIntrin(): + """Pass to inject D mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
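+        # The destination offset below is rebased onto OUT_ACC_BASE_ADDRESS and
+        # the constant 0x40000000 is subtracted before the address is handed to
+        # gemmini_extended_mvin3.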
+ _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin, _inject_copy) + + +def InjectDMVINIntrinTransposed(): + """Pass to inject D mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin + "_t", _inject_copy) + + +def InjectCMVOUTIntrin(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + elif dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + if len(dst.shape) == 1: + cols = 1 + else: + cols = dst.shape[1] + rows = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout, _inject_copy) + + +def InjectCMVOUTIntrinTransposed(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + elif dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! 
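+            # Transposed store: dst.shape[0] supplies the column count and
+            # dst.shape[1] the row count; get_full_width stays an explicit 0
+            # so the full-width flag is easy to spot when reading the mvout.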
+ if len(dst.shape) == 1: + rows = 1 + else: + rows = dst.shape[1] + cols = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout + "_t", _inject_copy) + + +def InjectCMVINIntrin(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin, _inject_copy) + + +def InjectCMVINIntrinTransposed(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin + "_t", _inject_copy) + + +def InjectCMVINAccumIntrin(): + """Pass to inject C mvin accum intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
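+        # Compared to InjectCMVINIntrin above, this accumulate variant keeps
+        # the full accumulator address (no 0x40000000 subtraction) and issues
+        # gemmini_extended_mvin3 instead of gemmini_extended_mvin.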
+ _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum, _inject_copy) + + +def InjectCMVINAccumIntrinTransposed(): + """Pass to inject C mvin accum intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum + "_t", _inject_copy) diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb new file mode 100644 index 000000000000..2c2527830858 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb @@ -0,0 +1,311 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MobileNet tutorial\n", + "\n", + "This tutorials shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm\n", + "from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We clean and prepare the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "os.system(\"mkdir -p include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tflite_model_dir = generate_mobilenet_tflite_model()\n", + "\n", + "input_image = get_real_image(224, 224)\n", + "\n", + "tflite_model_file = os.path.join(tflite_model_dir)\n", + "tflite_model_buf = open(tflite_model_file, \"rb\").read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "tflite_res = run_tflite_model(tflite_model_buf, input_image)\n", + "tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]\n", + "print(\"Expected argmax = %i\" % (tflite_pred[0],))\n", + "print(\"Expected max labels = %s\" % (tflite_pred,))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_image, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", tflite_pred.astype(np.uint32), \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dtype_dict = {\"input\": input_image.dtype.name}\n", + "shape_dict = {\"input\": input_image.shape}\n", + "\n", + "mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)\n", + "mod = relay.transform.InferType()(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", + "\n", + "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", + "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", + "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", + "\n", + "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", + " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "os.system(\"mkdir dev\")\n", + "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", + "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", + "\n", + "import tarfile\n", + "\n", + "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", + " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"mobilenet_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('tvm': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py new file mode 100644 index 000000000000..51e75fdd7022 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
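+# NOTE: these helpers assume TensorFlow (with TFLite support) and Pillow are
+# installed. The converter in create_tflite_model() deliberately sets
+# _experimental_disable_per_channel = True: per-tensor quantization is one of
+# the "specific flags" the Gemmini integration expects from input models.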
+""" +Utils to help generate the MobileNet TFLite model +===================== +**Author**: `Federico Peccia `_ +""" + +import os +from tvm.contrib.download import download_testdata +import numpy as np +import tensorflow as tf + + +def get_real_image(im_height, im_width): + from PIL import Image + + repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" + img_name = "elephant-299.jpg" + image_url = os.path.join(repo_base, img_name) + img_path = download_testdata(image_url, img_name, module="data") + image = Image.open(img_path).resize((im_height, im_width)) + x = np.array(image).astype("uint8") + data = np.reshape(x, (1, im_height, im_width, 3)) + return data + + +def run_tflite_model(tflite_model_buf, input_data): + """Generic function to execute TFLite""" + try: + from tensorflow import lite as interpreter_wrapper + except ImportError: + from tensorflow.contrib import lite as interpreter_wrapper + + input_data = input_data if isinstance(input_data, list) else [input_data] + + interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + # set input + assert len(input_data) == len(input_details) + for i in range(len(input_details)): + interpreter.set_tensor(input_details[i]["index"], input_data[i]) + + # Run + interpreter.invoke() + + # get output + tflite_output = list() + for i in range(len(output_details)): + tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) + + return tflite_output + + +def download_model(): + model_url = ( + "https://storage.googleapis.com/download.tensorflow.org/models/" + "tflite_11_05_08/mobilenet_v2_1.0_224.tgz" + ) + + # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite + model_path = download_testdata( + model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"] + ) + model_dir = os.path.dirname(model_path) + + return model_dir, model_path + + +def extract(path): + import tarfile + + if path.endswith("tgz") or path.endswith("gz"): + dir_path = os.path.dirname(path) + tar = tarfile.open(path) + tar.extractall(path=dir_path) + tar.close() + else: + raise RuntimeError("Could not decompress the file: " + path) + + +def create_tflite_model(model_dir: str): + # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0] + # return f"{model_dir}/{tflite_model_name}" + def representative_data_gen(): + dataset = [ + np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32) + for s in range(100) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0] + converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( + f"{model_dir}/{pb_file}", + input_arrays=["input"], + input_shapes={"input": [1, 224, 224, 3]}, + output_arrays=["MobilenetV2/Predictions/Reshape"], + ) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS] + converter.inference_input_type = tf.uint8 + converter.inference_output_type = tf.uint8 + converter.representative_dataset = representative_data_gen + converter._experimental_disable_per_channel = True + + tflite_model = converter.convert() + tflite_model_name = 
pb_file.replace(".pb", ".tflite")
+    with open(f"{model_dir}/{tflite_model_name}", "wb") as f:
+        f.write(tflite_model)
+
+    return f"{model_dir}/{tflite_model_name}"
+
+
+def generate_mobilenet_tflite_model():
+    """Download, extract and quantize the MobileNet V2 model, returning the .tflite path."""
+    model_dir, model_path = download_model()
+    extract(model_path)
+    return create_tflite_model(model_dir)
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb
new file mode 100644
index 000000000000..3bb2fa5788e9
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Add layer tutorial\n",
+    "\n",
+    "This tutorial shows how a quantized add layer can be compiled and executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.\n",
+    "\n",
+    "Note: This is an **experimental** layer!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.keras import layers\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import tvm.contrib.gemmini as gemmini\n",
+    "from tvm import relay\n",
+    "import tvm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We need to set the environment variable CHIPYARD_HOME (pointing to your Chipyard checkout), so that the Spike simulator can be run correctly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\"CHIPYARD_HOME\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we define the parameters of the layer we want to test. In this case:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_height = 16\n",
+    "input_width = 16\n",
+    "input_channels = 16\n",
+    "activation = 0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will generate a pre-quantized TFLite model, because for now the Gemmini integration only accepts models that were quantized with the specific converter flags shown below.\n",
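+    "\n",
+    "As a quick sanity check, after running the conversion cell you can confirm that the converted model really uses integer inputs and outputs. This is a minimal sketch (it assumes the `tflite_model` buffer produced by the cell below):\n",
+    "\n",
+    "```python\n",
+    "check = tf.lite.Interpreter(model_content=tflite_model)\n",
+    "print(check.get_input_details()[0][\"dtype\"])   # expect numpy.uint8\n",
+    "print(check.get_output_details()[0][\"dtype\"])  # expect numpy.int8\n",
+    "```"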
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " ),\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " ),\n", + " ]\n", + " )\n", + " def add(self, x, y):\n", + " if activation == 0:\n", + " return x + y\n", + " else:\n", + " return layers.Activation(\"relu\")(x + y)\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " (\n", + " np.array(\n", + " np.random.randint(\n", + " -127, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " ),\n", + " np.array(\n", + " np.random.randint(\n", + " 0, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " ),\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " yield [input_value[0], input_value[1]]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"add.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./add.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", + "interpreter.set_tensor(input_details[1][\"index\"], input_matrix_2)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input_1\", input_matrix_2, \"./include\")\n", + "gemmini.create_header_file(\"inputs\", \"data\", \"input_2\", input_matrix_1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod, params = relay.frontend.from_tflite(\n",
+    "    tflite_model,\n",
+    "    shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels), \"serving_default_y\": (1, input_height, input_width, input_channels)},\n",
+    "    dtype_dict={\"serving_default_x\": input_dtype, \"serving_default_y\": input_dtype},\n",
+    ")\n",
+    "mod = relay.transform.InferType()(mod)\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To build a model for the Gemmini accelerator, we first need to replace all supported layers with Gemmini-specific operators. This is done using the __gemmini.preprocess_pass__ transformation. Notice the changes in the \"main\" function after running the pass."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod = gemmini.preprocess_pass(mod)\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we build the Relay graph. Notice that we use the CRT runtime, that the target is C because we want to generate C code (while the device is Gemmini), and that we use the AOT executor together with the USMP feature, in order to get completely bare-metal C code without calls to memory allocator APIs.\n",
+    "\n",
+    "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+    "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+    "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+    "\n",
+    "with gemmini.build_config(usmp_alg=\"hill_climb\", opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+    "    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The built model is exported in the Model Library Format. This will be used in the next steps to generate the baremetal project."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "\n",
+    "os.system(\"mkdir dev\")\n",
+    "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+    "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+    "\n",
+    "import tarfile\n",
+    "\n",
+    "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+    "    print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.\n",
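+    "\n",
+    "The `project_type` option selects one of the example projects shipped with the Gemmini template (`add_example` here; the other tutorials use `conv2d_example`, `dense_example`, `dwconv2d_example`, `maxpool2d_example` and `mobilenet_example`)."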
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"add_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator.\n", + "\n", + "Note: if there are errors, these can be related to rounding errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('tvm': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb new file mode 100644 index 000000000000..c7512586b809 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2D convolution layer tutorial\n", + "\n", + "This tutorials shows how a quantized 2d convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers\n", + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we define the parameters of the layer we want to test. 
In this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_height = 16\n", + "input_width = 16\n", + "input_channels = 16\n", + "output_channels = 16\n", + "kernel_size = 3\n", + "stride = 1\n", + "padding = 'valid'\n", + "activation = None\n", + "bias = True\n", + "\n", + "# We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator.\n", + "pool_size = 1\n", + "pool_stride = 1\n", + "pool_padding = 'valid'\n", + "use_pool = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "layer_sequence = [\n", + " layers.Conv2D(\n", + " output_channels,\n", + " kernel_size=kernel_size,\n", + " padding=padding,\n", + " activation=activation,\n", + " use_bias=True,\n", + " bias_initializer=\"ones\",\n", + " input_shape=(input_height, input_width, input_channels),\n", + " strides=stride,\n", + " )\n", + "]\n", + "if use_pool:\n", + " layer_sequence.append(\n", + " layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)\n", + " )\n", + "\n", + "model = keras.Sequential(layer_sequence)\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)\n", + " for s in range(10)\n", + " ]\n", + " for input_value in dataset:\n", + " # Model has only one input so each data point has one element.s\n", + " yield [input_value]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"conv.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./conv.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=\"./conv.tflite\")\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix)\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", + "\n", + "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", + "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", + "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", + "\n", + "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", + " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "os.system(\"mkdir dev\")\n", + "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", + "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", + "\n", + "import tarfile\n", + "\n", + "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", + " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"conv2d_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator.\n", + "\n", + "Note: if there are errors, these can be related to rounding errors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('tvm': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb new file mode 100644 index 000000000000..d1959f66b72a --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dense layer tutorial\n", + "\n", + "This tutorials shows how a quantized dense (fully connected) layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we define the parameters of the layer we want to test. In this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_height = 32\n", + "input_width = 32\n", + "output_width = 32" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + " self.w = tf.Variable(tf.random.normal([input_width, output_width]), name=\"w\")\n", + " self.b = tf.Variable(tf.random.normal([output_width]), name=\"b\")\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),\n", + " ]\n", + " )\n", + " def matmul(self, x):\n", + " return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " (\n", + " np.array(\n", + " np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32\n", + " ),\n", + " np.array(\n", + " np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32\n", + " ),\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " # Model has only one input so each data point has one element.\n", + " yield [input_value[0]]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"matmul.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./matmul.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)\n", + "interpreter.set_tensor(input_details[0][\"index\"], input1)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model,\n", + " shape_dict={\n", + " \"serving_default_x:0\": (input_height, input_width),\n", + " },\n", + " dtype_dict={\n", + " \"serving_default_x:0\": input_dtype,\n", + " },\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", + "\n", + "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", + "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", + "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", + "\n", + "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", + " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "os.system(\"mkdir dev\")\n", + "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", + "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", + "\n", + "import tarfile\n", + "\n", + "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", + " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"dense_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator.\n", + "\n", + "Note: if there are errors, these can be related to rounding errors." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.flash()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 ('tvm': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
new file mode 100644
index 000000000000..b5753a300401
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
@@ -0,0 +1,373 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2D depthwise convolution layer tutorial\n",
+    "\n",
+    "This tutorial shows how a quantized 2D depthwise convolution layer can be compiled and executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow import keras\n",
+    "from tensorflow.keras import layers\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import tvm.contrib.gemmini as gemmini\n",
+    "from tvm import relay\n",
+    "import tvm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We need to set the environment variable CHIPYARD_HOME (pointing to your Chipyard checkout), so that the Spike simulator can be run correctly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\"CHIPYARD_HOME\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we define the parameters of the layer we want to test. In this case:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_height = 112\n",
+    "input_width = 112\n",
+    "input_channels = 32\n",
+    "kernel_size = 3\n",
+    "stride = 1\n",
+    "padding = 'same'\n",
+    "activation = None\n",
+    "bias = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will generate a pre-quantized TFLite model, because for now the Gemmini integration only accepts models that were quantized with the specific converter flags shown below.\n",
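+    "\n",
+    "In particular, TFLite normally quantizes depthwise convolutions per channel; setting `_experimental_disable_per_channel = True` below forces per-tensor scales, which is what the integration currently expects."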
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.Sequential(\n",
+    "    [\n",
+    "        layers.DepthwiseConv2D(\n",
+    "            kernel_size=kernel_size,\n",
+    "            padding=padding,\n",
+    "            activation=activation,\n",
+    "            use_bias=True,\n",
+    "            bias_initializer=\"ones\",\n",
+    "            input_shape=(input_height, input_width, input_channels),\n",
+    "            strides=stride,\n",
+    "        )\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# Convert the concrete functions using TFLiteConverter\n",
+    "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "\n",
+    "def representative_data_gen():\n",
+    "    dataset = [\n",
+    "        np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32)\n",
+    "        for s in range(10)\n",
+    "    ]\n",
+    "    for input_value in dataset:\n",
+    "        # The model has only one input, so each data point has one element.\n",
+    "        yield [input_value]\n",
+    "\n",
+    "\n",
+    "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+    "converter.inference_input_type = tf.uint8\n",
+    "converter.inference_output_type = tf.int8\n",
+    "converter.representative_dataset = representative_data_gen\n",
+    "converter._experimental_disable_per_channel = True\n",
+    "\n",
+    "tflite_model = converter.convert()\n",
+    "\n",
+    "# Save the model.\n",
+    "with open(\"dwconv.tflite\", \"wb\") as f:\n",
+    "    f.write(tflite_model)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have created the model, we load it into the TFLite interpreter and run it. We store the output, so that we can later compare it with the output obtained from the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n",
+    "\n",
+    "tflite_file = \"./dwconv.tflite\"\n",
+    "tflite_model_buf = open(tflite_file, \"rb\").read()\n",
+    "input_tensor = \"layer1_input\"\n",
+    "input_dtype = \"uint8\"\n",
+    "\n",
+    "os.system(\"mkdir -p include\")\n",
+    "\n",
+    "try:\n",
+    "    import tflite\n",
+    "\n",
+    "    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n",
+    "except AttributeError:\n",
+    "    import tflite.Model\n",
+    "\n",
+    "    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n",
+    "\n",
+    "# Load the TFLite model and allocate tensors.\n",
+    "interpreter = tf.lite.Interpreter(model_path=tflite_file)\n",
+    "interpreter.allocate_tensors()\n",
+    "input_details = interpreter.get_input_details()\n",
+    "output_details = interpreter.get_output_details()\n",
+    "tensor_details = interpreter.get_tensor_details()\n",
+    "\n",
+    "input_matrix = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8)\n",
+    "interpreter.set_tensor(input_details[0][\"index\"], input_matrix)\n",
+    "\n",
+    "interpreter.invoke()\n",
+    "expected_output = interpreter.get_tensor(output_details[0][\"index\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator and compare the expected output with the one actually produced.\n",
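+    "\n",
+    "The generated headers simply wrap each tensor as a plain C array (roughly `const uint8_t input[1*112*112*32] = {...};` for the input above; the exact emitted layout may differ), so the baremetal test program can use the data without any file I/O."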
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix, \"./include\")\n",
+    "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. Here we use the default parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The TFLite model generated in the previous steps is now imported into TVM."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod, params = relay.frontend.from_tflite(\n",
+    "    tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n",
+    ")\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To build a model for the Gemmini accelerator, we first need to replace all supported layers with Gemmini-specific operators. This is done using the __gemmini.preprocess_pass__ transformation. Notice the changes in the \"main\" function after running the pass."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod = gemmini.preprocess_pass(mod)\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we build the Relay graph. Notice that we use the CRT runtime, that the target is C because we want to generate C code (while the device is Gemmini), and that we use the AOT executor together with the USMP feature, in order to get completely bare-metal C code without calls to memory allocator APIs.\n",
+    "\n",
+    "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+    "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+    "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+    "\n",
+    "with gemmini.build_config(usmp_alg=\"hill_climb\", opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+    "    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The built model is exported in the Model Library Format. This will be used in the next steps to generate the baremetal project."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "os.system(\"mkdir dev\")\n", + "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", + "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", + "\n", + "import tarfile\n", + "\n", + "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", + " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"dwconv2d_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator.\n", + "\n", + "Note: if there are errors, these can be related to rounding errors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('tvm': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb new file mode 100644 index 000000000000..bdee93760f96 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2D max pooling layer tutorial\n", + "\n", + "This tutorials shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.keras import layers\n", + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we define the parameters of the layer we want to test. In this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_height = 16\n", + "input_width = 16\n", + "input_channels = 16\n", + "pool_size = 2\n", + "pool_stride = 1\n", + "pool_padding = 'valid'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " )\n", + " ]\n", + " )\n", + " def maxpool(self, x):\n", + " return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " np.array(\n", + " np.random.randint(\n", + " -127, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " # Model has only one input so each data point has one element.\n", + " yield [input_value]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"maxpool.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./maxpool.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix_1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model,\n", + " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels)},\n", + " dtype_dict={\"serving_default_x\": input_dtype},\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", + "\n", + "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", + "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", + "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", + "\n", + "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", + " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pathlib\n", + "\n", + "os.system(\"mkdir dev\")\n", + "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", + "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", + "\n", + "import tarfile\n", + "\n", + "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", + " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"maxpool2d_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator.\n", + "\n", + "Note: if there are errors, these can be related to rounding errors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py new file mode 100644 index 000000000000..1f9d6b26134f --- /dev/null +++ b/python/tvm/contrib/gemmini/utils.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Useful enumerations and others +===================== +**Author**: `Federico Peccia `_ +""" + +from enum import Enum + +counters = { + 1: "MAIN_LD_CYCLES", + 2: "MAIN_ST_CYCLES", + 3: "MAIN_EX_CYCLES", + 4: "MAIN_LD_ST_CYCLES", + 5: "MAIN_LD_EX_CYCLES", + 6: "MAIN_ST_EX_CYCLES", + 7: "MAIN_LD_ST_EX_CYCLES", + 8: "LOAD_DMA_WAIT_CYCLE", + 9: "LOAD_ACTIVE_CYCLE", + 10: "LOAD_SCRATCHPAD_WAIT_CYCLE", + 11: "STORE_DMA_WAIT_CYCLE", + 12: "STORE_ACTIVE_CYCLE", + 13: "STORE_POOLING_CYCLE", + 14: "STORE_SCRATCHPAD_WAIT_CYCLE", + 15: "DMA_TLB_MISS_CYCLE", + 16: "DMA_TLB_HIT_REQ", + 17: "DMA_TLB_TOTAL_REQ", + 18: "RDMA_ACTIVE_CYCLE", + 19: "RDMA_TLB_WAIT_CYCLES", + 20: "RDMA_TL_WAIT_CYCLES", + 21: "WDMA_ACTIVE_CYCLE", + 22: "WDMA_TLB_WAIT_CYCLES", + 23: "WDMA_TL_WAIT_CYCLES", + 24: "EXE_ACTIVE_CYCLE", + 25: "EXE_FLUSH_CYCLE", + 26: "EXE_CONTROL_Q_BLOCK_CYCLE", + 27: "EXE_PRELOAD_HAZ_CYCLE", + 28: "EXE_OVERLAP_HAZ_CYCLE", + 29: "SCRATCHPAD_A_WAIT_CYCLE", + 30: "SCRATCHPAD_B_WAIT_CYCLE", + 31: "SCRATCHPAD_D_WAIT_CYCLE", + 32: "ACC_A_WAIT_CYCLE", + 33: "ACC_B_WAIT_CYCLE", + 34: "ACC_D_WAIT_CYCLE", + 35: "A_GARBAGE_CYCLES", + 36: "B_GARBAGE_CYCLES", + 37: "D_GARBAGE_CYCLES", + 38: "IM2COL_MEM_CYCLES", + 39: "IM2COL_ACTIVE_CYCLES", + 40: "IM2COL_TRANSPOSER_WAIT_CYCLE", + 41: "RESERVATION_STATION_FULL_CYCLES", + 42: "RESERVATION_STATION_ACTIVE_CYCLES", + 43: "LOOP_MATMUL_ACTIVE_CYCLES", + 44: "TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES", + 45: "RESERVATION_STATION_LD_COUNT", + 46: "RESERVATION_STATION_ST_COUNT", + 47: "RESERVATION_STATION_EX_COUNT", + 48: "RDMA_BYTES_REC", + 49: "WDMA_BYTES_SENT", + 50: "RDMA_TOTAL_LATENCY", + 51: "WDMA_TOTAL_LATENCY", +} + + +class ClipArgs(Enum): + """ + This is a helper enums to obtain the correct index + of clip arguments. 
+ """ + + A_MIN = 1 + A_MAX = 2 + + +class BinaryElementwiseArgs(Enum): + """This is a helper enums to access the correct index + of binary elementwise arguments + """ + + IFM1 = 0 + IFM2 = 1 + IFM1_SCALE = 2 + IFM1_ZERO_POINT = 3 + IFM2_SCALE = 4 + IFM2_ZERO_POINT = 5 + OFM_SCALE = 6 + OFM_ZERO_POINT = 7 + + +class QDenseArgs(Enum): + """ + This is a helper enum to access the correct index of + qnn.dense arguments + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class QConv2DArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.conv2d arguments. + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class RequantArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.requantize arguments. + """ + + IFM_SCALE = 1 + IFM_ZERO_POINT = 2 + OFM_SCALE = 3 + OFM_ZERO_POINT = 4 diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 92574ce2f8c2..b05d0d60d47a 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -38,6 +38,7 @@ class MicroTVMTemplateProject(enum.Enum): ZEPHYR = "zephyr" ARDUINO = "arduino" CRT = "crt" + GEMMINI = "gemmini" @classmethod def list(cls): diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 1ba9f5e73395..5f8469463997 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -519,7 +519,7 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, temp """ targets = [] for target in mod.ir_module_by_target.keys(): - if str(target.kind) not in ("llvm", "c"): + if str(target.kind) not in ("llvm", "c", "gemmini"): raise UnsupportedInModelLibraryFormatError( f"Operator has non-DSO-exportable target {target!s}, which is not yet supported in " "Model Library Format" diff --git a/python/tvm/relay/backend/contrib/gemmini/__init__.py b/python/tvm/relay/backend/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..b68c070cbed9 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/__init__.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini operators compute and schedule declarations +===================== +**Author**: `Federico Peccia `_ +""" + +from . import op diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py new file mode 100644 index 000000000000..9f7837c076e5 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Add operator declaration and schedule registration for Gemmini
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.add")
+def add(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    ifm1: tvm.te.tensor.Tensor,
+    ifm2: tvm.te.tensor.Tensor,
+    ofm_offset: tvm.te.tensor.Tensor,
+    ifm1_scale: float,
+    ifm2_scale: float,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's add operator
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        ifm1 (tvm.te.tensor.Tensor): input tensor 1
+        ifm2 (tvm.te.tensor.Tensor): input tensor 2
+        ofm_offset (tvm.te.tensor.Tensor): offset tensor
+        ifm1_scale (float): scaling factor for input tensor 1
+        ifm2_scale (float): scaling factor for input tensor 2
+
+    Raises:
+        topi.InvalidShapeError: if input shapes are not supported
+
+    Returns:
+        tvm.te.tensor.Tensor: add operator result
+    """
+
+    # Make sure that the input shapes make sense
+    if len(ifm1.shape) != 4 or len(ifm2.shape) != 4 or len(ofm_offset.shape) != 4:
+        raise topi.InvalidShapeError()
+
+    # Derive shapes
+    oshape = topi.utils.get_const_tuple(ifm1.shape)
+
+    tensor_type = env.inp_dtype
+
+    ofm_offset_stage = te.compute(
+        oshape,
+        lambda b, x, y, c: ofm_offset[b, x, y, c].astype(tensor_type),
+        name="ofm_offset.local",
+        tag="ofm_offset",
+    )
+    ifm2_stage = te.compute(
+        oshape,
+        lambda b, x, y, c: ifm2[b, x, y, c].astype(tensor_type)
+        + ofm_offset_stage[b, x, y, c].astype(tensor_type),
+        name="ifm2.local",
+        tag="ifm2",
+    )
+    res = te.compute(
+        oshape,
+        lambda b, x, y, c: ifm1[b, x, y, c].astype(tensor_type)
+        + ifm2_stage[b, x, y, c].astype(tensor_type),
+        name="res",
+        tag="add",
+        attrs={
+            "ifm1_scale": ifm1_scale,
+            "ifm2_scale": ifm2_scale,
+        },
+    )
+
+    cfg.add_flop(
+        3 * np.prod(topi.utils.get_const_tuple(oshape))  # element additions needed
+        + 2 * np.prod(topi.utils.get_const_tuple(oshape))  # element multiplications needed (input scaling)
+    )
+
+    return res
+
+
+@autotvm.register_topi_schedule("contrib.gemmini.add")
+def schedule_add(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array
+) -> tvm.te.schedule.Schedule:
+    """Schedule definition for Gemmini's add operator
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        outs (tvm.ir.container.Array): Output tensors
+
+    Returns:
tvm.te.schedule.Schedule: transformed schedule + """ + + assert len(outs) == 1 + output = outs[0] + + add_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + ifm1, ifm2_op = add_stage.op.input_tensors + ifm2, ofm_offset_op = ifm2_op.op.input_tensors + ofm_offset = ofm_offset_op.op.input_tensors[0] + + b, x, y, c = sch[add_stage].op.axis + + # Prepare the scope of each buffer + cifm1 = sch.cache_read(ifm1, env.acc_scope, [add_stage]) + sch[ifm2_op].set_scope(env.acc_scope) + sch[ofm_offset_op].set_scope(env.acc_scope) + + # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) + x_factor = get_greater_div(int(sch[add_stage].op.axis[2].dom.extent)) + y_o, y_i = sch[add_stage].split(sch[add_stage].op.axis[3], factor=y_factor) + x_o, x_i = sch[add_stage].split(sch[add_stage].op.axis[2], factor=x_factor) + sch[add_stage].reorder(x_o, y_o, x_i, y_i) + + # Compute the stages in the correct position + sch[cifm1].compute_at(sch[add_stage], y_o) + sch[ifm2_op].compute_at(sch[add_stage], y_o) + sch[ofm_offset_op].compute_at(sch[add_stage], y_o) + + # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=env.DIM) + cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( + sch[cifm1].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + ) + sch[cifm1].reorder(cifm1_ax_0_1, cifm1_ax_1_1, cifm1_ax_0_2, cifm1_ax_1_2) + + cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=env.DIM) + cifm2_ax_1_1, cifm2_ax_1_2 = sch[ifm2_op].split( + sch[ifm2_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + ) + sch[ifm2_op].reorder(cifm2_ax_0_1, cifm2_ax_1_1, cifm2_ax_0_2, cifm2_ax_1_2) + + cofm_offset_ax_0_1, cofm_offset_ax_0_2 = sch[ofm_offset_op].split( + sch[ofm_offset_op].op.axis[2], factor=env.DIM + ) + cofm_offset_ax_1_1, cofm_offset_ax_1_2 = sch[ofm_offset_op].split( + sch[ofm_offset_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + ) + sch[ofm_offset_op].reorder( + cofm_offset_ax_0_1, cofm_offset_ax_1_1, cofm_offset_ax_0_2, cofm_offset_ax_1_2 + ) + + # Set pragmas to insert mvin instructions + oshape = (x_factor, y_factor) + if x_factor == 1: + sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin + "_t") + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum + "_t") + else: + sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin) + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum) + + # Tensorize + sch[ifm2_op].tensorize(cifm2_ax_0_2, env.add_tensorize(oshape)) + sch[add_stage].tensorize(x_i, env.add_mvout_tensorize(oshape)) + + # Create configuration dictionary + config_dict = {} + config_dict["A_size"] = int(ifm1.shape[3]) + config_dict["B_size"] = int(ifm2.shape[3]) + config_dict["C_size"] = int(output.shape[3]) + config_dict["A_private_stride"] = env.DIM + config_dict["B_private_stride"] = env.DIM + config_dict["execution_stride"] = 1 + config_dict["activation"] = 0 + config_dict["mode"] = env.WEIGHT_STATIONARY + config_dict["max_pixels_per_row"] = 1 + config_dict["ifm1_scale"] = float(add_stage.op.attrs["ifm1_scale"]) + config_dict["ifm2_scale"] = float(add_stage.op.attrs["ifm2_scale"]) + config_dict["scale"] = 1.0 + + # Set pragmas to configure the start and end of the Gemmini code + 
sch[output].pragma(sch[output].op.axis[0], "add_start")
+    sch[output].pragma(sch[output].op.axis[0], "configs", str(config_dict))
+    sch[output].pragma(sch[output].op.axis[0], "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py
new file mode 100644
index 000000000000..6d129a0e8b0f
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py
@@ -0,0 +1,244 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Conv2d operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.conv2d_cisc")
+def conv2d_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    orig_data: tvm.te.tensor.Tensor,
+    kernel: tvm.te.tensor.Tensor,
+    bias: tvm.te.tensor.Tensor,
+    strides: tvm.ir.container.Array,
+    padding: tvm.ir.container.Array,
+    ifm_offset: int,
+    activation: int,
+    gemmini_scale: float,
+    pool_size: tvm.ir.container.Array,
+    pool_strides: tvm.ir.container.Array,
+    pool_dilation: tvm.ir.container.Array,
+    pool_padding: tvm.ir.container.Array,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's conv2d operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        orig_data (tvm.te.tensor.Tensor): Input feature map
+        kernel (tvm.te.tensor.Tensor): Layer weights
+        bias (tvm.te.tensor.Tensor): Layer biases
+        strides (tvm.ir.container.Array): convolution strides
+        padding (tvm.ir.container.Array): input feature map padding
+        ifm_offset (int): input feature map offset (used for the padding of the input feature map)
+        activation (int): has activation?
+        gemmini_scale (float): output scaling factor
+        pool_size (tvm.ir.container.Array): size of the output pooling window
+        pool_strides (tvm.ir.container.Array): strides for the output pooling window
+        pool_dilation (tvm.ir.container.Array): dilation for the output pooling window (not used!)
+ pool_padding (tvm.ir.container.Array): padding for the output pooling window + + Returns: + tvm.te.tensor.Tensor: conv2d operator result + """ + assert len(orig_data.shape) == 4 + assert len(kernel.shape) == 4 + assert len(bias.shape) == 1 + assert ( + orig_data.shape[1] == orig_data.shape[2] + ), "GEMMINIs Conv2d CISC schedule only supports square inputs!" + + OC = kernel.shape[3] + KH = kernel.shape[0] + KW = kernel.shape[1] + + N = orig_data.shape[0] + IH = orig_data.shape[1] + IW = orig_data.shape[2] + IC = orig_data.shape[3] + + HSTR = strides[0] + WSTR = strides[1] + TOP_PAD = padding[0] + LEFT_PAD = padding[1] + BOTTOM_PAD = padding[2] + RIGHT_PAD = padding[3] + + OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) + OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + + ric = te.reduce_axis((0, IC), name="ric") + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + oshape = (N, OH, OW, OC) + + if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): + # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + data = orig_data + else: + # If not, then pad before calling Gemminis functions + data = topi.nn.pad( + orig_data, + [0, TOP_PAD, LEFT_PAD, 0], + [0, BOTTOM_PAD, RIGHT_PAD, 0], + pad_value=ifm_offset, + name="pad_data", + ) + + res = te.compute( + oshape, + lambda b_o, i, j, c_o: te.sum( + data[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + * kernel[rkh, rkw, ric, c_o].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw, ric], + ), + name="res", + tag="conv2d", + attrs={ + "activation": activation, + "strides": [HSTR, WSTR], + "padding": padding, + "padding_value": ifm_offset, + "scale": gemmini_scale, + "pool_size": pool_size, + "pool_strides": pool_strides, + "pool_dilation": pool_dilation, + "pool_padding": pool_padding, + }, + ) + + cfg.add_flop( + np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC + + np.prod(topi.utils.get_const_tuple(oshape)) + * (KH * KW * IC - 1) # Multiplications and additions needed + + np.prod( # Additions needed + topi.utils.get_const_tuple(oshape) + ) # Output scaling multiplications + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.conv2d_cisc") +def schedule_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + if op.tag == "conv2d": + conv2d_res.append(op) + else: + for tensor in op.input_tensors: + _traverse(tensor.op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + sch = te.create_schedule(output.op) + + data, kernel, 
bias = conv2d_stage.op.input_tensors
+
+    if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
+        temp = data.op.input_tensors[0]
+        pad_data = data
+        data = temp
+    else:
+        pad_data = data
+
+    orig_kernel = kernel
+
+    x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis
+    rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis
+
+    x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0])
+
+    axis_for_start = x_bo_o
+
+    # If topi.nn.pad was added, it's because the padding was not equal in all dimensions.
+    padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0]
+    padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0
+
+    # Apply tensorization
+    sch[conv2d_stage].tensorize(
+        x_bo_i,
+        env.conv2d_cisc(
+            pad_data.shape,
+            kernel.shape,
+            bias.shape,
+            conv2d_stage.shape,
+            conv2d_stage.op.attrs["strides"],
+            padding_for_C_code,
+            padding_value_for_C_code,
+            conv2d_stage.op.attrs["activation"],
+            conv2d_stage.op.attrs["scale"],
+            conv2d_stage.op.attrs["pool_size"],
+            conv2d_stage.op.attrs["pool_strides"],
+            conv2d_stage.op.attrs["pool_dilation"],
+            conv2d_stage.op.attrs["pool_padding"],
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini-related code
+    sch[conv2d_stage].pragma(axis_for_start, "conv2d_cisc_start")
+    sch[conv2d_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py
new file mode 100644
index 000000000000..03051f193638
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py
@@ -0,0 +1,377 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's intrinsic instructions +===================== +**Author**: `Federico Peccia `_ +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.build_module import lower +from tvm.contrib.gemmini.helpers import get_greater_div + +env = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.gemm") +def gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weight: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weight (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: dense operator result + """ + + # Derive shapes + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) + oshape = (data.shape[0], weight.shape[1]) + + # Reduction axes (input channel) + assert ishape[1] == wshape[0] + k_o = te.reduce_axis((0, wshape[0]), name="k_o") + + bias_stage = te.compute( + oshape, + lambda x_o, y_o: bias[y_o].astype(env.inp_dtype), + name="bias.local.accumulator", + tag="bias_add", + ) + + res = te.compute( + oshape, + lambda x_o, y_o: te.sum( + data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) + + bias_stage[x_o, y_o].astype(env.inp_dtype), + axis=[k_o], + ), + name="res", + tag="dense", + attrs={"scale": scale}, + ) + + cfg.add_flop( + (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1]) # element multiplications + + np.prod(topi.utils.get_const_tuple(oshape)) # bias additions + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.gemm") +def schedule_gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + + assert len(outs) == 1 + output = outs[0] + + dense_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + data, weight, bias_op = dense_stage.op.input_tensors + bias = bias_op.op.input_tensors[0] + + ##### space definition begin ##### + x, y = sch[dense_stage].op.axis + (z,) = sch[dense_stage].op.reduce_axis + + # TODO (FP): add limits for scratchpad and accumulator sizes perhaps? 
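+    # One possible approach (a sketch, assuming the input scratchpad holds
+    # env.INP_SCR_ROWS rows of env.DIM elements each, as the asserts further down
+    # also assume): prune oversized split candidates up front instead of asserting
+    # after the fact, e.g. with a hypothetical filter such as
+    #
+    #     def fits_in_scratchpad(ax):
+    #         # rows occupied by one data tile if it is moved in as a whole
+    #         return ax.size[1] * ax.size[2] * int(data.shape[1]) <= env.INP_SCR_ROWS * env.DIM
+    #
+    # passed as an additional filter to the cfg.define_split calls below.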
+ cfg.define_split( + "tile_xo", + x, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(data.shape[0])) + if (data.shape[0] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + cfg.define_split( + "tile_yo", + y, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[1])) + if (weight.shape[1] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + cfg.define_split( + "tile_zo", + z, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[0])) + if (weight.shape[0] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + # accumulate_multiple_patches knob + # 2: only one patch is computed in the accumulator + # 1: More than one patch is computed in the accumulator, depends on tile_yo + # 0: More than one patch is computed in the accumulator, depends on tile_yo AND tile_xo + cfg.define_knob("accumulate_multiple_patches", [0, 1, 2]) + # exchange axis + # exchange the order of axis x and y + cfg.define_knob("exchange_axis", [False, True]) + # WS/OS + # 0: Gemmini will be configured as output stationary + # 1: Gemmini will be configured as weight stationary + cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + # mvout_big_block + # False: generate mvout instructions moving as maximum DIM columns + # True: generate mvout instructions moving more than DIM columns + cfg.define_knob("mvout_big_block", [True, False]) + if cfg.is_fallback: + # Load default split values + cfg["tile_xo"] = SplitEntity([-1, 8, get_greater_div(int(data.shape[0]))]) + cfg["tile_yo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[1]))]) + cfg["tile_zo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[0]))]) + cfg["accumulate_multiple_patches"] = OtherOptionEntity(0) + cfg["exchange_axis"] = OtherOptionEntity(False) + cfg["mvout_big_block"] = OtherOptionEntity(True) + cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + + ###### space definition end ###### + + cdata = sch.cache_read(data, env.scr_scope, [dense_stage]) + cweight = sch.cache_read(weight, env.scr_wgt_scope, [dense_stage]) + dense_stage_acc = sch.cache_write(output, env.acc_scope) + sch[bias_op].set_scope(env.acc_scope) + (x_, y_) = sch[dense_stage_acc].op.axis + (z_,) = sch[dense_stage_acc].op.reduce_axis + + # Split loops to generate the inner dimensions specified by knobs tile_xo and tile_yo + b_y, yo, yi = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) + b_x, xo, xi = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[output].reorder(b_y, b_x, yo, xo, yi, xi) + else: + sch[output].reorder(b_x, b_y, xo, yo, xi, yi) + + # Apply the accumulate_multiple_patches knob + if cfg["accumulate_multiple_patches"].val == 0: + axis_for_output = b_x if cfg["exchange_axis"].val else b_y + elif cfg["accumulate_multiple_patches"].val == 1: + axis_for_output = yo if cfg["exchange_axis"].val else xo + else: + axis_for_output = xo if cfg["exchange_axis"].val else yo + + axis_gemm_start = b_y if cfg["exchange_axis"].val else b_x + + # Move the dense_stage_acc stage to the correct axis of the output stage + sch[dense_stage_acc].compute_at(sch[output], axis_for_output) + + # # Split loops to generate the inner dimensions specified by knob tile_zo + xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=env.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=env.DIM) + b_z, zo_o, zi_o = 
cfg["tile_zo"].apply(sch, dense_stage_acc, z_) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[dense_stage_acc].reorder(b_z, xo_o, yo_o, zo_o, xi_o, yi_o, zi_o) + else: + sch[dense_stage_acc].reorder(b_z, yo_o, xo_o, zo_o, yi_o, xi_o, zi_o) + + # Generate knobs to move the copy of data across different loops + axis_to_input_data = [b_x, b_z, xo_o, zo_o] + axis_to_input_weights = [b_y, b_z, yo_o, zo_o] + stages_to_input_data = [output, dense_stage_acc, dense_stage_acc, dense_stage_acc] + cfg.define_knob("axis_for_cdata", [0, 1, 2, 3]) + cfg.define_knob("axis_for_cweight", [0, 1, 2, 3]) + if cfg.is_fallback: + cfg["axis_for_cdata"] = OtherOptionEntity(0) + cfg["axis_for_cweight"] = OtherOptionEntity(0) + + # Compute the move of the bias in the correct loop + sch[bias_op].compute_at(sch[output], axis_for_output) + + # We assert here that the mvin of data does not use more space than the available one in the scratchpad + if cfg["axis_for_cdata"].val == 0: + assert ( + cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] + <= env.INP_SCR_ROWS * env.DIM + ), "Data matrix will not fit in scratchpad!" + elif cfg["axis_for_cdata"].val == 1: + assert ( + cfg["tile_xo"].size[2] * data.shape[1] <= env.INP_SCR_ROWS * env.DIM + ), "Data matrix will not fit in scratchpad!" + if cfg["axis_for_cweight"].val == 0: + assert ( + cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] * weight.shape[0] + <= env.WGT_SCR_ROWS * env.DIM + ), "Weight matrix will not fit in scratchpad!" + elif cfg["axis_for_cweight"].val == 1: + assert ( + cfg["tile_yo"].size[2] * weight.shape[0] <= env.WGT_SCR_ROWS * env.DIM + ), "Weight matrix will not fit in scratchpad!" + + # And here we assert that there is enough place available in the accumulator + if cfg["accumulate_multiple_patches"].val == 0: + assert ( + cfg["tile_xo"].size[1] + * cfg["tile_xo"].size[2] + * cfg["tile_yo"].size[1] + * cfg["tile_yo"].size[2] + <= env.ACC_ROWS * env.DIM + ), "Result matrix will not fit in accumulator!" + elif cfg["accumulate_multiple_patches"].val == 1: + assert ( + cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] + <= env.ACC_ROWS * env.DIM + ), "Result matrix will not fit in accumulator!" 
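+    # Worked example of the accumulator check above (a sketch, assuming the default
+    # configuration used in the tutorials, DIM=16 and ACC_ROWS=1024): with the
+    # fallback splits tile_xo = tile_yo = [-1, 8, 16], one full output patch holds
+    # (8 * 16) * (8 * 16) = 16384 elements, and 16384 <= ACC_ROWS * DIM = 16384,
+    # so the fallback configuration exactly fills the accumulator.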
+ + # Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs + axis_for_cdata = axis_to_input_data[cfg["axis_for_cdata"].val] + axis_for_cweight = axis_to_input_weights[cfg["axis_for_cweight"].val] + sch[cdata].compute_at(sch[stages_to_input_data[cfg["axis_for_cdata"].val]], axis_for_cdata) + sch[cweight].compute_at( + sch[stages_to_input_data[cfg["axis_for_cweight"].val]], axis_for_cweight + ) + + # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN + cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=env.DIM) + cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( + sch[cdata].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + ) + sch[cdata].reorder(cdata_ax_0_1, cdata_ax_1_1, cdata_ax_0_2, cdata_ax_1_2) + + cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=env.DIM) + cweight_ax_1_1, cweight_ax_1_2 = sch[cweight].split( + sch[cweight].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + ) + sch[cweight].reorder(cweight_ax_0_1, cweight_ax_1_1, cweight_ax_0_2, cweight_ax_1_2) + + cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=env.DIM) + cbias_ax_1_1, cbias_ax_1_2 = sch[bias_op].split( + sch[bias_op].op.axis[1], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + ) + sch[bias_op].reorder(cbias_ax_0_1, cbias_ax_1_1, cbias_ax_0_2, cbias_ax_1_2) + + # Mvout preparation + if cfg["exchange_axis"].val: + sch[output].reorder(yo, yi, xo, xi) + else: + sch[output].reorder(xo, xi, yo, yi) + if cfg["accumulate_multiple_patches"].val == 0: + fused_x = sch[output].fuse(xo, xi) + fused_y = sch[output].fuse(yo, yi) + elif cfg["accumulate_multiple_patches"].val == 1: + if cfg["exchange_axis"].val: + fused_x = sch[output].fuse(xo, xi) + fused_y = yi + else: + fused_x = xi + fused_y = sch[output].fuse(yo, yi) + else: + fused_x = xi + fused_y = yi + + fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=env.DIM) + fused_y_1, fused_y_2 = sch[output].split( + fused_y, factor=env.MAX_BLOCK_LEN * env.DIM if cfg["mvout_big_block"].val else env.DIM + ) + sch[output].reorder(fused_x_1, fused_y_1, fused_x_2, fused_y_2) + + # Tag loops with pragmas, in order to insert the move in and move out instructions + sch[cweight].pragma(cweight_ax_0_2, env.B_mvin) + if data.shape[0] == 1 and weight.shape[1] > 1: + sch[cdata].pragma(cdata_ax_0_2, env.A_mvin + "_t") + sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin + "_t") + sch[output].pragma(fused_x_2, env.C_mvout + "_t") + else: + sch[cdata].pragma(cdata_ax_0_2, env.A_mvin) + sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin) + sch[output].pragma(fused_x_2, env.C_mvout) + + # Apply tensorize + I = data.shape[0] if data.shape[0] < env.DIM else cfg["tile_xo"].size[-1] + K = weight.shape[0] if weight.shape[0] < env.DIM else cfg["tile_zo"].size[-1] + J = weight.shape[1] if weight.shape[1] < env.DIM else cfg["tile_yo"].size[-1] + + sch[dense_stage_acc].tensorize( + xi_o if cfg["exchange_axis"].val else yi_o, + env.gemm( + I, + K, + J, + mode=cfg["WS/OS"].val, + accum_patch=tvm.tir.IntImm("uint8", 0) + if cfg["exchange_axis"].val or cfg["tile_zo"].size[1] != 1 + else xo_o.var, + ), + ) + + # Generate configuration dictionary, in order to correctly generate the calls to the configuration instructions + config_dict = {} + config_dict["A_size"] = int(data.shape[1]) + config_dict["B_size"] = int(weight.shape[1]) + config_dict["C_size"] = int(output.shape[1]) + config_dict["A_private_stride"] = env.DIM + 
config_dict["B_private_stride"] = env.DIM
+    config_dict["execution_stride"] = 1
+    config_dict["activation"] = 0
+    config_dict["mode"] = cfg["WS/OS"].val
+    config_dict["max_pixels_per_row"] = 1
+    config_dict["scale"] = float(dense_stage.op.attrs["scale"])
+    config_dict["padding_value"] = 0
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini-related code
+    sch[output].pragma(axis_gemm_start, "gemm_start")
+    sch[output].pragma(axis_gemm_start, "configs", str(config_dict))
+    sch[output].pragma(axis_gemm_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py
new file mode 100644
index 000000000000..0144563940f9
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Dense (GEMM) operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import math
+import sys
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.contrib.gemmini.intrin import gemm_cisc
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.gemm_cisc")
+def gemm_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    data: tvm.te.tensor.Tensor,
+    weight: tvm.te.tensor.Tensor,
+    bias: tvm.te.tensor.Tensor,
+    scale: float,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's dense operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        data (tvm.te.tensor.Tensor): Input feature map
+        weight (tvm.te.tensor.Tensor): Layer weights
+        bias (tvm.te.tensor.Tensor): Layer biases
+        scale (float): output scaling factor
+
+    Returns:
+        tvm.te.tensor.Tensor: dense operator result
+    """
+
+    # Derive shapes
+    ishape = topi.utils.get_const_tuple(data.shape)
+    wshape = topi.utils.get_const_tuple(weight.shape)
+    oshape = (data.shape[0], weight.shape[1])
+
+    # Reduction axes (input channel)
+    assert ishape[1] == wshape[0]
+    k_o = te.reduce_axis((0, wshape[0]), name="k_o")
+
+    res = te.compute(
+        oshape,
+        lambda x_o, y_o: te.sum(
+            data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype)
+            + bias[y_o].astype(env.inp_dtype),
+            axis=[k_o],
+        ),
+        name="res",
+        tag="dense",
+        attrs={"scale": scale},
+    )
+
+    cfg.add_flop(
+        (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1])  # element multiplications
+        + np.prod(topi.utils.get_const_tuple(oshape))  # bias additions
+    )
+
+    return res
+
+
+@autotvm.register_topi_schedule("contrib.gemmini.gemm_cisc")
+def schedule_gemm_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array
+) -> tvm.te.schedule.Schedule:
+    """Schedule definition for Gemmini's dense operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        outs (tvm.ir.container.Array): Output tensors
+
+    Returns:
+        tvm.te.schedule.Schedule: transformed schedule
+    """
+    assert len(outs) == 1
+    output = outs[0]
+
+    dense_stage = output.op.output(0)
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    sch = te.create_schedule([x.op for x in outs])
+
+    data, weight, bias = dense_stage.op.input_tensors
+
+    # WS/OS
+    # 0: Gemmini will be configured as output stationary
+    # 1: Gemmini will be configured as weight stationary
+    cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY])
+    if cfg.is_fallback:
+        cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY)
+
+    x_, y_ = sch[dense_stage].op.axis
+
+    x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0])
+
+    axis_for_start = x_o
+
+    # Apply tensorization
+    sch[dense_stage].tensorize(
+        x_i,
+        env.gemm_cisc(
+            data.shape, weight.shape, bias.shape, dense_stage.op.attrs["scale"], cfg["WS/OS"].val
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini-related code
+    sch[dense_stage].pragma(axis_for_start, "gemm_cisc_start")
+    sch[dense_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py
new file mode 100644
index 000000000000..c67767f783c2
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py
@@ -0,0 +1,227 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument +""" +Depthwise conv2d operator declaration and schedule registration for Gemmini's CISC instructions +===================== +**Author**: `Federico Peccia `_ +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.build_module import lower +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity +from tvm.contrib.gemmini.helpers import get_greater_div + +env = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.depthwiseconv2d_cisc") +def depthwise_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + orig_data: tvm.te.tensor.Tensor, + orig_kernel: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + strides: tvm.ir.container.Array, + padding: tvm.ir.container.Array, + ifm_offset: int, + activation: int, + gemmini_scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's depthwise conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + orig_data (tvm.te.tensor.Tensor): Input feature map + orig_kernel (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + strides (tvm.ir.container.Array): convolution strides + padding (tvm.ir.container.Array): input feature map padding + ifm_offset (int): input feature map offset (used for the padding of the input feature map) + activation (int): has activation? + gemmini_scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: depthwise conv2d operator result + """ + + assert len(orig_data.shape) == 4 + assert len(orig_kernel.shape) == 3 + assert len(bias.shape) == 1 + assert ( + orig_data.shape[1] == orig_data.shape[2] + ), "GEMMINIs depthwise conv2d CISC schedule only supports square inputs!" 
+ + OC = orig_kernel.shape[0] + KH = orig_kernel.shape[1] + KW = orig_kernel.shape[2] + + kernel = orig_kernel + + N = orig_data.shape[0] + IH = orig_data.shape[1] + IW = orig_data.shape[2] + IC = orig_data.shape[3] + + HSTR = strides[0] + WSTR = strides[1] + TOP_PAD = padding[0] + LEFT_PAD = padding[1] + BOTTOM_PAD = padding[2] + RIGHT_PAD = padding[3] + + OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) + OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + + if len(set(padding)) == 1 and env.supports_non_zero_padding: + # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + data = orig_data + else: + # If not, then pad before calling Gemminis functions + data = topi.nn.pad( + orig_data, + [0, TOP_PAD, LEFT_PAD, 0], + [0, BOTTOM_PAD, RIGHT_PAD, 0], + pad_value=ifm_offset, + name="pad_data", + ) + + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + oshape = (N, OH, OW, OC) + + res = te.compute( + oshape, + lambda b_o, i, j, c_o: te.sum( + data[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) + * kernel[c_o, rkh, rkw].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw], + ), + name="res", + tag="conv2d", + attrs={ + "activation": activation, + "strides": [HSTR, WSTR], + "padding": padding, + "padding_value": ifm_offset, + "scale": gemmini_scale, + }, + ) + + cfg.add_flop( + np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW + + np.prod(topi.utils.get_const_tuple(oshape)) + * (KH * KW - 1) # Multiplications and additions needed + + np.prod(topi.utils.get_const_tuple(oshape)) # Output scaling factor multiplications + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.depthwiseconv2d_cisc") +def schedule_depthwise_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's depthwise conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + if op.tag == "conv2d": + conv2d_res.append(op) + else: + for tensor in op.input_tensors: + _traverse(tensor.op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + sch = te.create_schedule(output.op) + + data, kernel, bias = conv2d_stage.op.input_tensors + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: + temp = data.op.input_tensors[0] + pad_data = data + data = temp + else: + pad_data = data + + orig_kernel = kernel + + x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis + rkh, rkw = sch[conv2d_stage].op.reduce_axis + + x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) + + axis_for_start = x_bo_o + + # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
+    padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0]
+    padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0
+
+    # Apply tensorization
+    sch[conv2d_stage].tensorize(
+        x_bo_i,
+        env.dw_conv2d_cisc(
+            pad_data.shape,
+            kernel.shape,
+            bias.shape,
+            conv2d_stage.shape,
+            conv2d_stage.op.attrs["strides"],
+            padding_for_C_code,
+            padding_value_for_C_code,
+            conv2d_stage.op.attrs["activation"],
+            conv2d_stage.op.attrs["scale"],
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini-related code
+    sch[conv2d_stage].pragma(axis_for_start, "dw_conv2d_cisc_start")
+    sch[conv2d_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py
new file mode 100644
index 000000000000..7d922ddd2db4
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+MaxPool2D operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.max_pool2d")
+def max_pool2d(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    data: tvm.te.tensor.Tensor,
+    weights: tvm.te.tensor.Tensor,
+    pool_size: tvm.ir.container.Array,
+    pool_strides: tvm.ir.container.Array,
+    pool_dilation: tvm.ir.container.Array,
+    pool_padding: tvm.ir.container.Array,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition to run a max pooling layer on Gemmini.
+
+    Uses a trick: we call a depthwise convolution with an output max pooling, but all
+    weights are 1. The depthwise convolution is therefore an identity, and the Gemmini
+    accelerator internally applies the max pooling.
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        data (tvm.te.tensor.Tensor): Input feature map
+        weights (tvm.te.tensor.Tensor): Weights...
just all ones, needed by the called function + pool_size (tvm.ir.container.Array): Pooling window size + pool_strides (tvm.ir.container.Array): Pooling window strides + pool_dilation (tvm.ir.container.Array): Pooling window dilation (not used for now) + pool_padding (tvm.ir.container.Array): Pooling window padding + + Returns: + tvm.te.tensor.Tensor: max pool2d operator result + """ + + assert len(data.shape) == 4 + + def irb_builder_func(ins, outs): + irb = tvm.tir.ir_builder.create() + + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + + return irb.get() + + res = te.extern( + (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" + ) + + # TODO (FP): add correct FLOPS + # cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.max_pool2d") +def schedule_max_pool2d( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's max pool2d operator + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + sch = te.create_schedule(output.op) + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py new file mode 100644 index 000000000000..6ca41c66d139 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -0,0 +1,286 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=unused-argument, ungrouped-imports +""" +Namespace for the supported Relay operators on Gemmini +===================== +**Author**: `Federico Peccia `_ +""" + +from __future__ import absolute_import as _abs + +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.relay.op import op as reg +from tvm.relay.op import strategy as _strategy +from tvm.relay.op.op import OpPattern, OpStrategy + +from .gemmini_dense import gemm, schedule_gemm +from .gemmini_dense_cisc import gemm_cisc, schedule_gemm_cisc +from .gemmini_conv2d_cisc import conv2d_cisc, schedule_conv2d_cisc +from .gemmini_depthwise_conv2d_cisc import depthwise_conv2d_cisc, schedule_depthwise_conv2d_cisc +from .gemmini_add import add, schedule_add +from .gemmini_max_pool2d import max_pool2d, schedule_max_pool2d +from tvm.contrib.gemmini.environment import Environment + +from tvm.topi.utils import const_vector, get_const_int, get_const_float +import numpy as np + +ENV = Environment.instance() + + +def wrap_max_pool2d_topi_compute(topi_compute): + """Wrapper for the max pool2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.max_pool2d", "FTVMStrategy") +def max_pool2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's max_pool2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 2: + strategy = OpStrategy() + strategy.add_implementation( + wrap_max_pool2d_topi_compute(max_pool2d), + _strategy.wrap_topi_schedule(schedule_max_pool2d), + name="contrib.gemmini.max_pool2d", + plevel=10, + ) + return strategy + return None + + +def wrap_add_topi_compute(topi_compute): + """Wrapper for the add compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + ifm1_scale = float(attrs.ifm1_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ifm2_scale = float(attrs.ifm2_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [topi_compute(*inputs, ifm1_scale, ifm2_scale)] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.add", "FTVMStrategy") +def add_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's add operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_add_topi_compute(add), + _strategy.wrap_topi_schedule(schedule_add), + name="contrib.gemmini.add", + plevel=10, + ) + return strategy + return None + + +def wrap_gemm_topi_compute(topi_compute): + """Wrapper for the GEMM compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ) 
+ ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.gemm", "FTVMStrategy") +def gemm_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's GEMM operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_gemm_topi_compute(gemm), + _strategy.wrap_topi_schedule(schedule_gemm), + name="contrib.gemmini.gemm", + plevel=9, + ) + strategy.add_implementation( + wrap_gemm_topi_compute(gemm_cisc), + _strategy.wrap_topi_schedule(schedule_gemm_cisc), + name="contrib.gemmini.gemm_cisc", + plevel=10, # Higher -> used over the other one, unless AutoTVM says the other is better + ) + return strategy + return None + + +def wrap_conv2d_topi_compute(topi_compute): + """Wrapper for the conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + if attrs.has_activation: + gemmini_scale = float( + attrs.activation_scale_in.data.numpy() / attrs.activation_scale_out.data.numpy() + ) * float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + else: + gemmini_scale = float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + gemmini_scale, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.conv2d", "FTVMStrategy") +def conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's conv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_conv2d_topi_compute(conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_conv2d_cisc), + name="contrib.gemmini.conv2d_cisc", + plevel=10, + ) + return strategy + return None + + +def wrap_depthwise_conv2d_topi_compute(topi_compute): + """Wrapper for the depthwise conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()), + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.depthwiseconv2d", "FTVMStrategy") +def depthwise_conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's depthwiseconv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = 
OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_depthwise_conv2d_topi_compute(depthwise_conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_depthwise_conv2d_cisc), + name="contrib.gemmini.depthwiseconv2d_cisc", + plevel=10, + ) + return strategy + return None diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 9b0e5748bcc0..c0aa371b4d3d 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -214,6 +214,17 @@ def InjectRollingBuffer(): return _ffi_api.InjectRollingBuffer() # type: ignore +def CorrectGemminisScratchpadAndAccumulatorPointers(): + """Corrects the pointer addresses of buffers inside Gemmini's scratchpad and accumulator + + Returns: + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() + + def StorageRewrite(): """Rewrite storage allocation pattern. diff --git a/src/relay/op/contrib/gemmini/add.cc b/src/relay/op/contrib/gemmini/add.cc new file mode 100644 index 000000000000..b27ad4717d14 --- /dev/null +++ b/src/relay/op/contrib/gemmini/add.cc @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/add.cc + * \brief Add operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/op/op_common.h" +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini Add operators */
+struct GemminiAddAttrs : public tvm::AttrsNode<GemminiAddAttrs> {
+  Expr ifm1_scale;
+  Expr ifm1_offset;
+  Expr ifm2_scale;
+  Expr ifm2_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+  Array<IndexExpr> shape;
+
+  TVM_DECLARE_ATTRS(GemminiAddAttrs, "relay.attrs.GemminiAddAttrs") {
+    TVM_ATTR_FIELD(ifm1_scale).describe("Input feature map 1 quantization scale");
+    TVM_ATTR_FIELD(ifm1_offset).describe("Input feature map 1 quantization offset");
+    TVM_ATTR_FIELD(ifm2_scale).describe("Input feature map 2 quantization scale");
+    TVM_ATTR_FIELD(ifm2_offset).describe("Input feature map 2 quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output feature map quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output feature map quantization offset");
+    TVM_ATTR_FIELD(shape).describe("Output shape");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiAddAttrs);
+
+bool GemminiAddRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  const int ifm1_index = 0;
+  const int ifm2_index = 1;
+  const int result_index = 3;
+  ICHECK_EQ(types.size(), result_index + 1);
+
+  const auto* ifm1 = types[ifm1_index].as<TensorTypeNode>();
+  const auto* ifm2 = types[ifm2_index].as<TensorTypeNode>();
+  ICHECK(ifm1 != nullptr) << "ifm1 cannot be nullptr.";
+  ICHECK(ifm2 != nullptr) << "ifm2 cannot be nullptr.";
+
+  const auto* param = attrs.as<GemminiAddAttrs>();
+  ICHECK(param != nullptr) << "GemminiAddAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  Array<IndexExpr> ofm_shape({ifm1->shape[0], ifm2->shape[1], ifm2->shape[2], ifm2->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiAdd(Expr ifm1, Expr ifm2, Expr ifm1_scale, Expr ifm1_offset, Expr ifm2_scale,
+                    Expr ifm2_offset, Expr ofm_scale, Expr ofm_offset, Array<IndexExpr> shape) {
+  auto attrs = make_object<GemminiAddAttrs>();
+  attrs->ifm1_scale = std::move(ifm1_scale);
+  attrs->ifm1_offset = std::move(ifm1_offset);
+  attrs->ifm2_scale = std::move(ifm2_scale);
+  attrs->ifm2_offset = std::move(ifm2_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+  attrs->shape = std::move(shape);
+
+  static const Op& op = Op::Get("contrib.gemmini.add");
+
+  auto requantized_ifm1 = ifm1;
+
+  auto requantized_ifm2 = ifm2;
+
+  auto ofm_offset_tensor = Full(attrs->ofm_offset, attrs->shape, DataType::Float(32));
+  auto ifm1_offset_tensor = Multiply(Divide(attrs->ifm1_scale, attrs->ofm_scale),
+                                     Cast(attrs->ifm1_offset, DataType::Float(32)));
+  auto ifm2_offset_tensor = Multiply(Divide(attrs->ifm2_scale, attrs->ofm_scale),
+                                     Cast(attrs->ifm2_offset, DataType::Float(32)));
+  ofm_offset_tensor = Subtract(Subtract(ofm_offset_tensor, ifm1_offset_tensor), ifm2_offset_tensor);
+
+  auto final_offset_tensor = tvm::relay::qnn::RequantizeOrUpcast(
+      ofm_offset_tensor, MakeConstantScalar(DataType::Float(32), 1),
+      MakeConstantScalar(DataType::Float(32), 0), MakeConstantScalar(DataType::Float(32), 1),
+      MakeConstantScalar(DataType::Float(32), 0), attrs->shape, -1);
+
+  auto add_output =
+      Call(op, {requantized_ifm1, requantized_ifm2, final_offset_tensor}, Attrs(attrs), {});
+  return add_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_add").set_body_typed(MakeGemminiAdd);
+
+RELAY_REGISTER_OP("contrib.gemmini.add")
+    .describe("Gemmini Add operator.")
+    .set_attrs_type<GemminiAddAttrs>()
+    .set_num_inputs(3)
+    .add_argument("ifm1", "Tensor", "The Input 1 Feature Map tensor.")
+    .add_argument("ifm2", "Tensor", "The Input 2 Feature Map tensor.")
+
.add_argument("ofm_offset_tensor", "Tensor", "The output offset tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiAdd", GemminiAddRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/convolution.cc b/src/relay/op/contrib/gemmini/convolution.cc new file mode 100644 index 000000000000..1ac0a3ad0df5 --- /dev/null +++ b/src/relay/op/contrib/gemmini/convolution.cc @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/convolution.cc + * \brief 2D convolution operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" +//#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! \brief Attributes used by the Gemmini 2D convolution operator */ +struct GemminiConv2dAttrs : public tvm::AttrsNode { + Array strides; + Array padding; + double ifm_scale; + Expr ifm_offset; + double weights_scale; + double weights_offset; + Expr bias_scale; + Expr bias_offset; + Expr ofm_scale; + Expr ofm_offset; + bool activation; + bool has_pool; + Array pool_size; + Array pool_strides; + Array pool_dilation; + Array pool_padding; + Expr input_req_offset_out; + Expr activation_scale_in; + Expr activation_offset_in; + Expr activation_scale_out; + Expr activation_offset_out; + bool has_activation; + + TVM_DECLARE_ATTRS(GemminiConv2dAttrs, "relay.attrs.GemminiConv2dAttrs") { + TVM_ATTR_FIELD(strides) + .set_default(Array({1, 1})) + .describe("The 2 dimensional strides as (stride_height, stride_width)."); + TVM_ATTR_FIELD(padding) + .set_default(Array({0, 0, 0, 0})) + .describe("The 4 dimensional padding."); + TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale"); + TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset"); + TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale"); + TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset"); + TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale"); + TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset"); + TVM_ATTR_FIELD(activation) + .set_default(false) + .describe("If it has a ReLu activation (True) or not (False)"); + TVM_ATTR_FIELD(has_pool).set_default(false).describe( + "If it has a pool layer (True) or not (False)"); + TVM_ATTR_FIELD(pool_size).describe("Pooling window size"); + 
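+    // The pool_* fields are only meaningful when has_pool is true, i.e. when a max
+    // pooling layer was fused into this convolution (see GemminiConv2dRel below).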
TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides"); + TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation"); + TVM_ATTR_FIELD(pool_padding).describe("Pooling padding"); + TVM_ATTR_FIELD(input_req_offset_out).describe("Requantization output offset"); + TVM_ATTR_FIELD(activation_scale_in).describe("Activation input scaling factor"); + TVM_ATTR_FIELD(activation_offset_in).describe("Activation input offset"); + TVM_ATTR_FIELD(activation_scale_out).describe("Activation output scaling factor"); + TVM_ATTR_FIELD(activation_offset_out).describe("Activation output offset"); + TVM_ATTR_FIELD(has_activation).describe("Has activation?"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiConv2dAttrs); + +bool GemminiConv2dRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int data_index = 0; + const int weights_index = 1; + const int bias_index = 2; + const int result_index = 3; + + const auto* data = types[data_index].as(); + const auto* weights = types[weights_index].as(); + const auto* bias = types[bias_index].as(); + if (data == nullptr) return false; + if (weights == nullptr) return false; + if (bias == nullptr) return false; + + const auto* params = attrs.as(); + ICHECK(params != nullptr) << "GemminiConv2dAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + PrimExpr conv2d_output_h = + ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[0]) / + params->strides[0]) + + 1; + PrimExpr conv2d_output_w = + ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[1]) / + params->strides[1]) + + 1; + PrimExpr max_pool2d_h = conv2d_output_h; + PrimExpr max_pool2d_w = conv2d_output_w; + if (params->has_pool) { + max_pool2d_h = ((conv2d_output_h + (params->pool_padding[0] + params->pool_padding[2]) - + params->pool_size[0]) / + params->pool_strides[0]) + + 1; + max_pool2d_w = ((conv2d_output_w + (params->pool_padding[1] + params->pool_padding[3]) - + params->pool_size[1]) / + params->pool_strides[1]) + + 1; + } + Array ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, weights->shape[3]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiConv2d(Expr data, Expr weights, Expr bias, Array strides, + Array padding, double ifm_scale, Expr ifm_offset, + double weights_scale, double weights_offset, Expr bias_scale, + Expr bias_offset, Expr ofm_scale, Expr ofm_offset, bool activation, + bool has_pool, Array pool_size, Array pool_strides, + Array pool_dilation, Array pool_padding, + Expr input_req_offset_out, bool has_activation, Expr activation_scale_in, + Expr activation_offset_in, Expr activation_scale_out, + Expr activation_offset_out) { + auto attrs = make_object(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->activation = std::move(activation); + attrs->ifm_scale = std::move(ifm_scale); + attrs->ifm_offset = std::move(ifm_offset); + attrs->weights_scale = std::move(weights_scale); + attrs->weights_offset = std::move(weights_offset); + attrs->bias_scale = std::move(bias_scale); + attrs->bias_offset = std::move(bias_offset); + attrs->ofm_scale = std::move(ofm_scale); + attrs->ofm_offset = std::move(ofm_offset); + attrs->has_pool = std::move(has_pool); + attrs->pool_size = std::move(pool_size); + attrs->pool_strides = std::move(pool_strides); + attrs->pool_dilation = std::move(pool_dilation); + attrs->pool_padding = std::move(pool_padding); + 
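+  // A sketch of the bias folding performed below: with s = bias_scale / ofm_scale,
+  // the convolution bias is rewritten so that Gemmini only has to apply the single
+  // output scale s:
+  //
+  //   new_bias = bias - ifm_offset           * sum(weights, axes {0, 1, 2})
+  //                   + input_req_offset_out * sum(weights, axes {0, 1, 2})
+  //                   + ofm_offset / s
+  //
+  // and, when a requantizing activation follows (has_activation), additionally
+  //
+  //   new_bias += (activation_offset_out / (activation_scale_in / activation_scale_out)
+  //                - activation_offset_in) / s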
attrs->input_req_offset_out = std::move(input_req_offset_out);
+  attrs->activation_scale_in = std::move(activation_scale_in);
+  attrs->activation_offset_in = std::move(activation_offset_in);
+  attrs->activation_scale_out = std::move(activation_scale_out);
+  attrs->activation_offset_out = std::move(activation_offset_out);
+  attrs->has_activation = std::move(has_activation);
+
+  static const Op& op = Op::Get("contrib.gemmini.conv2d");
+
+  auto zero_const = MakeConstantScalar(DataType::Int(32), 0);
+  auto one_const = MakeConstantScalar(DataType::Int(32), 1);
+
+  auto new_bias = bias;
+  // Bias change
+  // Term 3
+  auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {0, 1, 2}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+  auto input_req_bias_term = Multiply(attrs->input_req_offset_out, reduced_t3);
+
+  new_bias = Add(Subtract(bias, term3), input_req_bias_term);
+  auto scale_1 = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale_1);
+  new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+
+  if (attrs->has_activation) {
+    auto scale_2 = Divide(attrs->activation_scale_in, attrs->activation_scale_out);
+    auto term_1 = Cast(attrs->activation_offset_in, DataType::Float(32));
+    auto term_2 = Divide(Cast(attrs->activation_offset_out, DataType::Float(32)), scale_2);
+    auto bias_fix = Divide(Subtract(term_2, term_1), scale_1);
+    new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+  }
+
+  auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {});
+  return conv2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_conv2d").set_body_typed(MakeGemminiConv2d);
+
+RELAY_REGISTER_OP("contrib.gemmini.conv2d")
+    .describe("Gemmini 2D convolution operator")
+    .set_attrs_type<GemminiConv2dAttrs>()
+    .set_num_inputs(3)
+    .add_argument("data", "Tensor", "The Input Feature Map tensor.")
+    .add_argument("weights", "Tensor", "The Weights tensor.")
+    .add_argument("bias", "Tensor", "The bias tensor.")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiConv2d", GemminiConv2dRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/depthwise_convolution.cc b/src/relay/op/contrib/gemmini/depthwise_convolution.cc
new file mode 100644
index 000000000000..d9cb264fb514
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/depthwise_convolution.cc
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/depthwise_convolution.cc
+ * \brief 2D depthwise convolution operator definition for Gemmini.
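+ *
+ * As with the GEMM and conv2d ops, MakeGemminiDepthwiseConv2d below folds the input
+ * offset and the output offset into the bias, so that at runtime only the scale
+ * bias_scale / ofm_scale has to be applied.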
+ * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" +//#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! \brief Attributes used by the Gemmini 2D depthwise convolution operator */ +struct GemminiDepthwiseConv2dAttrs : public tvm::AttrsNode { + Array strides; + Array padding; + double ifm_scale; + Expr ifm_offset; + double weights_scale; + double weights_offset; + Expr bias_scale; + Expr bias_offset; + Expr ofm_scale; + Expr ofm_offset; + bool activation; + + TVM_DECLARE_ATTRS(GemminiDepthwiseConv2dAttrs, "relay.attrs.GemminiDepthwiseConv2dAttrs") { + TVM_ATTR_FIELD(strides) + .set_default(Array({1, 1})) + .describe("The 2 dimensional strides as (stride_height, stride_width)."); + TVM_ATTR_FIELD(padding) + .set_default(Array({0, 0, 0, 0})) + .describe("The 4 dimensional padding."); + TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale"); + TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset"); + TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale"); + TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset"); + TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale"); + TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset"); + TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale"); + TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset"); + TVM_ATTR_FIELD(activation) + .set_default(false) + .describe("If it has a ReLu activation (True) or not (False)"); + } +}; + +TVM_REGISTER_NODE_TYPE(GemminiDepthwiseConv2dAttrs); + +bool GemminiDepthwiseConv2dRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const int data_index = 0; + const int weights_index = 1; + const int bias_index = 2; + const int result_index = 3; + + const auto* data = types[data_index].as(); + const auto* weights = types[weights_index].as(); + const auto* bias = types[bias_index].as(); + if (data == nullptr) return false; + if (weights == nullptr) return false; + if (bias == nullptr) return false; + + const auto* params = attrs.as(); + ICHECK(params != nullptr) << "GemminiDepthwiseConv2dAttrs cannot be nullptr."; + + DataType ofm_dtype = DataType::Int(8); + + // Assign ofm type + Array ofm_shape( + {data->shape[0], + ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[1]) / + params->strides[0]) + + 1, + ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[2]) / + params->strides[1]) + + 1, + weights->shape[0]}); + reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype)); + return true; +} + +Expr MakeGemminiDepthwiseConv2d(Expr data, Expr weights, Expr bias, Array strides, + Array padding, double ifm_scale, Expr ifm_offset, + double weights_scale, double weights_offset, Expr bias_scale, + Expr bias_offset, Expr ofm_scale, Expr ofm_offset, + bool activation) { + auto attrs = make_object(); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->activation = std::move(activation); + attrs->ifm_scale = std::move(ifm_scale); + attrs->ifm_offset = std::move(ifm_offset); + attrs->weights_scale = std::move(weights_scale); + attrs->weights_offset = std::move(weights_offset); + attrs->bias_scale = std::move(bias_scale); + attrs->bias_offset = std::move(bias_offset); + attrs->ofm_scale = std::move(ofm_scale); + attrs->ofm_offset = 
std::move(ofm_offset); + + static const Op& op = Op::Get("contrib.gemmini.depthwiseconv2d"); + + // Bias change + // Term 3 + auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {1, 2}, false, false); + auto term3 = Multiply(attrs->ifm_offset, reduced_t3); + + auto new_bias = Subtract(bias, term3); + auto scale = Divide(attrs->bias_scale, attrs->ofm_scale); + auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale); + new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32))); + + auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {}); + return conv2d_output; +} + +TVM_REGISTER_GLOBAL("relay.op._make.gemmini_depthwise_conv2d") + .set_body_typed(MakeGemminiDepthwiseConv2d); + +RELAY_REGISTER_OP("contrib.gemmini.depthwiseconv2d") + .describe("Gemmini 2D depthwise convolution operator.") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights tensor.") + .add_argument("bias", "Tensor", "The bias tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiDepthwiseConv2d", GemminiDepthwiseConv2dRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/contrib/gemmini/gemm.cc b/src/relay/op/contrib/gemmini/gemm.cc new file mode 100644 index 000000000000..6002e72aaa41 --- /dev/null +++ b/src/relay/op/contrib/gemmini/gemm.cc @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/gemm.cc + * \brief GEMM operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/utils.h" +#include "../../op_common.h" +//#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini GEMM operator */
+struct GemminiGEMMAttrs : public tvm::AttrsNode<GemminiGEMMAttrs> {
+  Expr ifm_scale;
+  Expr ifm_offset;
+  Expr bias_scale;
+  Expr bias_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+
+  TVM_DECLARE_ATTRS(GemminiGEMMAttrs, "relay.attrs.GemminiGEMMAttrs") {
+    TVM_ATTR_FIELD(ifm_scale).describe("Data quantization scale");
+    TVM_ATTR_FIELD(ifm_offset).describe("Data quantization offset");
+    TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale");
+    TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiGEMMAttrs);
+
+bool GemminiGEMMRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  const int ifm1_index = 0;
+  const int ifm2_index = 1;
+  const int bias_index = 2;
+  const int result_index = 3;
+
+  const auto* ifm1 = types[ifm1_index].as<TensorTypeNode>();
+  const auto* ifm2 = types[ifm2_index].as<TensorTypeNode>();
+  const auto* bias = types[bias_index].as<TensorTypeNode>();
+  if (ifm1 == nullptr) return false;
+  if (ifm2 == nullptr) return false;
+  if (bias == nullptr) return false;
+
+  const auto* param = attrs.as<GemminiGEMMAttrs>();
+  ICHECK(param != nullptr) << "GemminiGEMMAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  Array<IndexExpr> ofm_shape({ifm1->shape[0], ifm2->shape[1]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiGEMM(Expr data, Expr weights, Expr bias, Expr ifm_scale, Expr ifm_offset,
+                     Expr bias_scale, Expr bias_offset, Expr ofm_scale, Expr ofm_offset) {
+  auto attrs = make_object<GemminiGEMMAttrs>();
+  attrs->ifm_scale = std::move(ifm_scale);
+  attrs->ifm_offset = std::move(ifm_offset);
+  attrs->bias_scale = std::move(bias_scale);
+  attrs->bias_offset = std::move(bias_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+
+  static const Op& op = Op::Get("contrib.gemmini.gemm");
+
+  auto weights_transposed = MakeTranspose(weights, {1, 0});
+  auto reduced_t3 = Sum(Cast(weights_transposed, DataType::Int(32)), {0}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+
+  auto scale = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale);
+
+  auto new_bias = Add(Subtract(bias, term3), Cast(bias_fix, DataType::Int(32)));
+
+  auto gemm_output = Call(op, {data, weights_transposed, new_bias}, Attrs(attrs), {});
+  return gemm_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_gemm").set_body_typed(MakeGemminiGEMM);
+
+RELAY_REGISTER_OP("contrib.gemmini.gemm")
+    .describe("Gemmini GEMM operator")
+    .set_attrs_type<GemminiGEMMAttrs>()
+    .set_num_inputs(3)
+    .add_argument("ifm1", "Tensor", "The Input Feature Map tensor.")
+    .add_argument("ifm2", "Tensor", "The Weights tensor.")
+    .add_argument("bias", "Tensor", "The bias tensor")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiGEMM", GemminiGEMMRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/max_pool2d.cc b/src/relay/op/contrib/gemmini/max_pool2d.cc
new file mode 100644
index 000000000000..2e435ceea875
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/max_pool2d.cc
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/max_pool2d.cc
+ * \brief 2D max pool operator definition for Gemmini.
+ * \author Federico Peccia
+ */
+#include 
+
+#include "../../../qnn/utils.h"
+#include "../../op_common.h"
+//#include "common.h"
+
+namespace tvm {
+namespace relay {
+namespace op {
+namespace contrib {
+namespace gemmini {
+
+/*! \brief Attributes used by the Gemmini max pool2d operator */
+struct GemminiMaxPool2DAttrs : public tvm::AttrsNode<GemminiMaxPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> pool_strides;
+  Array<IndexExpr> pool_dilation;
+  Array<IndexExpr> pool_padding;
+  Array<IndexExpr> shape;
+
+  TVM_DECLARE_ATTRS(GemminiMaxPool2DAttrs, "relay.attrs.GemminiMaxPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size).describe("Pooling window size");
+    TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides");
+    TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation");
+    TVM_ATTR_FIELD(pool_padding).describe("Pooling padding");
+    TVM_ATTR_FIELD(shape).describe("Input shape");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiMaxPool2DAttrs);
+
+bool GemminiMaxPool2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                         const TypeReporter& reporter) {
+  const int data_index = 0;
+  const int result_index = 2;
+
+  const auto* data = types[data_index].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const auto* params = attrs.as<GemminiMaxPool2DAttrs>();
+  ICHECK(params != nullptr) << "GemminiMaxPool2DAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  PrimExpr max_pool2d_h = ((data->shape[1] + (params->pool_padding[0] + params->pool_padding[2]) -
+                            params->pool_size[0]) /
+                           params->pool_strides[0]) +
+                          1;
+  PrimExpr max_pool2d_w = ((data->shape[2] + (params->pool_padding[1] + params->pool_padding[3]) -
+                            params->pool_size[1]) /
+                           params->pool_strides[1]) +
+                          1;
+  Array<IndexExpr> ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, data->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiMaxPool2D(Expr data, Array<IndexExpr> pool_size, Array<IndexExpr> pool_strides,
+                          Array<IndexExpr> pool_dilation, Array<IndexExpr> pool_padding,
+                          Array<IndexExpr> shape) {
+  auto attrs = make_object<GemminiMaxPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->pool_strides = std::move(pool_strides);
+  attrs->pool_dilation = std::move(pool_dilation);
+  attrs->pool_padding = std::move(pool_padding);
+  attrs->shape = std::move(shape);
+
+  static const Op& op = Op::Get("contrib.gemmini.max_pool2d");
+
+  // Reuse Gemmini's depthwise convolution function to accelerate the max pooling
+  // operation: the constant weight tensor of ones acts only as a dummy argument.
+  auto weights =
+      Full(MakeConstantScalar(DataType::Int(8), 1), {attrs->shape[3], 1, 1}, DataType::Int(8));
+
+  auto max_pool2d_output = Call(op, {data, weights}, Attrs(attrs), {});
+
+  return max_pool2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_max_pool2d").set_body_typed(MakeGemminiMaxPool2D);
+
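+// A usage sketch from the Python side, through the packed function registered above
+// (the shapes and pooling parameters here are hypothetical; the Python wrappers this
+// patch actually ships live under python/tvm/relay/backend/contrib/gemmini/):
+//
+//   import tvm
+//   from tvm import relay
+//   _max_pool2d = tvm.get_global_func("relay.op._make.gemmini_max_pool2d")
+//   data = relay.var("ifm", shape=(1, 112, 112, 32), dtype="int8")
+//   call = _max_pool2d(data, [2, 2], [2, 2], [1, 1], [0, 0, 0, 0],
+//                      [1, 112, 112, 32])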
+RELAY_REGISTER_OP("contrib.gemmini.max_pool2d") + .describe("Gemmini 2D max pooling operator") + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights dummy tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiMaxPool2D", GemminiMaxPool2DRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index c8c099171c96..d827aac35647 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -229,7 +229,8 @@ runtime::Module CreateMetadataModule( // TODO(@manupa-arm) : we should be able to use csource_metadata // if the variables are empty when all the runtime modules implement get_func_names if (symbol_const_vars.empty() && is_targeting_crt && mod->IsDSOExportable() && - (target->kind->name == "c" || target->kind->name == "llvm")) { + (target->kind->name == "c" || target->kind->name == "llvm" || + target->kind->name == "gemmini")) { crt_exportable_modules.push_back(mod); } else { non_crt_exportable_modules.push_back(mod); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index a47158d37883..84fc9bb9dac9 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -64,6 +64,9 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s decl_stream << "#include \n"; decl_stream << "#include \n"; } + if (target_str.find("gemmini") != std::string::npos) { + decl_stream << "#include \"gemmini_testutils.h\"\n"; + } CodeGenC::Init(output_ssa); } diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index a8d8936c905a..43d50306be45 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -39,7 +39,8 @@ LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { - ICHECK(vdtype.is_handle()); + // TODO (FP): Is this check really necessary? + // ICHECK(vdtype.is_handle()); } else { ICHECK_EQ(value.dtype(), var.dtype()); } diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc new file mode 100644 index 000000000000..d73f6b9b63ca --- /dev/null +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \brief Correct pointer addresses in scratchpad and accumulator of Gemmini
+ * \file inject_gemmini_pointer_correction.cc
+ * \author Federico Peccia
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "../../runtime/thread_storage_scope.h"
+#include "ir_utils.h"
+
+namespace tvm {
+namespace tir {
+
+struct CorrectGemminisScratchpadAndAccumulatorPointersConfigNode
+    : public tvm::AttrsNode<CorrectGemminisScratchpadAndAccumulatorPointersConfigNode> {
+  int dim;
+
+  TVM_DECLARE_ATTRS(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode,
+                    "tir.transform.CorrectGemminisScratchpadAndAccumulatorPointersConfig") {
+    TVM_ATTR_FIELD(dim).describe("Systolic array DIM").set_default(16);
+  }
+};
+
+class CorrectGemminisScratchpadAndAccumulatorPointersConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(
+      CorrectGemminisScratchpadAndAccumulatorPointersConfig, Attrs,
+      CorrectGemminisScratchpadAndAccumulatorPointersConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.CorrectGemminisScratchpadAndAccumulatorPointers",
+                                CorrectGemminisScratchpadAndAccumulatorPointersConfig);
+
+class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprMutator {
+ public:
+  explicit CorrectGemminisScratchpadAndAccumulatorPointersInjector(int dim) : dim_(dim) {}
+
+  Stmt Inject(Stmt stmt) { return this->VisitStmt(stmt); }
+
+  PrimExpr VisitExpr_(const CallNode* op) final {
+    /*
+      Rewrite the offset argument of tvm_access_ptr calls that point into Gemmini's
+      local (scratchpad/accumulator) buffers: the part of the offset outside the
+      accessed extent is divided by the systolic array dimension, while the offset
+      within the extent is preserved.
+    */
+    auto node = Downcast<Call>(StmtExprMutator::VisitExpr_(op));
+    if (node->op.same_as(builtin::tvm_access_ptr())) {
+      const VarNode* buffer = node->args[1].as<VarNode>();
+
+      if (std::string(buffer->name_hint).find("local") != std::string::npos) {
+        PrimExpr offset = this->VisitExpr(node->args[2]);
+        PrimExpr extent = this->VisitExpr(node->args[3]);
+
+        const auto* ptr_type = buffer->type_annotation.as<PointerTypeNode>();
+        ICHECK(ptr_type) << "The provided variable is not of pointer type";
+        auto scope = ptr_type->storage_scope;
+        auto info = GetMemoryInfo(scope);
+        ICHECK(info.defined()) << "Cannot find memory info of " << scope;
+        DataType dtype = Downcast<PrimType>(ptr_type->element_type)->dtype;
+        int dtype_bits = dtype.bits() * dtype.lanes();
+
+        int div = dim_;
+        const IntImmNode* extent_int = extent.as<IntImmNode>();
+
+        PrimExpr inner_offset = indexmod(offset, extent);
+        PrimExpr outer_offset = offset - inner_offset;
+        PrimExpr outer_offset_corrected = indexdiv(outer_offset, div);
+        PrimExpr offset_corrected = outer_offset_corrected + inner_offset;
+
+        return Call(node->dtype, node->op,
+                    {node->args[0], node->args[1], offset_corrected, extent, node->args[4]});
+      }
+    }
+    return StmtExprMutator::VisitExpr_(op);
+  }
+
+ private:
+  int dim_;
+};
+
+namespace transform {
+
+Pass CorrectGemminisScratchpadAndAccumulatorPointers() {
+  auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
+    auto* n = f.CopyOnWrite();
+    auto cfg = ctx->GetConfig<CorrectGemminisScratchpadAndAccumulatorPointersConfig>(
+        "tir.CorrectGemminisScratchpadAndAccumulatorPointers");
+    if (!cfg.defined()) {
+      cfg = AttrsWithDefaultValues<CorrectGemminisScratchpadAndAccumulatorPointersConfig>();
+    }
+    n->body = CorrectGemminisScratchpadAndAccumulatorPointersInjector(cfg.value()->dim)
+                  .Inject(std::move(n->body));
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.CorrectGemminisScratchpadAndAccumulatorPointers",
+                            {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers")
    .set_body_typed(CorrectGemminisScratchpadAndAccumulatorPointers);
+
+}  // namespace transform
+
+}  // namespace tir
+}
// namespace tvm From 02bf48a587ca39ff191012c1e17179d2c01c6e37 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:09:03 +0100 Subject: [PATCH 002/286] Merged Makefiles into template, added cmake Gemmini config, moved tutorials to gallery --- apps/microtvm/gemmini/README.md | 2 +- .../template_project/microtvm_api_server.py | 122 +----- .../add/Makefile => Makefile.template} | 8 - .../src/makefiles/conv2d/Makefile | 68 --- .../src/makefiles/dense/Makefile | 68 --- .../src/makefiles/dwconv2d/Makefile | 68 --- .../src/makefiles/maxpool2d/Makefile | 68 --- .../src/makefiles/mobilenet/Makefile | 68 --- cmake/config.cmake | 3 + cmake/modules/contrib/Gemmini.cmake | 14 +- .../micro_gemmini/README.txt | 5 + .../micro_gemmini/micro_gemmini_add.py | 234 +++++++++++ .../micro_gemmini/micro_gemmini_conv2d.py | 215 ++++++++++ .../micro_gemmini/micro_gemmini_dense.py | 214 ++++++++++ .../micro_gemmini/micro_gemmini_dwconv2d.py | 207 +++++++++ .../micro_gemmini/micro_gemmini_maxpool2d.py | 211 ++++++++++ .../micro_gemmini/micro_gemmini_mobilenet.py | 262 ++++++++++++ .../networks/mobilenet-tutorial.ipynb | 311 -------------- .../tutorials/networks/mobilenet_utils.py | 138 ------ .../single_operators/add-tutorial.ipynb | 395 ------------------ .../single_operators/conv2d-tutorial.ipynb | 378 ----------------- .../single_operators/dense-tutorial.ipynb | 378 ----------------- .../single_operators/dwconv2d-tutorial.ipynb | 373 ----------------- .../single_operators/maxpool2d-tutorial.ipynb | 378 ----------------- 24 files changed, 1370 insertions(+), 2818 deletions(-) rename apps/microtvm/gemmini/template_project/src/{makefiles/add/Makefile => Makefile.template} (82%) delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/README.txt create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py delete mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md index 11fea3415b70..9b4c45716062 100644 --- 
a/apps/microtvm/gemmini/README.md +++ b/apps/microtvm/gemmini/README.md @@ -1,3 +1,3 @@ This directory contains code to create code for the Gemmini accelerator using microTVM. These tests are then executed on the Spike RISC-V ISA simulator. -In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the Chipyard repository. +In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the [Chipyard](https://chipyard.readthedocs.io/en/stable/) repository. The instructions to also install the patch of the Spike simulator that adds the Gemmini functional simulator can be found in the [Gemmini](https://github.com/ucb-bar/gemmini) repository. diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index f4d4f7eb5e89..85971316ec4e 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -109,6 +109,14 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type): shutil.copytree(item, dest) else: shutil.copy2(item, dest) + + shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile") + + test_name = project_type.replace("_example","") + new_line = f"tests = {test_name}\n" + with open(project_dir / "src" / "Makefile", 'r') as original: data = original.read() + with open(project_dir / "src" / "Makefile", 'w') as modified: modified.write(new_line + data) + CRT_COPY_ITEMS = ("include", "src") @@ -122,18 +130,6 @@ def _copy_standalone_crt(self, source_dir, standalone_crt_dir): else: shutil.copy2(src_path, dst_path) - # Example project is the "minimum viable project", - # and doesn't need a fancy RPC server - EXAMPLE_PROJECT_UNUSED_COMPONENTS = [] - - def _remove_unused_components(self, source_dir, project_type): - unused_components = [] - if project_type == "example_project": - unused_components = self.EXAMPLE_PROJECT_UNUSED_COMPONENTS - - for component in unused_components: - shutil.rmtree(source_dir / "standalone_crt" / component) - def _disassemble_mlf(self, mlf_tar_path, source_dir): with tempfile.TemporaryDirectory() as mlf_unpacking_dir_str: mlf_unpacking_dir = pathlib.Path(mlf_unpacking_dir_str) @@ -158,48 +154,12 @@ def _disassemble_mlf(self, mlf_tar_path, source_dir): metadata = json.load(f) return metadata - def _template_model_header(self, source_dir, metadata): - with open(source_dir / "model.h", "r") as f: - model_h_template = Template(f.read()) - - assert ( - metadata["style"] == "full-model" - ), "when generating AOT, expect only full-model Model Library Format" - - template_values = { - "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][ - "workspace_size_bytes" - ], - } - - with open(source_dir / "model.h", "w") as f: - f.write(model_h_template.substitute(template_values)) - - # Arduino ONLY recognizes .ino, .ccp, .c, .h - CPP_FILE_EXTENSION_SYNONYMS = ("cc", "cxx") - def _change_cpp_file_extensions(self, source_dir): - for ext in self.CPP_FILE_EXTENSION_SYNONYMS: - for filename in source_dir.rglob(f"*.{ext}"): - filename.rename(filename.with_suffix(".cpp")) - - for filename in source_dir.rglob(f"*.inc"): - filename.rename(filename.with_suffix(".h")) - def _convert_includes(self, project_dir, source_dir): """Changes all #include statements in project_dir to be relevant to their containing file's location. 
- Arduino only supports includes relative to a file's location, so this - function finds each time we #include a file and changes the path to - be relative to the file location. Does not do this for standard C - libraries. Also changes angle brackets syntax to double quotes syntax. - - See Also - ----- - https://www.arduino.cc/reference/en/language/structure/further-syntax/include/ - """ for ext in ("c", "h", "cpp"): for filename in source_dir.rglob(f"*.{ext}"): @@ -260,45 +220,6 @@ def _find_modified_include_path(self, project_dir, file_path, include_path): # It's probably a standard C/C++ header return include_path - def _copy_standalone_crt_makefiles(self, api_server_dir, source_dir): - print(source_dir) - shutil.copy2( - api_server_dir / "src/example_project/Makefile", - source_dir, - ) - shutil.copy2( - api_server_dir / "src/example_project/Makefile.in", - source_dir, - ) - shutil.copy2( - api_server_dir / "src/example_project/Makefrag", - source_dir, - ) - shutil.copy2( - api_server_dir / "src/example_project/build.sh", - source_dir, - ) - shutil.copy2( - api_server_dir / "src/example_project/configure.ac", - source_dir, - ) - shutil.copy2( - api_server_dir / "src/example_project/include/gemmini_nn.h", - source_dir / "include/gemmini_nn.h", - ) - shutil.copy2( - api_server_dir / "src/example_project/include/gemmini_testutils.h", - source_dir / "include/gemmini_testutils.h", - ) - shutil.copy2( - api_server_dir / "src/example_project/include/gemmini.h", - source_dir / "include/gemmini.h", - ) - shutil.copy2( - api_server_dir / "src/example_project/rocc-software/src/xcustom.h", - source_dir / "rocc-software/src/xcustom.h", - ) - def _copy_debug_data_files(self, project_dir): if os.path.isdir(str(project_dir / ".." / "include")): copy_tree(str(project_dir / ".." 
/ "include"), str(project_dir / "src" / "model")) @@ -317,7 +238,6 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # Copy standalone_crt into src folder self._copy_standalone_crt(source_dir, standalone_crt_dir) - self._remove_unused_components(source_dir, options["project_type"]) # Populate crt-config.h crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config" @@ -327,47 +247,27 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec ) # Unpack the MLF and copy the relevant files - # extract_path = os.path.splitext(model_library_format_path)[0] - # with tarfile.TarFile(model_library_format_path) as tf: - # os.makedirs(project_dir / MODEL_LIBRARY_FORMAT_RELPATH) - # tf.extractall(path=project_dir / MODEL_LIBRARY_FORMAT_RELPATH) metadata = self._disassemble_mlf(model_library_format_path, source_dir) shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) self._copy_debug_data_files(project_dir) - # For AOT, template model.h with metadata to minimize space usage - # if options["project_type"] == "example_project": - # self._template_model_header(source_dir, metadata) - - # Copy makefiles to treat standalone crt code as RIOT modules - # self._copy_standalone_crt_makefiles(API_SERVER_DIR, source_dir) - - self._change_cpp_file_extensions(source_dir) # Recursively change includes self._convert_includes(project_dir, source_dir) def build(self, options): subprocess.call( - "source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",), + "cd src && ./build.sh", shell=True, - executable="/bin/bash", ) - # os.system("source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",)) def flash(self, options): test_name = options["project_type"].split("_")[0] subprocess.call( - "source %s && cd src/build && spike --extension=gemmini %s" - % (os.environ["CHIPYARD_HOME"] + "/env.sh", test_name + "-baremetal"), + "cd src/build && spike --extension=gemmini %s" + % (test_name + "-baremetal",), shell=True, - executable="/bin/bash", ) - # os.system("source %s && cd src/build && spike --extension=gemmini %s" % (os.environ["CHIPYARD_HOME"] + "/env.sh",test_name + "-baremetal",)) - # if logging.root.level == logging.DEBUG: - # os.system("cd src/build && spike --extension=gemmini ") - # else: - # os.system("cd src && make flash -s > /dev/null") def open_transport(self, options): pass diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile.template similarity index 82% rename from apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile rename to apps/microtvm/gemmini/template_project/src/Makefile.template index 2c997cea1a80..9368836a8802 100644 --- a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile.template @@ -1,8 +1,5 @@ include $(abs_top_srcdir)/Makefrag -tests = \ - add \ - tests_baremetal = $(tests:=-baremetal) ifeq ($(findstring spike,$(RUNNER)),spike) @@ -53,11 +50,6 @@ vpath %.c $(src_dir) %-baremetal: %.c $(GEMMINI_HEADERS) $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard 
$(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) run-baremetal: $(runs_baremetal) diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile deleted file mode 100644 index f80da67c3f98..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - conv2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile deleted file mode 100644 index 0b1932ceef91..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - dense \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common 
-GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile deleted file mode 100644 index fa89e5be162d..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - dwconv2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard 
$(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile deleted file mode 100644 index 1218e9e67a96..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - maxpool2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile deleted file mode 100644 index b6d977550097..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - mobilenet \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out 
tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal)))
-endif
-
-RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests
-BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common
-GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h
-STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt
-
-CFLAGS := $(CFLAGS) \
-	-DPREALLOCATE=1 \
-	-DMULTITHREAD=1 \
-	-mcmodel=medany \
-	-std=gnu99 \
-	-O2 \
-	-ffast-math \
-	-fno-common \
-	-fno-builtin-printf \
-	-march=rv64gc -Wa,-march=rv64gcxhwacha \
-	-lm \
-	-lgcc \
-	-I${RISCV_TESTS} \
-	-I${RISCV_TESTS}/env \
-	-I$(abs_top_srcdir) \
-	-I$(abs_top_srcdir)/include \
-	-I$(BENCH_COMMON) \
-	-DID_STRING=$(ID_STRING) \
-	-DPRINT_TILE=0 \
-
-CFLAGS_BAREMETAL := \
-	$(CFLAGS) \
-	-nostdlib \
-	-nostartfiles \
-	-static \
-	-T $(BENCH_COMMON)/test.ld \
-	-DBAREMETAL=1 \
-
-all: $(tests_baremetal)
-
-vpath %.c $(src_dir)
-
-%-baremetal: %.c $(GEMMINI_HEADERS)
-	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
-		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
-#		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
-		$(LIBS)
-
-run-baremetal: $(runs_baremetal)
-
-%-baremetal.run: %-baremetal
-	$(RUNNER)$(abs_top_srcdir)/build/$^
-
-junk += $(tests_baremetal)
-
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 679f5c459e87..5a93f9db652b 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -285,6 +285,9 @@ set(USE_ANTLR OFF)
 # Whether use Relay debug mode
 set(USE_RELAY_DEBUG OFF)
 
+# Whether to build the microTVM Gemmini integration
+set(USE_GEMMINI OFF)
+
 # Whether to build fast VTA simulator driver
 set(USE_VTA_FSIM OFF)
 
diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake
index 4b73d183ddc1..0d224c74ea75 100644
--- a/cmake/modules/contrib/Gemmini.cmake
+++ b/cmake/modules/contrib/Gemmini.cmake
@@ -1,4 +1,4 @@
-if(USE_MICRO)
+if(USE_GEMMINI)
   message(STATUS "Add Gemmini for microTVM")
 
   function(microtvm_add_gemmini)
@@ -10,7 +10,7 @@ if(USE_MICRO)
 
       # Dense example project generation
       "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example"
-      "apps/microtvm/gemmini/template_project/src/makefiles/dense Makefile -> gemmini/src/dense_example"
+      "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dense_example"
       "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example"
       "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example"
       "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example"
@@ -20,7 +20,7 @@ if(USE_MICRO)
 
       # CONV2D example project generation
       "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example"
-      "apps/microtvm/gemmini/template_project/src/makefiles/conv2d Makefile -> gemmini/src/conv2d_example"
+      "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/conv2d_example"
       "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example"
       "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example"
"3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" @@ -30,7 +30,7 @@ if(USE_MICRO) # DW CONV2D example project generation "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d Makefile -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" @@ -40,7 +40,7 @@ if(USE_MICRO) # ADD example project generation "apps/microtvm/gemmini/template_project/src add.c -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src/makefiles/add Makefile -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" @@ -50,7 +50,7 @@ if(USE_MICRO) # Max pooling 2d example project generation "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d Makefile -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" @@ -60,7 +60,7 @@ if(USE_MICRO) # Mobilenet example project generation "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src/makefiles/mobilenet Makefile -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt new file mode 100644 index 000000000000..6826cc7ab810 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt @@ -0,0 +1,5 @@ +.. _tutorial-micro-gemmini: + +Generate code for the Gemmini accelerator using microTVM +------------------ +These how-tos demonstrate how to deploy models for the Gemmini accelerator using microTVM. diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py new file mode 100644 index 000000000000..b3fe3c5bb3a0 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single add layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized add layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+Note: This is an **experimental** layer!
+"""
+
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source /env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm environment.
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# Then we define the parameters of the layer we want to test. In this case:
+input_height = 16
+input_width = 16
+input_channels = 16
+activation = 0
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
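+# Concretely, the "specific flags" are the full-integer post-training
+# quantization options used when the model is converted further below:
+# per-tensor (not per-channel) quantization, int8 kernels, uint8 inputs and
+# int8 outputs, driven by a representative dataset. For quick reference, the
+# converter configuration used throughout these tutorials boils down to:
+#
+#   converter.optimizations = [tf.lite.Optimize.DEFAULT]
+#   converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+#   converter.inference_input_type = tf.uint8
+#   converter.inference_output_type = tf.int8
+#   converter.representative_dataset = representative_data_gen
+#   converter._experimental_disable_per_channel = True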
+class Model(tf.Module): + def __init__(self, name=None): + super().__init__(name) + + @tf.function( + input_signature=[ + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + ] + ) + def add(self, x, y): + if activation == 0: + return x + y + else: + return layers.Activation("relu")(x + y) + +model = Model() + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + +def representative_data_gen(): + dataset = [ + ( + np.array( + np.random.randint( + -127, 128, size=(1, input_height, input_width, input_channels) + ), + dtype=np.float32, + ), + np.array( + np.random.randint( + 0, 128, size=(1, input_height, input_width, input_channels) + ), + dtype=np.float32, + ), + ) + for s in range(100) + ] + for input_value in dataset: + yield [input_value[0], input_value[1]] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +with open("add.tflite", "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. + +os.system("rm -rf model.tar dev/ include/ generated-project/") + +tflite_file = "./add.tflite" +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) + +interpreter.set_tensor(input_details[0]["index"], input_matrix_1) +interpreter.set_tensor(input_details[1]["index"], input_matrix_2) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. +gemmini.create_header_file("inputs", "data", "input_1", input_matrix_2, "./include") +gemmini.create_header_file("inputs", "data", "input_2", input_matrix_1, "./include") +gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. 
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={"serving_default_x": (1, input_height, input_width, input_channels), "serving_default_y": (1, input_height, input_width, input_channels)},
+    dtype_dict={"serving_default_x": input_dtype, "serving_default_y": input_dtype},
+)
+mod = relay.transform.InferType()(mod)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project.
+
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "add_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
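+# A sketch of a tolerance-based check for such rounding differences, assuming
+# the Gemmini result has been copied back into a hypothetical numpy array
+# `gemmini_output` (this script does not produce it, so the check is left
+# commented out):
+#
+#   diff = np.abs(expected_output.astype(np.int32) - gemmini_output.astype(np.int32))
+#   assert np.all(diff <= 1), f"{np.count_nonzero(diff > 1)} elements differ by more than 1"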
+#generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py
new file mode 100644
index 000000000000..18bca38eafa0
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py
@@ -0,0 +1,215 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single 2d convolutional layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized 2d convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+"""
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source /env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm environment.
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# Then we define the parameters of the layer we want to test. In this case:
+input_height = 16
+input_width = 16
+input_channels = 16
+output_channels = 16
+kernel_size = 3
+stride = 1
+padding = 'valid'
+activation = None
+bias = True
+
+# We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator.
+pool_size = 1
+pool_stride = 1
+pool_padding = 'valid'
+use_pool = False
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
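+# With 'valid' padding, the output size per spatial dimension follows the
+# usual formula (input - kernel) // stride + 1, so the parameters above yield
+# a (1, 14, 14, 16) output tensor. A quick host-side sanity check (the two
+# variables below are introduced only for this check and are not used later):
+expected_out_height = (input_height - kernel_size) // stride + 1  # 14
+expected_out_width = (input_width - kernel_size) // stride + 1  # 14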
+
+layer_sequence = [
+    layers.Conv2D(
+        output_channels,
+        kernel_size=kernel_size,
+        padding=padding,
+        activation=activation,
+        use_bias=True,
+        bias_initializer="ones",
+        input_shape=(input_height, input_width, input_channels),
+        strides=stride,
+    )
+]
+if use_pool:
+    layer_sequence.append(
+        layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)
+    )
+
+model = keras.Sequential(layer_sequence)
+
+# Convert the concrete functions using TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+def representative_data_gen():
+    dataset = [
+        np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)
+        for s in range(10)
+    ]
+    for input_value in dataset:
+        # Model has only one input so each data point has one element.
+        yield [input_value]
+
+
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8
+converter.inference_output_type = tf.int8
+converter.representative_dataset = representative_data_gen
+converter._experimental_disable_per_channel = True
+
+tflite_model = converter.convert()
+
+# Save the model.
+with open("conv.tflite", "wb") as f:
+    f.write(tflite_model)
+
+# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator.
+
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./conv.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path="./conv.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)
+interpreter.set_tensor(input_details[0]["index"], input_matrix)
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input_matrix, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
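+# (shape_dict and dtype_dict tell the TFLite frontend which shape and dtype
+# to assume for each named graph input; the keys should correspond to the
+# input tensor names stored in the .tflite model.)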
+mod, params = relay.frontend.from_tflite(
+    tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}
+)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "conv2d_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py
new file mode 100644
index 000000000000..35349a5c157f
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py
@@ -0,0 +1,214 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single dense layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized dense (fully connected) layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+"""
+
+import tensorflow as tf
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source /env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm environment.
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# Then we define the parameters of the layer we want to test. In this case:
+input_height = 32
+input_width = 32
+output_width = 32
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
+class Model(tf.Module):
+    def __init__(self, name=None):
+        super().__init__(name)
+        self.w = tf.Variable(tf.random.normal([input_width, output_width]), name="w")
+        self.b = tf.Variable(tf.random.normal([output_width]), name="b")
+
+    @tf.function(
+        input_signature=[
+            tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),
+        ]
+    )
+    def matmul(self, x):
+        return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b
+
+model = Model()
+
+# Convert the concrete functions using TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+
+def representative_data_gen():
+    dataset = [
+        (
+            np.array(
+                np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32
+            ),
+            np.array(
+                np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32
+            ),
+        )
+        for s in range(100)
+    ]
+    for input_value in dataset:
+        # Model has only one input so each data point has one element.
+ yield [input_value[0]] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +with open("matmul.tflite", "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. +os.system("rm -rf model.tar dev/ include/ generated-project/") + +tflite_file = "./matmul.tflite" +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8) +interpreter.set_tensor(input_details[0]["index"], input1) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. +gemmini.create_header_file("inputs", "data", "input", input1, "./include") +gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={ + "serving_default_x:0": (input_height, input_width), + }, + dtype_dict={ + "serving_default_x:0": input_dtype, + }, +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. 
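+# ("interface-api": "c" requests a plain C interface for the generated AOT
+# entry point, and "unpacked-api": 1 passes tensor buffers as raw pointers
+# rather than packed TVM values; both keep the output bare-metal friendly.)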
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "dense_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
+generated_project.flash()
+
+
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py
new file mode 100644
index 000000000000..44d3e57ea2d9
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py
@@ -0,0 +1,207 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" +Running TVM on the Gemmini accelerator - A single 2d depthwise convolutional layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized 2D depthwise convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. +""" + +import itertools +from pyrsistent import v +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np +import os +import argparse +import random +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 112 +input_width = 112 +input_channels = 32 +kernel_size = 3 +stride = 1 +padding = 'same' +activation = None +bias = True + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +model = keras.Sequential( + [ + layers.DepthwiseConv2D( + kernel_size=kernel_size, + padding=padding, + activation=activation, + use_bias=True, + bias_initializer="ones", + input_shape=(input_height, input_width, input_channels), + strides=stride, + ) + ] +) + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + +def representative_data_gen(): + dataset = [ + np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32) + for s in range(10) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +with open("dwconv.tflite", "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. 
+os.system("rm -rf model.tar dev/ include/ generated-project/") + +tflite_file = "./dwconv.tflite" +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path="./dwconv.tflite") +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8) +interpreter.set_tensor(input_details[0]["index"], input) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. +gemmini.create_header_file("inputs", "data", "input", input, "./include") +gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. 
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "dwconv2d_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
+
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py
new file mode 100644
index 000000000000..03798ae62851
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single 2d max pooling layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator.
The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+"""
+
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source /env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm environment.
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# Then we define the parameters of the layer we want to test. In this case:
+input_height = 16
+input_width = 16
+input_channels = 16
+pool_size = 2
+pool_stride = 1
+pool_padding = 'valid'
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
+class Model(tf.Module):
+    def __init__(self, name=None):
+        super().__init__(name)
+
+    @tf.function(
+        input_signature=[
+            tf.TensorSpec(
+                shape=[1, input_height, input_width, input_channels],
+                dtype=tf.float32,
+            )
+        ]
+    )
+    def maxpool(self, x):
+        return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)
+
+model = Model()
+
+# Convert the concrete functions using TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+
+def representative_data_gen():
+    dataset = [
+        np.array(
+            np.random.randint(
+                -127, 128, size=(1, input_height, input_width, input_channels)
+            ),
+            dtype=np.float32,
+        )
+        for s in range(100)
+    ]
+    for input_value in dataset:
+        # Model has only one input so each data point has one element.
+        yield [input_value]
+
+
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8
+converter.inference_output_type = tf.int8
+converter.representative_dataset = representative_data_gen
+converter._experimental_disable_per_channel = True
+
+tflite_model = converter.convert()
+
+# Save the model.
+with open("maxpool.tflite", "wb") as f:
+    f.write(tflite_model)
+
+# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./maxpool.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
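+# (experimental_preserve_all_tensors=True keeps intermediate tensor values
+# available after invoke(), which helps when inspecting per-layer outputs to
+# debug quantization differences.)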
+interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+tensor_details = interpreter.get_tensor_details()
+
+input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)
+
+interpreter.set_tensor(input_details[0]["index"], input_matrix_1)
+
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input_matrix_1, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={"serving_default_x": (1, input_height, input_width, input_channels)},
+    dtype_dict={"serving_default_x": input_dtype},
+)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the model library format.
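+# (Model Library Format is a .tar archive bundling the generated C sources
+# with graph and memory metadata; the microTVM project generator consumes it
+# in the next step.)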
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "maxpool2d_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: small mismatches can be caused by rounding errors.
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py
new file mode 100644
index 000000000000..5d3a5009b67e
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py
@@ -0,0 +1,262 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A complete MobileNet example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+"""
+
+import numpy as np
+import tensorflow as tf
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model
+from tvm.contrib.download import download_testdata
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source /env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.
+
+##################################
+# Helper functions
+# --------------------------------
+#
+# These functions will help us generate the MobileNet model.
+
+def get_real_image(im_height, im_width):
+    from PIL import Image
+
+    repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/"
+    img_name = "elephant-299.jpg"
+    image_url = os.path.join(repo_base, img_name)
+    img_path = download_testdata(image_url, img_name, module="data")
+    image = Image.open(img_path).resize((im_height, im_width))
+    x = np.array(image).astype("uint8")
+    data = np.reshape(x, (1, im_height, im_width, 3))
+    return data
+
+def run_tflite_model(tflite_model_buf, input_data):
+    """Generic function to execute TFLite"""
+    try:
+        from tensorflow import lite as interpreter_wrapper
+    except ImportError:
+        from tensorflow.contrib import lite as interpreter_wrapper
+
+    input_data = input_data if isinstance(input_data, list) else [input_data]
+
+    interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    # set input
+    assert len(input_data) == len(input_details)
+    for i in range(len(input_details)):
+        interpreter.set_tensor(input_details[i]["index"], input_data[i])
+
+    # Run
+    interpreter.invoke()
+
+    # get output
+    tflite_output = list()
+    for i in range(len(output_details)):
+        tflite_output.append(interpreter.get_tensor(output_details[i]["index"]))
+
+    return tflite_output
+
+def download_model():
+    model_url = (
+        "https://storage.googleapis.com/download.tensorflow.org/models/"
+        "tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
+    )
+
+    # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite
+    model_path = download_testdata(
+        model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"]
+    )
+    model_dir = os.path.dirname(model_path)
+
+    return model_dir, model_path
+
+
+def extract(path):
+    import tarfile
+
+    if path.endswith("tgz") or path.endswith("gz"):
+        dir_path = os.path.dirname(path)
+        tar = tarfile.open(path)
+        tar.extractall(path=dir_path)
+        tar.close()
+    else:
+        raise RuntimeError("Could not decompress the file: " + path)
+
+
+def create_tflite_model(model_dir: str):
+    # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0]
+    # return f"{model_dir}/{tflite_model_name}"
+    def representative_data_gen():
+        dataset = [
+            np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32)
+            for s in range(100)
+        ]
+        for input_value in dataset:
+            # Model has only one input, so each data point has one element.
+            yield [input_value]
+
+    pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0]
+    converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
+        f"{model_dir}/{pb_file}",
+        input_arrays=["input"],
+        input_shapes={"input": [1, 224, 224, 3]},
+        output_arrays=["MobilenetV2/Predictions/Reshape"],
+    )
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS]
+    converter.inference_input_type = tf.uint8
+    converter.inference_output_type = tf.uint8
+    converter.representative_dataset = representative_data_gen
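+    # Disabling per-channel quantization forces per-tensor scales, which matches
+    # the quantization scheme the Gemmini integration expects from its input models.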
+    converter._experimental_disable_per_channel = True
+
+    tflite_model = converter.convert()
+    tflite_model_name = pb_file.replace(".pb", ".tflite")
+    with open(f"{model_dir}/{tflite_model_name}", "wb") as f:
+        f.write(tflite_model)
+
+    return f"{model_dir}/{tflite_model_name}"
+
+
+def generate_mobilenet_tflite_model():
+    model_dir, model_path = download_model()
+    extract(model_path)
+    return create_tflite_model(model_dir)
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we will use to verify the output actually obtained after running on the Gemmini accelerator.
+
+# We clean and prepare the workspace.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+os.system("mkdir -p include")
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags.
+tflite_model_dir = generate_mobilenet_tflite_model()
+
+input_image = get_real_image(224, 224)
+
+tflite_model_file = os.path.join(tflite_model_dir)
+tflite_model_buf = open(tflite_model_file, "rb").read()
+
+# Now that we have created the model, we import it and run it. We store the output in order to compare it with the output later obtained from the Gemmini accelerator.
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+tflite_res = run_tflite_model(tflite_model_buf, input_image)
+tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]
+print("Expected argmax = %i" % (tflite_pred[0],))
+print("Expected max labels = %s" % (tflite_pred,))
+
+# Here, we create C files and headers with the inputs and expected output, so that we can execute the same operation on the Gemmini accelerator and compare the expected output with the actual one.
+gemmini.create_header_file("inputs", "data", "input", input_image, "./include")
+gemmini.create_header_file("outputs", "data", "output", tflite_pred.astype(np.uint32), "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+dtype_dict = {"input": input_image.dtype.name}
+shape_dict = {"input": input_image.shape}
+
+mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)
+mod = relay.transform.InferType()(mod)
+mod["main"]
+
+# In order to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess_pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we use the CRT runtime, the target is C because we want to generate C code (the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare-metal C code without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard toolchain, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "mobilenet_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+generated_project.flash()
\ No newline at end of file
diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
deleted file mode 100644
index 2c2527830858..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
+++ /dev/null
@@ -1,311 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# MobileNet tutorial\n",
-    "\n",
-    "This tutorials shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm\n", - "from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We clean and prepare the workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "os.system(\"mkdir -p include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tflite_model_dir = generate_mobilenet_tflite_model()\n", - "\n", - "input_image = get_real_image(224, 224)\n", - "\n", - "tflite_model_file = os.path.join(tflite_model_dir)\n", - "tflite_model_buf = open(tflite_model_file, \"rb\").read()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "tflite_res = run_tflite_model(tflite_model_buf, input_image)\n", - "tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]\n", - "print(\"Expected argmax = %i\" % (tflite_pred[0],))\n", - "print(\"Expected max labels = %s\" % (tflite_pred,))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_image, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", tflite_pred.astype(np.uint32), \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dtype_dict = {\"input\": input_image.dtype.name}\n", - "shape_dict = {\"input\": input_image.shape}\n", - "\n", - "mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)\n", - "mod = relay.transform.InferType()(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"mobilenet_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py deleted file mode 100644 index 51e75fdd7022..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py +++ /dev/null @@ -1,138 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-""" -Utils to help generate the MobileNet TFLite model -===================== -**Author**: `Federico Peccia `_ -""" - -import os -from tvm.contrib.download import download_testdata -import numpy as np -import tensorflow as tf - - -def get_real_image(im_height, im_width): - from PIL import Image - - repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" - img_name = "elephant-299.jpg" - image_url = os.path.join(repo_base, img_name) - img_path = download_testdata(image_url, img_name, module="data") - image = Image.open(img_path).resize((im_height, im_width)) - x = np.array(image).astype("uint8") - data = np.reshape(x, (1, im_height, im_width, 3)) - return data - - -def run_tflite_model(tflite_model_buf, input_data): - """Generic function to execute TFLite""" - try: - from tensorflow import lite as interpreter_wrapper - except ImportError: - from tensorflow.contrib import lite as interpreter_wrapper - - input_data = input_data if isinstance(input_data, list) else [input_data] - - interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) - interpreter.allocate_tensors() - - input_details = interpreter.get_input_details() - output_details = interpreter.get_output_details() - - # set input - assert len(input_data) == len(input_details) - for i in range(len(input_details)): - interpreter.set_tensor(input_details[i]["index"], input_data[i]) - - # Run - interpreter.invoke() - - # get output - tflite_output = list() - for i in range(len(output_details)): - tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) - - return tflite_output - - -def download_model(): - model_url = ( - "https://storage.googleapis.com/download.tensorflow.org/models/" - "tflite_11_05_08/mobilenet_v2_1.0_224.tgz" - ) - - # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite - model_path = download_testdata( - model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"] - ) - model_dir = os.path.dirname(model_path) - - return model_dir, model_path - - -def extract(path): - import tarfile - - if path.endswith("tgz") or path.endswith("gz"): - dir_path = os.path.dirname(path) - tar = tarfile.open(path) - tar.extractall(path=dir_path) - tar.close() - else: - raise RuntimeError("Could not decompress the file: " + path) - - -def create_tflite_model(model_dir: str): - # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0] - # return f"{model_dir}/{tflite_model_name}" - def representative_data_gen(): - dataset = [ - np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32) - for s in range(100) - ] - for input_value in dataset: - # Model has only one input so each data point has one element.s - yield [input_value] - - pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0] - converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( - f"{model_dir}/{pb_file}", - input_arrays=["input"], - input_shapes={"input": [1, 224, 224, 3]}, - output_arrays=["MobilenetV2/Predictions/Reshape"], - ) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS] - converter.inference_input_type = tf.uint8 - converter.inference_output_type = tf.uint8 - converter.representative_dataset = representative_data_gen - converter._experimental_disable_per_channel = True - - tflite_model = converter.convert() - tflite_model_name = 
pb_file.replace(".pb", ".tflite") - with open(f"{model_dir}/{tflite_model_name}", "wb") as f: - f.write(tflite_model) - - return f"{model_dir}/{tflite_model_name}" - - -def generate_mobilenet_tflite_model(): - model_dir, model_path = download_model() - extract(model_path) - return create_tflite_model(model_dir) diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb deleted file mode 100644 index 3bb2fa5788e9..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add layer tutorial\n", - "\n", - "This tutorials shows how a quantized add layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.\n", - "\n", - "Note: This is an **experimental** layer!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "activation = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " ),\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " ),\n", - " ]\n", - " )\n", - " def add(self, x, y):\n", - " if activation == 0:\n", - " return x + y\n", - " else:\n", - " return layers.Activation(\"relu\")(x + y)\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " (\n", - " np.array(\n", - " np.random.randint(\n", - " -127, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " ),\n", - " np.array(\n", - " np.random.randint(\n", - " 0, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " ),\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " yield [input_value[0], input_value[1]]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"add.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./add.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", - "interpreter.set_tensor(input_details[1][\"index\"], input_matrix_2)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input_1\", input_matrix_2, \"./include\")\n", - "gemmini.create_header_file(\"inputs\", \"data\", \"input_2\", input_matrix_1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels), \"serving_default_y\": (1, input_height, input_width, input_channels)},\n", - " dtype_dict={\"serving_default_x\": input_dtype, \"serving_default_y\": input_dtype},\n", - ")\n", - "mod = relay.transform.InferType()(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"add_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb deleted file mode 100644 index c7512586b809..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2D convolution layer tutorial\n", - "\n", - "This tutorials shows how a quantized 2d convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. 
In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "output_channels = 16\n", - "kernel_size = 3\n", - "stride = 1\n", - "padding = 'valid'\n", - "activation = None\n", - "bias = True\n", - "\n", - "# We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator.\n", - "pool_size = 1\n", - "pool_stride = 1\n", - "pool_padding = 'valid'\n", - "use_pool = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "layer_sequence = [\n", - " layers.Conv2D(\n", - " output_channels,\n", - " kernel_size=kernel_size,\n", - " padding=padding,\n", - " activation=activation,\n", - " use_bias=True,\n", - " bias_initializer=\"ones\",\n", - " input_shape=(input_height, input_width, input_channels),\n", - " strides=stride,\n", - " )\n", - "]\n", - "if use_pool:\n", - " layer_sequence.append(\n", - " layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)\n", - " )\n", - "\n", - "model = keras.Sequential(layer_sequence)\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)\n", - " for s in range(10)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.s\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"conv.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./conv.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=\"./conv.tflite\")\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix)\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"conv2d_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb deleted file mode 100644 index d1959f66b72a..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Dense layer tutorial\n", - "\n", - "This tutorials shows how a quantized dense (fully connected) layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 32\n", - "input_width = 32\n", - "output_width = 32" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - " self.w = tf.Variable(tf.random.normal([input_width, output_width]), name=\"w\")\n", - " self.b = tf.Variable(tf.random.normal([output_width]), name=\"b\")\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),\n", - " ]\n", - " )\n", - " def matmul(self, x):\n", - " return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " (\n", - " np.array(\n", - " np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32\n", - " ),\n", - " np.array(\n", - " np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32\n", - " ),\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.\n", - " yield [input_value[0]]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"matmul.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./matmul.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input1)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\n", - " \"serving_default_x:0\": (input_height, input_width),\n", - " },\n", - " dtype_dict={\n", - " \"serving_default_x:0\": input_dtype,\n", - " },\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"dense_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors."
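To make the rounding caveat concrete: the accelerator and the TFLite reference may round requantized int8 values differently, so results are best compared with a small tolerance rather than exact equality. A sketch, assuming the accelerator output has been read back into a NumPy array (the actual comparison happens inside the generated baremetal project, not in this notebook):

import numpy as np

def outputs_close(expected, actual, tolerance=1):
    # Cast to int32 first so the subtraction cannot wrap around in int8.
    diff = np.abs(expected.astype(np.int32) - actual.astype(np.int32))
    return bool(np.all(diff <= tolerance))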
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb deleted file mode 100644 index b5753a300401..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb +++ /dev/null @@ -1,373 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2D depthwise convolution layer tutorial\n", - "\n", - "This tutorial shows how a quantized 2D depthwise convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "from pyrsistent import v\n", - "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import argparse\n", - "import random\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 112\n", - "input_width = 112\n", - "input_channels = 32\n", - "kernel_size = 3\n", - "stride = 1\n", - "padding = 'same'\n", - "activation = None\n", - "bias = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
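Before quantizing, it can help to check the output shape these parameters imply. A small sketch: Keras computes the 'same'-padded output size as ceil(input / stride), and a depthwise convolution applies one filter per input channel, so the channel count is unchanged.

import math

def same_padding_output_shape(in_h, in_w, channels, stride):
    # Output spatial size for 'same' padding; channels pass through
    # unchanged for a depthwise convolution.
    return (math.ceil(in_h / stride), math.ceil(in_w / stride), channels)

# For the parameters above this gives (112, 112, 32).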
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = keras.Sequential(\n", - " [\n", - " layers.DepthwiseConv2D(\n", - " kernel_size=kernel_size,\n", - " padding=padding,\n", - " activation=activation,\n", - " use_bias=True,\n", - " bias_initializer=\"ones\",\n", - " input_shape=(input_height, input_width, input_channels),\n", - " strides=stride,\n", - " )\n", - " ]\n", - ")\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32)\n", - " for s in range(10)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.s\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"dwconv.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./dwconv.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=\"./dwconv.tflite\")\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." 
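The gemmini.create_header_file helper does this packaging for us. Purely as an illustration of the idea, and not the integration's actual output format, a much-simplified emitter could serialize a NumPy tensor into a C array as below (the function name and layout are hypothetical):

def write_c_array(name, array, path):
    # Flatten the tensor and emit it as a const C array of the matching
    # integer type (this sketch only handles int8/uint8 data).
    ctype = {"uint8": "uint8_t", "int8": "int8_t"}.get(str(array.dtype), "int8_t")
    flat = array.flatten()
    with open(path, "w") as f:
        f.write(f"const {ctype} {name}[{flat.size}] = {{\n")
        f.write(", ".join(str(int(v)) for v in flat))
        f.write("\n};\n")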
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
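The export cell below prints every member of the archive; to see the overall layout at a glance, the members can instead be grouped by their top-level entry (e.g. codegen/, src/, parameters/, metadata.json). A sketch, assuming the cell below has already produced dev/model.tar:

import collections
import tarfile

def summarize_mlf(tar_path):
    # Count archive members per top-level entry of the model library
    # format archive.
    counts = collections.Counter()
    with tarfile.open(tar_path, "r:*") as tar_f:
        for member in tar_f.getmembers():
            counts[member.name.lstrip("./").split("/")[0]] += 1
    return dict(counts)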
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"dwconv2d_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb deleted file mode 100644 index bdee93760f96..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb +++ /dev/null @@ -1,378 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2D max pooling layer tutorial\n", - "\n", - "This tutorial shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
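Because the notebook later shells out to the simulator, it can save time to verify the prerequisites up front. A minimal sketch; it only checks that CHIPYARD_HOME is set and that a spike binary is on the PATH, not that the Gemmini extension was actually built into it:

import os
import shutil

def check_spike_setup():
    assert os.environ.get("CHIPYARD_HOME"), "CHIPYARD_HOME is not set"
    assert shutil.which("spike") is not None, "spike simulator not found on PATH"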
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "pool_size = 2\n", - "pool_stride = 1\n", - "pool_padding = 'valid'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " )\n", - " ]\n", - " )\n", - " def maxpool(self, x):\n", - " return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(\n", - " np.random.randint(\n", - " -127, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"maxpool.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
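The interpreter output captured below can also be cross-checked with a direct NumPy implementation of max pooling. A sketch for the NHWC layout and 'valid' padding used here (keep in mind the quantized model returns int8 while this operates on the raw uint8 input):

import numpy as np

def maxpool2d_nhwc(x, pool_size, stride):
    # Slide a pool_size x pool_size window over H and W, keeping the max;
    # 'valid' padding means only fully covered windows produce an output.
    n, h, w, c = x.shape
    oh = (h - pool_size) // stride + 1
    ow = (w - pool_size) // stride + 1
    out = np.empty((n, oh, ow, c), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            window = x[:, i * stride:i * stride + pool_size,
                       j * stride:j * stride + pool_size, :]
            out[:, i, j, :] = window.max(axis=(1, 2))
    return out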
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./maxpool.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix_1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels)},\n", - " dtype_dict={\"serving_default_x\": input_dtype},\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"maxpool2d_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors."
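Note that flash() does not program a physical device here: the template project's microtvm_api_server.py (its diff appears in the next commit) runs the compiled binary on Spike. It is roughly equivalent to the following, executed from inside the generated project directory:

import subprocess

# The API server derives the binary name from the project type,
# so "maxpool2d_example" becomes "maxpool2d-baremetal".
subprocess.call(
    "cd src/build && spike --extension=gemmini maxpool2d-baremetal",
    shell=True,
)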
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 15c76c650ea83c1bc77f7223fc3f37ab1bb8f5e6 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:22:36 +0100 Subject: [PATCH 003/286] Small CMAKE fix and lint fixes --- CMakeLists.txt | 4 +- .../template_project/microtvm_api_server.py | 14 +++---- .../micro_gemmini/micro_gemmini_add.py | 37 +++++++++++-------- .../micro_gemmini/micro_gemmini_conv2d.py | 28 ++++++++------ .../micro_gemmini/micro_gemmini_dense.py | 11 ++---- .../micro_gemmini/micro_gemmini_dwconv2d.py | 22 ++++++----- .../micro_gemmini/micro_gemmini_maxpool2d.py | 21 +++++------ .../micro_gemmini/micro_gemmini_mobilenet.py | 14 ++++--- 8 files changed, 83 insertions(+), 68 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 47499ff90356..9cfa48fc045d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -575,7 +575,9 @@ if(USE_MICRO) # Unix Makefiles generator, need to add these explicit target-level dependency) add_dependencies(tvm_runtime zephyr) add_dependencies(tvm_runtime arduino) - add_dependencies(tvm_runtime gemmini) + if(USE_GEMMINI) + add_dependencies(tvm_runtime gemmini) + endif() if(MSVC) target_link_libraries(tvm PRIVATE host_standalone_crt ) target_link_libraries(tvm_runtime PRIVATE host_standalone_crt) diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index 85971316ec4e..df2f27d315ea 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -109,14 +109,15 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type): shutil.copytree(item, dest) else: shutil.copy2(item, dest) - + shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile") - test_name = project_type.replace("_example","") + test_name = project_type.replace("_example", "") new_line = f"tests = {test_name}\n" - with open(project_dir / "src" / "Makefile", 'r') as original: data = original.read() - with open(project_dir / "src" / "Makefile", 'w') as modified: modified.write(new_line + data) - + with open(project_dir / "src" / "Makefile", "r") as original: + data = original.read() + with open(project_dir / "src" / "Makefile", "w") as modified: + modified.write(new_line + data) CRT_COPY_ITEMS = ("include", "src") @@ -264,8 +265,7 @@ def build(self, options): def flash(self, options): test_name = options["project_type"].split("_")[0] subprocess.call( - "cd src/build && spike --extension=gemmini %s" - % (test_name + "-baremetal",), + "cd src/build && spike --extension=gemmini %s" % (test_name + "-baremetal",), shell=True, ) diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py index 
b3fe3c5bb3a0..b8521c4b6ae2 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py @@ -38,7 +38,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -80,24 +80,22 @@ def add(self, x, y): else: return layers.Activation("relu")(x + y) + model = Model() # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ ( np.array( - np.random.randint( - -127, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ), np.array( - np.random.randint( - 0, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(0, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ), ) @@ -147,8 +145,12 @@ def representative_data_gen(): output_details = interpreter.get_output_details() tensor_details = interpreter.get_tensor_details() -input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) -input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) +input_matrix_2 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix_1) interpreter.set_tensor(input_details[1]["index"], input_matrix_2) @@ -168,13 +170,18 @@ def representative_data_gen(): # In this section, we will compile the model using TVM and the Gemmini integration. # The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. -gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True) +gemmini.Environment.init_overwrite( + dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True +) # The TFLite model generated in the previous steps is now imported into TVM. 
mod, params = relay.frontend.from_tflite( tflite_model, - shape_dict={"serving_default_x": (1, input_height, input_width, input_channels), "serving_default_y": (1, input_height, input_width, input_channels)}, + shape_dict={ + "serving_default_x": (1, input_height, input_width, input_channels), + "serving_default_y": (1, input_height, input_width, input_channels), + }, dtype_dict={"serving_default_x": input_dtype, "serving_default_y": input_dtype}, ) mod = relay.transform.InferType()(mod) @@ -192,7 +199,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -217,9 +224,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "add_example" -} +project_options = {"project_type": "add_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -231,4 +236,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -#generated_project.flash() \ No newline at end of file +# generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py index 18bca38eafa0..b58881162dcc 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py @@ -38,7 +38,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -58,14 +58,14 @@ output_channels = 16 kernel_size = 3 stride = 1 -padding = 'valid' +padding = "valid" activation = None bias = True # We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator. pool_size = 1 pool_stride = 1 -pool_padding = 'valid' +pool_padding = "valid" use_pool = False # We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
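The "specific flags" this comment refers to are the TFLiteConverter settings that every tutorial in this patch sets by hand. For reference, collected into a single helper (a sketch; it assumes a Keras model and a representative-dataset generator like the ones defined in these scripts):

import tensorflow as tf

def quantize_for_gemmini(model, representative_data_gen):
    # Full-integer quantization with uint8 inputs, int8 outputs and
    # per-tensor (not per-channel) scales.
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.uint8
    converter.inference_output_type = tf.int8
    converter.representative_dataset = representative_data_gen
    converter._experimental_disable_per_channel = True
    return converter.convert()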
@@ -92,9 +92,13 @@ # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ - np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32) + np.array( + np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), + dtype=np.float32, + ) for s in range(10) ] for input_value in dataset: @@ -140,7 +144,9 @@ def representative_data_gen(): interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() -input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix = np.random.randint( + 0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix) interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) @@ -160,7 +166,9 @@ def representative_data_gen(): # The TFLite model generated in the previous steps is now imported into TVM. mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, ) mod["main"] @@ -174,7 +182,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -198,9 +206,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "conv2d_example" -} +project_options = {"project_type": "conv2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -212,4 +218,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py index 35349a5c157f..c9a7caffc71b 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py @@ -36,7 +36,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. 
code-block:: bash # # source /env.sh @@ -69,6 +69,7 @@ def __init__(self, name=None): def matmul(self, x): return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b + model = Model() # Convert the concrete functions using TFLiteConverter @@ -172,7 +173,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -195,9 +196,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "dense_example" -} +project_options = {"project_type": "dense_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -210,5 +209,3 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. generated_project.flash() - - diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py index 44d3e57ea2d9..14c39898278e 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py @@ -41,7 +41,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -60,7 +60,7 @@ input_channels = 32 kernel_size = 3 stride = 1 -padding = 'same' +padding = "same" activation = None bias = True @@ -82,9 +82,13 @@ # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ - np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32) + np.array( + np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), + dtype=np.float32, + ) for s in range(10) ] for input_value in dataset: @@ -152,7 +156,9 @@ def representative_data_gen(): # The TFLite model generated in the previous steps is now imported into TVM. 
mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, ) mod["main"] @@ -166,7 +172,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -189,9 +195,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "dwconv2d_example" -} +project_options = {"project_type": "dwconv2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -204,4 +208,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py index 03798ae62851..6dbb11695ac2 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py @@ -36,7 +36,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -55,7 +55,7 @@ input_channels = 16 pool_size = 2 pool_stride = 1 -pool_padding = 'valid' +pool_padding = "valid" # We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
class Model(tf.Module): @@ -73,6 +73,7 @@ def __init__(self, name=None): def maxpool(self, x): return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x) + model = Model() # Convert the concrete functions using TFLiteConverter @@ -82,9 +83,7 @@ def maxpool(self, x): def representative_data_gen(): dataset = [ np.array( - np.random.randint( - -127, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ) for s in range(100) @@ -133,7 +132,9 @@ def representative_data_gen(): output_details = interpreter.get_output_details() tensor_details = interpreter.get_tensor_details() -input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix_1) @@ -171,7 +172,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -194,9 +195,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "maxpool2d_example" -} +project_options = {"project_type": "maxpool2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -208,4 +207,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py index 5d3a5009b67e..fdb43096c87d 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py @@ -37,7 +37,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. 
code-block:: bash # # source /env.sh @@ -50,6 +50,7 @@ # # This functions will help us generate the MobileNet model + def get_real_image(im_height, im_width): from PIL import Image @@ -62,6 +63,7 @@ def get_real_image(im_height, im_width): data = np.reshape(x, (1, im_height, im_width, 3)) return data + def run_tflite_model(tflite_model_buf, input_data): """Generic function to execute TFLite""" try: @@ -92,6 +94,7 @@ def run_tflite_model(tflite_model_buf, input_data): return tflite_output + def download_model(): model_url = ( "https://storage.googleapis.com/download.tensorflow.org/models/" @@ -159,6 +162,7 @@ def generate_mobilenet_tflite_model(): extract(model_path) return create_tflite_model(model_dir) + ################################## # Baseline generation # -------------------------------- @@ -223,7 +227,7 @@ def generate_mobilenet_tflite_model(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -246,9 +250,7 @@ def generate_mobilenet_tflite_model(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "mobilenet_example" -} +project_options = {"project_type": "mobilenet_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -259,4 +261,4 @@ def generate_mobilenet_tflite_model(): generated_project.build() # Finally, we execute the compiled baremetal project on the Spike simulator. -generated_project.flash() \ No newline at end of file +generated_project.flash() From 511c6c9bb84c0911850370fc260f4c1ddbba5cf5 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:34:15 +0100 Subject: [PATCH 004/286] CI fixes --- src/tir/ir/stmt.cc | 2 +- src/tir/transforms/inject_gemmini_pointer_correction.cc | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index c01e6ccaec5f..ff28121db27d 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -35,11 +35,11 @@ namespace tir { LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { ICHECK(value.defined()); ICHECK(body.defined()); - auto vdtype = value.dtype(); // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { // TODO (FP): Is this check really necessary? 
+ // auto vdtype = value.dtype(); // ICHECK(vdtype.is_handle()); } else { ICHECK_EQ(value.dtype(), var.dtype()); diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc index d73f6b9b63ca..54f5692fee2c 100644 --- a/src/tir/transforms/inject_gemmini_pointer_correction.cc +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -83,10 +83,8 @@ class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprM auto info = GetMemoryInfo(scope); ICHECK(info.defined()) << "Cannot find memory info of " << scope; DataType dtype = Downcast(ptr_type->element_type)->dtype; - int dtype_bits = dtype.bits() * dtype.lanes(); int div = dim_; - const IntImmNode* extent_int = extent.as(); PrimExpr inner_offset = indexmod(offset, extent); PrimExpr outer_offset = offset - inner_offset; From 9be07358b9362bd9d0f860f33a186606324cd62c Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:50:44 +0100 Subject: [PATCH 005/286] LINT changes --- .../template_project/microtvm_api_server.py | 2 +- .../src/{Makefile.template => Makefile} | 0 .../src/{Makefrag => Makefrag.mk} | 0 cmake/modules/contrib/Gemmini.cmake | 24 +++++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) rename apps/microtvm/gemmini/template_project/src/{Makefile.template => Makefile} (100%) rename apps/microtvm/gemmini/template_project/src/{Makefrag => Makefrag.mk} (100%) diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index df2f27d315ea..1f55eedf1e3d 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -110,7 +110,7 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type): else: shutil.copy2(item, dest) - shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile") + shutil.copy2(project_dir / "src" / "Makefrag.mk", project_dir / "src" / "Makefrag") test_name = project_type.replace("_example", "") new_line = f"tests = {test_name}\n" diff --git a/apps/microtvm/gemmini/template_project/src/Makefile.template b/apps/microtvm/gemmini/template_project/src/Makefile similarity index 100% rename from apps/microtvm/gemmini/template_project/src/Makefile.template rename to apps/microtvm/gemmini/template_project/src/Makefile diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag b/apps/microtvm/gemmini/template_project/src/Makefrag.mk similarity index 100% rename from apps/microtvm/gemmini/template_project/src/Makefrag rename to apps/microtvm/gemmini/template_project/src/Makefrag.mk diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake index 0d224c74ea75..757a99217510 100644 --- a/cmake/modules/contrib/Gemmini.cmake +++ b/cmake/modules/contrib/Gemmini.cmake @@ -10,9 +10,9 @@ if(USE_GEMMINI) # Dense example project generation "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dense_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dense_example" 
"3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dense_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dense_example/include" @@ -20,9 +20,9 @@ if(USE_GEMMINI) # CONV2D example project generation "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/conv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/conv2d_example/include" @@ -30,9 +30,9 @@ if(USE_GEMMINI) # DW CONV2D example project generation "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dwconv2d_example/include" @@ -40,9 +40,9 @@ if(USE_GEMMINI) # ADD example project generation "apps/microtvm/gemmini/template_project/src add.c -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/add_example/include" @@ -50,9 +50,9 @@ if(USE_GEMMINI) # Max pooling 2d example project generation "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> 
gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/maxpool2d_example/include" @@ -60,9 +60,9 @@ if(USE_GEMMINI) # Mobilenet example project generation "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/mobilenet_example/include" From be77ea18173230fb042832b1407c9c3bd8d83202 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:59:48 +0100 Subject: [PATCH 006/286] LINT fix --- src/tir/transforms/inject_gemmini_pointer_correction.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc index 54f5692fee2c..4a9260ff014c 100644 --- a/src/tir/transforms/inject_gemmini_pointer_correction.cc +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -82,7 +82,6 @@ class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprM auto scope = ptr_type->storage_scope; auto info = GetMemoryInfo(scope); ICHECK(info.defined()) << "Cannot find memory info of " << scope; - DataType dtype = Downcast(ptr_type->element_type)->dtype; int div = dim_; From ad0318e9f5463cb64ea37f20b26778d6f53941f3 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 12:59:24 +0100 Subject: [PATCH 007/286] Lint fix --- python/tvm/tir/transform/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 82b162ef7df0..a72390997420 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -222,7 +222,7 @@ def CorrectGemminisScratchpadAndAccumulatorPointers(): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() + return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() # type: ignore def StorageRewrite(): From b068ec0eefbeb8398c1ba1d9768d9cd8cbac46ee Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 13:26:47 +0100 Subject: [PATCH 008/286] LINT fix --- apps/microtvm/gemmini/template_project/src/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index 9368836a8802..8849236926b0 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -57,4 +57,3 @@ run-baremetal: $(runs_baremetal) $(RUNNER)$(abs_top_srcdir)/build/$^ junk += $(tests_baremetal) - From 35ab5177831d791cc25e504c36a72211d512a278 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 13:50:26 +0100 Subject: 
[PATCH 009/286] pylint fixes --- python/tvm/contrib/gemmini/__init__.py | 1 - python/tvm/contrib/gemmini/build_module.py | 2 +- python/tvm/contrib/gemmini/environment.py | 4 -- python/tvm/contrib/gemmini/helpers.py | 14 +---- python/tvm/contrib/gemmini/intrin.py | 20 +++---- python/tvm/contrib/gemmini/legalize.py | 11 ---- python/tvm/contrib/gemmini/pattern_table.py | 7 +-- python/tvm/contrib/gemmini/transform.py | 58 +++++++++------------ 8 files changed, 38 insertions(+), 79 deletions(-) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py index 9515769fd641..73c2ce6bfcf1 100644 --- a/python/tvm/contrib/gemmini/__init__.py +++ b/python/tvm/contrib/gemmini/__init__.py @@ -20,7 +20,6 @@ **Author**: `Federico Peccia `_ """ -import sys import tvm._ffi.base from .environment import Environment diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index a094147b7a14..8ef934b02ab3 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -190,7 +190,7 @@ def mem_info_acc_buffer(): Returns: node: The corresponding MemoryInfo node """ - spec = Environment.instance() + Environment.instance() return tvm.ir.make_node( "MemoryInfo", unit_bits=env.inp_bits, diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 7d6350d1ebb9..ac98b2c2e738 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -31,10 +31,6 @@ add_mvout_tensorize, ) import re -from pydevicetree import Devicetree -import os -import tvm -import sys from typing import List, Tuple, Dict, Callable from .utils import counters diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 84c028b3d33c..0d84e3039ffe 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -23,20 +23,8 @@ import numpy as np import pathlib from .environment import Environment - -import abc -import collections -import matplotlib -import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top -import PIL.Image as Image -import PIL.ImageColor as ImageColor -import PIL.ImageDraw as ImageDraw -import PIL.ImageFont as ImageFont -import six from six.moves import range -from six.moves import zip -import tensorflow.compat.v1 as tf -from typing import List, Tuple +from typing import List env = Environment.instance() diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 0909e58a890d..51a0fa7a643e 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -399,13 +399,13 @@ def conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - OC = wgt.shape[3] + wgt.shape[3] KH = wgt.shape[0] KW = wgt.shape[1] - N = inp.shape[0] - IH = inp.shape[1] - IW = inp.shape[2] + inp.shape[0] + inp.shape[1] + inp.shape[2] IC = inp.shape[3] ric = te.reduce_axis((0, IC), name="ric") @@ -571,14 +571,14 @@ def dw_conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - OC = wgt.shape[0] + wgt.shape[0] KH = wgt.shape[1] KW = wgt.shape[2] - N = inp.shape[0] - IH = inp.shape[1] - IW = inp.shape[2] - IC = inp.shape[3] + inp.shape[0] + inp.shape[1] + inp.shape[2] + inp.shape[3] rkh = te.reduce_axis((0, 
KH), name="rkh") rkw = te.reduce_axis((0, KW), name="rkw") @@ -751,7 +751,7 @@ def add_tensorize(env, oshape: Tuple[int, ...]): def intrin_func(ins, outs): """Add intrinsic function""" difm1, difm2 = ins - dout = outs[0] + outs[0] def _body(): irb = tvm.tir.ir_builder.create() diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index 6f279bb512b3..083268d9c469 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ -20,23 +20,12 @@ **Author**: `Federico Peccia `_ """ -from typing import List, Type, Callable -import math - -import numpy as np # type: ignore - import tvm # type: ignore -from tvm import te from tvm import relay from tvm import ir from tvm.relay.dataflow_pattern import DFPatternCallback # type: ignore from tvm.relay.dataflow_pattern import wildcard -from tvm.relay.dataflow_pattern import is_op from tvm.relay.dataflow_pattern import rewrite -from tvm.relay.dataflow_pattern import CallPattern -from tvm.relay.frontend.common import infer_shape as _infer_shape -from tvm.relay.frontend.common import infer_type as _infer_type -from tvm.relay.expr_functor import ExprMutator, ExprVisitor from tvm.relay.op import _make # type: ignore diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index a43f10699c75..9faecbe49d07 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -20,19 +20,15 @@ **Author**: `Federico Peccia `_ """ -from typing import Dict, List, Tuple, Callable, Optional +from typing import Callable, List, Tuple import tvm # type: ignore from tvm import relay -from tvm.target import Target -from tvm.relay.build_module import bind_params_by_name # type: ignore from tvm.relay.op.contrib.register import register_pattern_table # type: ignore from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op from .utils import * -from tvm.topi.utils import const_vector, get_const_int, get_const_float from tvm.relay.frontend.common import infer_shape as _infer_shape -from tvm.relay.frontend.common import infer_type as _infer_type from .environment import Environment @@ -89,7 +85,6 @@ class AddParams: def __init__(self, func_body: tvm.relay.Function): if str(func_body.op) in self.activation_map.keys(): - activation = func_body add_op = func_body.args[0] else: add_op = func_body diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 312217cc8210..22146175ce5a 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -21,13 +21,7 @@ **Author**: `Federico Peccia `_ """ -from numpy import isin import tvm -from tvm import te -from tvm.topi import utils -import numpy as np -from copy import deepcopy -import itertools import ast from tvm.tir.ir_builder import IRBuilder from typing import Dict @@ -273,8 +267,6 @@ def _ftransform(f, mod, ctx): def InsertGemminiFenceOperator(): """Pass to generate the call to the fence instruction at the end of the operator""" - func_name = "" - def _do_fold(stmt): if _match_pragma(stmt, "gemm_end"): irb = tvm.tir.ir_builder.create() @@ -285,7 +277,7 @@ def _do_fold(stmt): return None def _ftransform(f, mod, ctx): - func_name = f.attrs["global_symbol"] + f.attrs["global_symbol"] return f.with_body( tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) ) @@ -303,8 +295,8 @@ def InjectAMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv 
- idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -347,8 +339,8 @@ def InjectAMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -390,8 +382,8 @@ def InjectBMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -433,8 +425,8 @@ def InjectBMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -475,8 +467,8 @@ def InjectDMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -518,8 +510,8 @@ def InjectDMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -561,8 +553,8 @@ def InjectCMVOUTIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -607,8 +599,8 @@ def InjectCMVOUTIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -654,8 +646,8 @@ def InjectCMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -697,8 +689,8 @@ def InjectCMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -740,8 +732,8 @@ def InjectCMVINAccumIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -782,8 +774,8 @@ def InjectCMVINAccumIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... 
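For context before the next patch: each of the `Inject*Intrin` passes above is a thin wrapper around `tvm.tir.transform.InjectCopyIntrin`, which looks for a copy loop tagged with the matching pragma key (`env.A_mvin`, `env.B_mvin`, `env.C_mvout`, ...) and hands the source and destination buffers to the `_inject_copy` callback, which emits the corresponding Gemmini move-in or move-out call. The following is a minimal sketch of how a schedule is expected to request such a DMA transfer; it is illustrative only (the buffer names and the 16x16 shape are made up), with just the dtype, scope, and pragma key taken from the Environment introduced in this series:

    import tvm
    from tvm import te
    from tvm.contrib.gemmini import Environment

    env = Environment.instance()

    # Hypothetical 16x16 int8 tile that should be staged into Gemmini's scratchpad.
    data = te.placeholder((16, 16), dtype=env.inp_dtype, name="data")
    data_scr = te.compute((16, 16), lambda i, j: data[i, j], name="data_scr")
    out = te.compute((16, 16), lambda i, j: data_scr[i, j], name="out")

    sch = te.create_schedule(out.op)
    sch[data_scr].set_scope(env.scr_scope)  # buffer lives in local.scratchpad
    # Tag the copy loop so that InjectAMVINIntrin() can rewrite it into a mvin.
    sch[data_scr].pragma(sch[data_scr].op.axis[0], env.A_mvin)

Lowering with the build_config/lower helpers from build_module.py then runs these injection passes, so the tagged loop nest reaches codegen as a single Gemmini DMA call instead of scalar loads and stores.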
From 1996073e68cabfe043cbf75c58c3ce7e289af5c7 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 14:25:41 +0100 Subject: [PATCH 010/286] more lint fixes --- python/tvm/relay/backend/contrib/gemmini/gemmini_add.py | 5 +---- .../relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py | 8 ++------ python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py | 5 ++--- .../relay/backend/contrib/gemmini/gemmini_dense_cisc.py | 6 +----- .../contrib/gemmini/gemmini_depthwise_conv2d_cisc.py | 7 +------ .../relay/backend/contrib/gemmini/gemmini_max_pool2d.py | 5 ----- 6 files changed, 7 insertions(+), 29 deletions(-) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index 9f7837c076e5..0be4afebbb9e 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -26,13 +26,10 @@ from tvm import te from tvm import autotvm from tvm import topi -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower from tvm.contrib.gemmini.helpers import get_greater_div -import json env = Environment.instance() @@ -131,7 +128,7 @@ def schedule_add( ifm1, ifm2_op = add_stage.op.input_tensors ifm2, ofm_offset_op = ifm2_op.op.input_tensors - ofm_offset = ofm_offset_op.op.input_tensors[0] + ofm_offset_op.op.input_tensors[0] b, x, y, c = sch[add_stage].op.axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index 6d129a0e8b0f..fdb9213aeb4a 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -28,9 +28,6 @@ from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() @@ -104,7 +101,8 @@ def conv2d_cisc( oshape = (N, OH, OW, OC) if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): - # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions @@ -204,8 +202,6 @@ def _traverse(op): else: pad_data = data - orig_kernel = kernel - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index 03051f193638..d37e1922027d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -26,10 +26,9 @@ from tvm import te from tvm import autotvm from tvm import topi -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower from tvm.contrib.gemmini.helpers import 
get_greater_div env = Environment.instance() @@ -114,7 +113,7 @@ def schedule_gemm( sch = te.create_schedule([x.op for x in outs]) data, weight, bias_op = dense_stage.op.input_tensors - bias = bias_op.op.input_tensors[0] + bias_op.op.input_tensors[0] ##### space definition begin ##### x, y = sch[dense_stage].op.axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 0144563940f9..09097a003ce2 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -21,18 +21,14 @@ **Author**: `Federico Peccia `_ """ -import math -import sys import numpy as np import tvm from tvm import te from tvm import autotvm from tvm import topi -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity +from tvm.autotvm.task.space import OtherOptionEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.contrib.gemmini.intrin import gemm_cisc env = Environment.instance() diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index c67767f783c2..eedbc6b052b0 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -28,9 +28,6 @@ from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() @@ -80,7 +77,7 @@ def depthwise_conv2d_cisc( N = orig_data.shape[0] IH = orig_data.shape[1] IW = orig_data.shape[2] - IC = orig_data.shape[3] + orig_data.shape[3] HSTR = strides[0] WSTR = strides[1] @@ -191,8 +188,6 @@ def _traverse(op): else: pad_data = data - orig_kernel = kernel - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis rkh, rkw = sch[conv2d_stage].op.reduce_axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 7d922ddd2db4..292743eff78c 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -21,16 +21,11 @@ **Author**: `Federico Peccia `_ """ -import numpy as np import tvm from tvm import te from tvm import autotvm -from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() From d02968a34d9e0bf8a8fb7678ebf09955ba6342fc Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 08:43:35 +0100 Subject: [PATCH 011/286] Small makefile addition to enable use of math.h functions --- apps/microtvm/gemmini/template_project/src/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index 8849236926b0..b8da778d7eec 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -25,7 +25,6 @@ CFLAGS := $(CFLAGS) \ 
-fno-common \ -fno-builtin-printf \ -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ -lgcc \ -I${RISCV_TESTS} \ -I${RISCV_TESTS}/env \ @@ -37,7 +36,6 @@ CFLAGS := $(CFLAGS) \ CFLAGS_BAREMETAL := \ $(CFLAGS) \ - -nostdlib \ -nostartfiles \ -static \ -T $(BENCH_COMMON)/test.ld \ @@ -49,7 +47,7 @@ vpath %.c $(src_dir) %-baremetal: %.c $(GEMMINI_HEADERS) $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -lm run-baremetal: $(runs_baremetal) From 46c900bad8390a9e1b21670c3b2da8e955f1f403 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 08:56:50 +0100 Subject: [PATCH 012/286] moved gemmini how tos to tutorials --- gallery/how_to/work_with_microtvm/micro_gemmini/README.txt | 5 ----- .../micro_gemmini => tutorial}/micro_gemmini_add.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_conv2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_dense.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_dwconv2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_maxpool2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_mobilenet.py | 0 7 files changed, 5 deletions(-) delete mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/README.txt rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_add.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_conv2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_dense.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_dwconv2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_maxpool2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_mobilenet.py (100%) diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt deleted file mode 100644 index 6826cc7ab810..000000000000 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -.. _tutorial-micro-gemmini: - -Generate code for the Gemmini accelerator using microTVM ------------------- -These how-tos demonstrate how to deploy models for the Gemmini accelerator using microTVM. 
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py rename to gallery/tutorial/micro_gemmini_add.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py rename to gallery/tutorial/micro_gemmini_conv2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py rename to gallery/tutorial/micro_gemmini_dense.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py rename to gallery/tutorial/micro_gemmini_dwconv2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py rename to gallery/tutorial/micro_gemmini_maxpool2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py rename to gallery/tutorial/micro_gemmini_mobilenet.py From b389fad68b9631efafa490161312c84baf4ee5c0 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 13:01:31 +0100 Subject: [PATCH 013/286] Fix docs --- gallery/tutorial/micro_gemmini_add.py | 4 ++-- gallery/tutorial/micro_gemmini_conv2d.py | 4 ++-- gallery/tutorial/micro_gemmini_dense.py | 4 ++-- gallery/tutorial/micro_gemmini_dwconv2d.py | 4 ++-- gallery/tutorial/micro_gemmini_maxpool2d.py | 4 ++-- gallery/tutorial/micro_gemmini_mobilenet.py | 4 ++-- python/tvm/tir/transform/transform.py | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gallery/tutorial/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py index b8521c4b6ae2..c90344aa75f0 100644 --- a/gallery/tutorial/micro_gemmini_add.py +++ b/gallery/tutorial/micro_gemmini_add.py @@ -202,9 +202,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
diff --git a/gallery/tutorial/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py index b58881162dcc..14ac6933be98 100644 --- a/gallery/tutorial/micro_gemmini_conv2d.py +++ b/gallery/tutorial/micro_gemmini_conv2d.py @@ -185,9 +185,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py index c9a7caffc71b..22419ad22276 100644 --- a/gallery/tutorial/micro_gemmini_dense.py +++ b/gallery/tutorial/micro_gemmini_dense.py @@ -176,9 +176,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py index 14c39898278e..6030d14ea024 100644 --- a/gallery/tutorial/micro_gemmini_dwconv2d.py +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -175,9 +175,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
diff --git a/gallery/tutorial/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py index 6dbb11695ac2..39f84f88fba5 100644 --- a/gallery/tutorial/micro_gemmini_maxpool2d.py +++ b/gallery/tutorial/micro_gemmini_maxpool2d.py @@ -175,9 +175,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py index fdb43096c87d..ca3690fbdb33 100644 --- a/gallery/tutorial/micro_gemmini_mobilenet.py +++ b/gallery/tutorial/micro_gemmini_mobilenet.py @@ -230,9 +230,9 @@ def generate_mobilenet_tflite_model(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
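All of the tutorials touched above share the same final build step. As a reference while reading the hunks, here is a minimal, self-contained sketch of the definitions around it; the executor, runtime, and target values are assumptions for illustration (each tutorial defines its own), and the stand-in module replaces the TFLite import that the tutorials actually perform. Only the `build_config` arguments are taken verbatim from the code above:

    import tvm
    from tvm import relay
    from tvm.relay.backend import Executor, Runtime
    from tvm.contrib import gemmini

    # Stand-in Relay module; the tutorials obtain mod/params from the TFLite frontend.
    x = relay.var("x", shape=(1, 16), dtype="int8")
    mod = tvm.IRModule.from_expr(relay.Function([x], x))
    params = {}

    # Assumed AOT / C-runtime settings for a microTVM-style build.
    EXECUTOR = Executor("aot", {"interface-api": "c", "unpacked-api": True})
    RUNTIME = Runtime("crt")
    TARGET = tvm.target.target.micro("host")

    with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
        module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)

The `usmp_alg="hill_climb"` argument selects an algorithm of TVM's unified static memory planner, so intermediate tensors are placed at compile time rather than through a runtime allocator.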
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index a72390997420..0040f0ae5897 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -217,7 +217,7 @@ def InjectRollingBuffer(): def CorrectGemminisScratchpadAndAccumulatorPointers(): """Corrects the pointer addresses of buffers inside Gemmini's scratchpad and accumulator - Returns: + Returns ------- fpass : tvm.transform.Pass The result pass From b5fb7c4e076dc777f16dd9d80a99f2b023de18cc Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 14:54:05 +0100 Subject: [PATCH 014/286] lint changes --- python/tvm/contrib/gemmini/__init__.py | 2 +- python/tvm/contrib/gemmini/build_module.py | 36 +++-- python/tvm/contrib/gemmini/environment.py | 46 +++--- python/tvm/contrib/gemmini/helpers.py | 46 +++--- python/tvm/contrib/gemmini/intrin.py | 140 +++++++++--------- python/tvm/contrib/gemmini/legalize.py | 35 ++--- python/tvm/contrib/gemmini/pattern_table.py | 14 +- python/tvm/contrib/gemmini/transform.py | 58 ++++---- .../backend/contrib/gemmini/gemmini_add.py | 42 +++--- .../contrib/gemmini/gemmini_conv2d_cisc.py | 71 +++++---- .../backend/contrib/gemmini/gemmini_dense.py | 96 ++++++------ .../contrib/gemmini/gemmini_dense_cisc.py | 14 +- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 67 +++++---- .../contrib/gemmini/gemmini_max_pool2d.py | 4 +- .../tvm/relay/backend/contrib/gemmini/op.py | 12 +- 15 files changed, 339 insertions(+), 344 deletions(-) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py index 73c2ce6bfcf1..02d10645e2a3 100644 --- a/python/tvm/contrib/gemmini/__init__.py +++ b/python/tvm/contrib/gemmini/__init__.py @@ -22,9 +22,9 @@ import tvm._ffi.base +from tvm.relay.backend.contrib.gemmini import * from .environment import Environment from .build_module import build_config, lower, build, preprocess_pass -from tvm.relay.backend.contrib.gemmini import * from .helpers import create_header_file from .utils import * diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index 8ef934b02ab3..fc72a6b03af8 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -21,10 +21,24 @@ """ import tvm - -from .environment import Environment -from .transform import * from tvm import relay +from .environment import Environment +from .transform import ( + InjectAMVINIntrin, + InjectAMVINIntrinTransposed, + InjectBMVINIntrin, + InjectBMVINIntrinTransposed, + InjectCMVOUTIntrin, + InjectCMVOUTIntrinTransposed, + InjectDMVINIntrin, + InjectDMVINIntrinTransposed, + InjectCMVINIntrin, + InjectCMVINIntrinTransposed, + InjectCMVINAccumIntrin, + InjectCMVINAccumIntrinTransposed, + InsertGemminiHeaderOperators, + InsertGemminiFenceOperator, +) from .legalize import LegalizeGemmini @@ -145,7 +159,7 @@ def build(*args, **kwargs): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_scope}") def mem_info_inp_buffer(): """Creates the information about the local.scratchpad memory node @@ -164,7 +178,7 @@ def mem_info_inp_buffer(): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_wgt_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_wgt_scope}") def mem_info_wgt_buffer(): """Creates the information about the 
local.scratchpad_weight memory node @@ -183,7 +197,7 @@ def mem_info_wgt_buffer(): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().acc_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().acc_scope}") def mem_info_acc_buffer(): """Creates the information about the local.accumulator memory node @@ -193,9 +207,13 @@ def mem_info_acc_buffer(): Environment.instance() return tvm.ir.make_node( "MemoryInfo", - unit_bits=env.inp_bits, - max_simd_bits=env.DIM, - max_num_bits=int(env.ACC_ROWS * env.DIM * env.inp_bits), + unit_bits=Environment.instance().inp_bits, + max_simd_bits=Environment.instance().DIM, + max_num_bits=int( + Environment.instance().ACC_ROWS + * Environment.instance().DIM + * Environment.instance().inp_bits + ), # head_address=tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32"), head_address=None, ) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index ac98b2c2e738..56e8e61b646d 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -22,6 +22,8 @@ """ from __future__ import absolute_import as _abs +import re +from typing import List, Tuple, Dict, Callable from .intrin import ( gemm, gemm_cisc, @@ -30,8 +32,6 @@ add_tensorize, add_mvout_tensorize, ) -import re -from typing import List, Tuple, Dict, Callable from .utils import counters @@ -67,17 +67,17 @@ def init_overwrite( Args: batch (int, optional): Batch size. Defaults to 1. dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. - max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64. - inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8". - wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8". - acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32". - acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. - bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. - bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. - debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False. - enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty. - supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False. + max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. + inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini weight scratchpad. Defaults to "int8". + acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. + bank_num (int, optional): Banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. 
+            supports_non_zero_padding (bool, optional): Whether Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Enable experimental pattern matching for qnn.add. Defaults to False.
         """
         inst = Environment.instance()
         inst.init(
@@ -129,17 +129,17 @@ def init(
         Args:
             batch (int, optional): Batch size. Defaults to 1.
             dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32.
-            max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64.
-            inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8".
-            wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8".
-            acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32".
+            max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64.
+            inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8".
+            wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. Defaults to "int8".
+            acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32".
             acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096.
             bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192.
             bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4.
-            debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False.
-            enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty.
-            supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False.
-            use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False.
+            debug (bool, optional): Adds debug of Gemmini counters. Defaults to False.
+            enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty.
+            supports_non_zero_padding (bool, optional): Whether Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Enable experimental pattern matching for qnn.add. Defaults to False.
         """

         assert batch == 1, "Only batch size of 1 is currently supported"
@@ -190,7 +190,9 @@ def init(
         self.scr_scope = "local.scratchpad"
         self.acc_scope = "local.accumulator"

-        # TODO (FP): check this scratchpad_weight. Actually, only one scratchpad should exist, but we do this logical partition to correctly manage the pointers to the buffers stored in this memories. Should see how we can fix this in the future.
+        # Actually, only one scratchpad should exist.
+        # But we do this logical partition to correctly manage the pointers to the buffers stored in these memories.
+        # We should revisit how to fix this in the future.
self.scr_wgt_scope = "local.scratchpad_weight" self.A_mvin = "A_mvin" diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 0d84e3039ffe..df3a9bfe9bce 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -20,14 +20,14 @@ **Author**: `Federico Peccia `_ """ -import numpy as np import pathlib -from .environment import Environment -from six.moves import range from typing import List +import numpy as np +from six.moves import range +from .environment import Environment -env = Environment.instance() +ENV = Environment.instance() def create_header_file( @@ -56,42 +56,42 @@ def create_header_file( raw_source_path = file_path.with_suffix(".c").resolve() if tensor_data.dtype == np.float32: - type = "float" + datatype = "float" align = 32 elif tensor_data.dtype == np.int8: - type = "int8_t" + datatype = "int8_t" align = 16 elif tensor_data.dtype == np.uint8: - type = "uint8_t" + datatype = "uint8_t" align = 16 elif tensor_data.dtype == np.uint32: - type = "uint32_t" + datatype = "uint32_t" align = 16 else: - assert False, "Type %s is not supported!" % tensor_data.dtype + assert False, f"Type {tensor_data.dtype} is not supported!" with open(raw_header_path, "a+") as header_file: header_file.write( f"#define {tensor_name}_len {tensor_data.size}\n" - + f"extern {type} {tensor_name}[{tensor_name}_len];\n" + + f"extern {datatype} {tensor_name}[{tensor_name}_len];\n" ) if not raw_source_path.is_file(): with open(raw_source_path, "a+") as source_file: - source_file.write(f"#include \n") + source_file.write("#include \n") with open(raw_source_path, "a+") as source_file: source_file.write( - f'{type} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' + f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' if section - else f"{type} {tensor_name}[] __attribute__((aligned({align}))) = {{" + else f"{datatype} {tensor_name}[] __attribute__((aligned({align}))) = {{" ) data_hexstr = tensor_data.tobytes().hex() flatten = tensor_data.flatten() - if tensor_data.dtype == np.float32 or tensor_data.dtype == np.uint32: - for i in range(0, len(flatten)): - source_file.write(f"{flatten[i]},") + if tensor_data.dtype in (np.float32, np.uint32): + for element in flatten: + source_file.write(f"{element},") source_file.write("};\n\n") else: for i in range(0, len(data_hexstr), 2): @@ -110,20 +110,20 @@ def create_header_file( if debug: source_file.write("/*\n") for n in range(tensor_data.shape[0]): - for ch in range(tensor_data.shape[3]): - source_file.write("Channel %i:\n" % ch) + for i_ch in range(tensor_data.shape[3]): + source_file.write(f"Channel {i_ch}:\n") for row in range(tensor_data.shape[1]): for col in range(tensor_data.shape[2]): - source_file.write(f"{tensor_data[n][row][col][ch]}\t") + source_file.write(f"{tensor_data[n][row][col][i_ch]}\t") source_file.write("\n") source_file.write("*/\n") if weights is not None: source_file.write("/*\n") for o_ch in range(weights.shape[3]): - source_file.write("Output channel %i:\n" % o_ch) + source_file.write(f"Output channel {o_ch}:\n") for i_ch in range(weights.shape[2]): - source_file.write("Input channel %i:\n" % i_ch) + source_file.write(f"Input channel {i_ch}:\n") for row in range(weights.shape[0]): for col in range(weights.shape[1]): source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") @@ -158,14 +158,14 @@ def get_greater_div(x, limit: int = None): int: Greater divisor """ - limit = env.DIM if limit == None else limit 
+ limit = ENV.DIM if limit is None else limit if isinstance(x, int): elements = [x] elif isinstance(x, list): elements = x else: - assert False, "type of x not supported!" + assert False, "datatype of x not supported!" divisors = [] for element in elements: diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 51a0fa7a643e..d8809726555a 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -22,16 +22,16 @@ from __future__ import absolute_import as _abs +from typing import List, Tuple import tvm from tvm import te -from typing import List, Tuple def gemm( env, - I: int, - K: int, - J: int, + dim_i: int, + dim_k: int, + dim_j: int, stride: int = 1, is_depthwise_conv2d: bool = True, mode: int = 1, @@ -41,9 +41,9 @@ def gemm( Args: env (Environment): Environment with configurations - I (int): output first axis dimension - K (int): reduction axis dimension - J (int): output second axis dimension + dim_i (int): output first axis dimension + dim_k (int): reduction axis dimension + dim_j (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. @@ -53,13 +53,13 @@ def gemm( TensorIntrin: gemm tensor intrinsic """ - # TODO (FP): add assertions here for I, K and J? + # TODO (FP): add assertions here for dim_i, dim_k and dim_j? - wgt_shape = (K, J) + wgt_shape = (dim_k, dim_j) - inp_shape = (I, K) + inp_shape = (dim_i, dim_k) - out_shape = (I, J) + out_shape = (dim_i, dim_j) wgt = te.placeholder(wgt_shape, dtype=env.wgt_dtype, name=env.scr_wgt_scope) inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) @@ -125,7 +125,7 @@ def gemm( def intrin_func(ins, outs): """Matrix-matrix multiply intrinsic function""" - dinp, dwgt, dbias = ins + dinp, dwgt, _ = ins dout = outs[0] inp_base_address = tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint32") @@ -142,47 +142,47 @@ def _body(): inp_access_ptr = dinp.access_ptr("r", "uint32") - A_access_ptr = inp_base_address + inp_access_ptr - BD_access_ptr = ( + a_access_ptr = inp_base_address + inp_access_ptr + bd_access_ptr = ( wgt_base_address + wgt_access_ptr if mode == env.WEIGHT_STATIONARY else garbage ) - C_access_ptr = out_base_address + out_access_ptr - DB_access_ptr = ( + c_access_ptr = out_base_address + out_access_ptr + db_access_ptr = ( garbage if mode == env.WEIGHT_STATIONARY else wgt_base_address + wgt_access_ptr ) - A_cols = dinp.shape[1] - A_rows = dinp.shape[0] - BD_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] - BD_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] - C_cols = dout.shape[1] - C_rows = dout.shape[0] - DB_cols = C_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] - DB_rows = C_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] + a_cols = dinp.shape[1] + a_rows = dinp.shape[0] + bd_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] + bd_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] + c_cols = dout.shape[1] + c_rows = dout.shape[0] + db_cols = c_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] + db_rows = c_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] with irb.if_scope(accum_patch == 0): irb.emit( tvm.tir.call_extern( "", "gemmini_extended_preload", - BD_access_ptr, - 
C_access_ptr, - BD_cols, - BD_rows, - C_cols, - C_rows, + bd_access_ptr, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, ) ) irb.emit( tvm.tir.call_extern( "", "gemmini_extended_compute_preloaded", - A_access_ptr, - DB_access_ptr, - A_cols, - A_rows, - DB_cols, - DB_rows, + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, ) ) with irb.else_scope(): @@ -191,23 +191,23 @@ def _body(): "", "gemmini_extended_preload", garbage, - C_access_ptr, - BD_cols, - BD_rows, - C_cols, - C_rows, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, ) ) irb.emit( tvm.tir.call_extern( "", "gemmini_extended_compute_accumulated", - A_access_ptr, - DB_access_ptr, - A_cols, - A_rows, - DB_cols, - DB_rows, + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, ) ) return irb.get() @@ -258,20 +258,20 @@ def gemm_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - K = wgt.shape[0] - J = wgt.shape[1] - I = inp.shape[0] + dim_k = wgt.shape[0] + dim_j = wgt.shape[1] + dim_i = inp.shape[0] - k_ = te.reduce_axis((0, K), name="K") + k_reduce = te.reduce_axis((0, dim_k), name="dim_k") - output_shape = (I, J) + output_shape = (dim_i, dim_j) out = te.compute( output_shape, lambda x_, y_: te.sum( - inp[x_, k_].astype(env.inp_dtype) * wgt[k_, y_].astype(env.inp_dtype) + inp[x_, k_reduce].astype(env.inp_dtype) * wgt[k_reduce, y_].astype(env.inp_dtype) + bias[y_].astype(env.inp_dtype), - axis=[k_], + axis=[k_reduce], ), ) @@ -400,25 +400,25 @@ def conv2d_cisc( bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) wgt.shape[3] - KH = wgt.shape[0] - KW = wgt.shape[1] + k_h = wgt.shape[0] + k_w = wgt.shape[1] inp.shape[0] inp.shape[1] inp.shape[2] - IC = inp.shape[3] + i_c = inp.shape[3] - ric = te.reduce_axis((0, IC), name="ric") - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - HSTR = strides[0] - WSTR = strides[1] + hstr = strides[0] + wstr = strides[1] out = te.compute( out_shape, lambda b_o, i, j, c_o: te.sum( - inp[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + inp[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(env.inp_dtype) * wgt[rkh, rkw, ric, c_o].astype(env.inp_dtype) + bias[c_o].astype(env.inp_dtype), axis=[rkh, rkw, ric], @@ -572,24 +572,24 @@ def dw_conv2d_cisc( bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) wgt.shape[0] - KH = wgt.shape[1] - KW = wgt.shape[2] + k_h = wgt.shape[1] + k_w = wgt.shape[2] inp.shape[0] inp.shape[1] inp.shape[2] inp.shape[3] - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - HSTR = strides[0] - WSTR = strides[1] + hstr = strides[0] + wstr = strides[1] out = te.compute( out_shape, lambda b_o, i, j, c_o: te.sum( - inp[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) + inp[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(env.inp_dtype) * wgt[c_o, rkh, rkw].astype(env.inp_dtype) + bias[c_o].astype(env.inp_dtype), axis=[rkh, rkw], diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index 083268d9c469..f924f1dfe716 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ 
-20,6 +20,7 @@ **Author**: `Federico Peccia `_ """ +from typing import Tuple import tvm # type: ignore from tvm import relay from tvm import ir @@ -29,11 +30,7 @@ from tvm.relay.op import _make # type: ignore -from .pattern_table import * # type: ignore - -from .environment import Environment - -env = Environment.instance() +from .pattern_table import AddParams, CONV2DParams, GEMMParams, MaxPoolParams # type: ignore def gemmini_gemm( @@ -464,9 +461,7 @@ def callback( class LegalizeAdd: """This is the pass that wraps the AddRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(AddRewriter(), func) mod.update_func(global_var, func) @@ -480,9 +475,7 @@ def __call__(self, *args, **kwargs): class LegalizeMaxPool2D: """This is the pass that wraps the MAXPOOL2DRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(MAXPOOL2DRewriter(), func) mod.update_func(global_var, func) @@ -496,9 +489,7 @@ def __call__(self, *args, **kwargs): class LegalizeGEMM: """This is the pass that wraps the GEMMRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(GEMMRewriter(), func) mod.update_func(global_var, func) @@ -512,9 +503,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2D: """This is the pass that wraps the CONV2DRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DRewriter(), func) mod.update_func(global_var, func) @@ -528,9 +517,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2DExternalPad: """This is the pass that wraps the CONV2DExternalPadRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DExternalPadRewriter(), func) mod.update_func(global_var, func) @@ -544,9 +531,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2DExternalPadAndRelu6: """This is the pass that wraps the CONV2DExternalPadAndRelu6Rewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DExternalPadAndRelu6Rewriter(), func) mod.update_func(global_var, func) @@ -563,9 +548,7 @@ class LegalizeGemmini: operations. """ - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: """This is the method that replaces the operations with hardware/codegen supported operations. 
""" diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 9faecbe49d07..46e29ad6ffa6 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -26,13 +26,12 @@ from tvm import relay from tvm.relay.op.contrib.register import register_pattern_table # type: ignore from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op -from .utils import * - from tvm.relay.frontend.common import infer_shape as _infer_shape +from .utils import QDenseArgs, RequantArgs, BinaryElementwiseArgs, QConv2DArgs from .environment import Environment -env = Environment.instance() +ENV = Environment.instance() class GEMMParams: @@ -84,7 +83,7 @@ class AddParams: activation_map = {"clip": "CLIP"} def __init__(self, func_body: tvm.relay.Function): - if str(func_body.op) in self.activation_map.keys(): + if str(func_body.op) in self.activation_map: add_op = func_body.args[0] else: add_op = func_body @@ -421,6 +420,11 @@ def make_maxpool_pattern() -> tvm.relay.dataflow_pattern.DFPattern: @register_pattern_table("gemmini") def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + """Declares Gemminis pattern table + + Returns: + List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: List of pattern, callable tuples + """ pattern_table_filters = [] pattern_table_filters.append( @@ -452,7 +456,7 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal ) ) - if env.use_experimental_qnn_add: + if ENV.use_experimental_qnn_add: pattern_table_filters.append( ( AddParams.composite_name, diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 22146175ce5a..eddd9012ae07 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -21,10 +21,10 @@ **Author**: `Federico Peccia `_ """ -import tvm import ast -from tvm.tir.ir_builder import IRBuilder from typing import Dict +import tvm +from tvm.tir.ir_builder import IRBuilder from .environment import Environment @@ -40,12 +40,12 @@ def _get_counters(irb: IRBuilder): irb.emit(tvm.tir.call_extern("", "counter_snapshot_take")) irb.emit(tvm.tir.call_extern("", "printf", "Counter values:\\r\\n")) counter_vars = [] - for i, (key, value) in enumerate(env.enabled_counters.items()): + for i, (_, value) in enumerate(env.enabled_counters.items()): counter_var = irb.let( value.lower() + "_var", tvm.tir.call_extern("uint32", "counter_read", i) ) counter_vars.append(counter_var) - irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%s," % value))) + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm(f"{value},"))) irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) for c in counter_vars: irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%lu,"), c)) @@ -58,7 +58,7 @@ def _configure_timers(irb: IRBuilder): Args: irb (IRBuilder): IRBuilder """ - for i, (key, value) in enumerate(env.enabled_counters.items()): + for i, (key, _) in enumerate(env.enabled_counters.items()): irb.emit(tvm.tir.call_extern("", "counter_configure", i, key)) @@ -303,7 +303,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("A mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -326,7 +326,7 @@ def _inject_copy(src, dst, pad_before, 
pad_after, pad_value): return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.A_mvin, _inject_copy) @@ -347,7 +347,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("A mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() # TODO (FP): check this pointers types again! @@ -369,7 +369,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.A_mvin + "_t", _inject_copy) @@ -391,7 +391,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "int32") if dst.scope() == "global": raise RuntimeError("B mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -412,7 +412,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.B_mvin, _inject_copy) @@ -433,7 +433,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("B mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -454,7 +454,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.B_mvin + "_t", _inject_copy) @@ -475,7 +475,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("D mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -497,7 +497,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.D_mvin, _inject_copy) @@ -518,7 +518,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("D mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -540,7 +540,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.D_mvin + 
"_t", _inject_copy) @@ -561,7 +561,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if src.scope() == "global": raise RuntimeError("C mvout should have a local source") - elif dst.scope() == "global": + if dst.scope() == "global": # Store irb = tvm.tir.ir_builder.create() if len(dst.shape) == 1: @@ -586,7 +586,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvout, _inject_copy) @@ -607,7 +607,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if src.scope() == "global": raise RuntimeError("C mvout should have a local source") - elif dst.scope() == "global": + if dst.scope() == "global": # Store irb = tvm.tir.ir_builder.create() # TODO (FP): check this pointers types again! @@ -633,7 +633,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvout + "_t", _inject_copy) @@ -654,7 +654,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -676,7 +676,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin, _inject_copy) @@ -697,7 +697,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -719,7 +719,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin + "_t", _inject_copy) @@ -740,7 +740,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -761,7 +761,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum, _inject_copy) @@ -782,7 +782,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if 
len(src.shape) == 1: @@ -803,6 +803,6 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum + "_t", _inject_copy) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index 0be4afebbb9e..a561a01d6c32 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -31,7 +31,7 @@ from tvm.contrib.gemmini.helpers import get_greater_div -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.add") @@ -67,7 +67,7 @@ def add( # Derive shapes oshape = topi.utils.get_const_tuple(ifm1.shape) - tensor_type = env.inp_dtype + tensor_type = ENV.inp_dtype ofm_offset_stage = te.compute( oshape, @@ -130,12 +130,10 @@ def schedule_add( ifm2, ofm_offset_op = ifm2_op.op.input_tensors ofm_offset_op.op.input_tensors[0] - b, x, y, c = sch[add_stage].op.axis - # Prepare the scope of each buffer - cifm1 = sch.cache_read(ifm1, env.acc_scope, [add_stage]) - sch[ifm2_op].set_scope(env.acc_scope) - sch[ofm_offset_op].set_scope(env.acc_scope) + cifm1 = sch.cache_read(ifm1, ENV.acc_scope, [add_stage]) + sch[ifm2_op].set_scope(ENV.acc_scope) + sch[ofm_offset_op].set_scope(ENV.acc_scope) # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) @@ -150,23 +148,23 @@ def schedule_add( sch[ofm_offset_op].compute_at(sch[add_stage], y_o) # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) - cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=env.DIM) + cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=ENV.DIM) cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( - sch[cifm1].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[cifm1].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[cifm1].reorder(cifm1_ax_0_1, cifm1_ax_1_1, cifm1_ax_0_2, cifm1_ax_1_2) - cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=env.DIM) + cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=ENV.DIM) cifm2_ax_1_1, cifm2_ax_1_2 = sch[ifm2_op].split( - sch[ifm2_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[ifm2_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[ifm2_op].reorder(cifm2_ax_0_1, cifm2_ax_1_1, cifm2_ax_0_2, cifm2_ax_1_2) cofm_offset_ax_0_1, cofm_offset_ax_0_2 = sch[ofm_offset_op].split( - sch[ofm_offset_op].op.axis[2], factor=env.DIM + sch[ofm_offset_op].op.axis[2], factor=ENV.DIM ) cofm_offset_ax_1_1, cofm_offset_ax_1_2 = sch[ofm_offset_op].split( - sch[ofm_offset_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[ofm_offset_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[ofm_offset_op].reorder( cofm_offset_ax_0_1, cofm_offset_ax_1_1, cofm_offset_ax_0_2, cofm_offset_ax_1_2 @@ -175,26 +173,26 @@ def schedule_add( # Set pragmas to insert mvin instructions oshape = (x_factor, y_factor) if x_factor == 1: - sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin + "_t") - sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum + "_t") + 
sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin + "_t") + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum + "_t") else: - sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin) - sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum) + sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin) + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum) # Tensorize - sch[ifm2_op].tensorize(cifm2_ax_0_2, env.add_tensorize(oshape)) - sch[add_stage].tensorize(x_i, env.add_mvout_tensorize(oshape)) + sch[ifm2_op].tensorize(cifm2_ax_0_2, ENV.add_tensorize(oshape)) + sch[add_stage].tensorize(x_i, ENV.add_mvout_tensorize(oshape)) # Create configuration dictionary config_dict = {} config_dict["A_size"] = int(ifm1.shape[3]) config_dict["B_size"] = int(ifm2.shape[3]) config_dict["C_size"] = int(output.shape[3]) - config_dict["A_private_stride"] = env.DIM - config_dict["B_private_stride"] = env.DIM + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM config_dict["execution_stride"] = 1 config_dict["activation"] = 0 - config_dict["mode"] = env.WEIGHT_STATIONARY + config_dict["mode"] = ENV.WEIGHT_STATIONARY config_dict["max_pixels_per_row"] = 1 config_dict["ifm1_scale"] = float(add_stage.op.attrs["ifm1_scale"]) config_dict["ifm2_scale"] = float(add_stage.op.attrs["ifm2_scale"]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index fdb9213aeb4a..f82bea64a51d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -29,7 +29,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.conv2d_cisc") @@ -75,32 +75,32 @@ def conv2d_cisc( orig_data.shape[1] == orig_data.shape[2] ), "GEMMINIs Conv2d CISC schedule only supports square inputs!" 
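# Illustrative sketch (not part of the patch): the mvin/mvout pragma selection
# in the add schedule above. When the tiled output has a single row
# (x_factor == 1), the schedule tags loops with the "_t" variants, which the
# Inject*IntrinTransposed passes earlier in this series lower to transposed
# move instructions; the helper name below is a placeholder.
def select_pragma(base, x_factor):
    # e.g. base = "C_mvin" or "C_mvin_accum", mirroring ENV.C_mvin above
    return base + "_t" if x_factor == 1 else base

assert select_pragma("C_mvin", 1) == "C_mvin_t"
assert select_pragma("C_mvin", 4) == "C_mvin"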
- OC = kernel.shape[3] - KH = kernel.shape[0] - KW = kernel.shape[1] + o_c = kernel.shape[3] + k_h = kernel.shape[0] + k_w = kernel.shape[1] - N = orig_data.shape[0] - IH = orig_data.shape[1] - IW = orig_data.shape[2] - IC = orig_data.shape[3] + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] + i_c = orig_data.shape[3] - HSTR = strides[0] - WSTR = strides[1] - TOP_PAD = padding[0] - LEFT_PAD = padding[1] - BOTTOM_PAD = padding[2] - RIGHT_PAD = padding[3] + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] - OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) - OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) - ric = te.reduce_axis((0, IC), name="ric") - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - oshape = (N, OH, OW, OC) + oshape = (n, o_h, o_w, o_c) - if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): + if len(set(padding)) == 1 and (ENV.supports_non_zero_padding or ifm_offset == 0): # If the padding is the same for all borders, there is no need to use topi.nn.pad, # because Gemminis CISC instructions support equal padding data = orig_data @@ -108,8 +108,8 @@ def conv2d_cisc( # If not, then pad before calling Gemminis functions data = topi.nn.pad( orig_data, - [0, TOP_PAD, LEFT_PAD, 0], - [0, BOTTOM_PAD, RIGHT_PAD, 0], + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], pad_value=ifm_offset, name="pad_data", ) @@ -117,16 +117,16 @@ def conv2d_cisc( res = te.compute( oshape, lambda b_o, i, j, c_o: te.sum( - data[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) - * kernel[rkh, rkw, ric, c_o].astype(env.inp_dtype) - + bias[c_o].astype(env.inp_dtype), + data[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(ENV.inp_dtype) + * kernel[rkh, rkw, ric, c_o].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), axis=[rkh, rkw, ric], ), name="res", tag="conv2d", attrs={ "activation": activation, - "strides": [HSTR, WSTR], + "strides": [hstr, wstr], "padding": padding, "padding_value": ifm_offset, "scale": gemmini_scale, @@ -138,9 +138,9 @@ def conv2d_cisc( ) cfg.add_flop( - np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w * i_c + np.prod(topi.utils.get_const_tuple(oshape)) - * (KH * KW * IC - 1) # Multiplications and additions needed + * (k_h * k_w * i_c - 1) # Multiplications and additions needed + np.prod( # Additions needed topi.utils.get_const_tuple(oshape) ) # Output scaling multiplications @@ -202,28 +202,27 @@ def _traverse(op): else: pad_data = data - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis - rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis + x_bo, _, _, _ = sch[conv2d_stage].op.axis x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) axis_for_start = x_bo_o # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
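# Illustrative sketch (not part of the patch): the padding dispatch used by
# conv2d_cisc above. Gemmini's CISC loops apply one uniform pad value, so the
# compute keeps the data unpadded only when all four borders match and either
# the pad value is zero or the hardware accepts a non-zero pad; otherwise it
# falls back to topi.nn.pad. The helper name is hypothetical.
def needs_software_padding(padding, ifm_offset, supports_non_zero_padding):
    uniform = len(set(padding)) == 1  # [top, left, bottom, right] all equal
    return not (uniform and (supports_non_zero_padding or ifm_offset == 0))

assert not needs_software_padding([1, 1, 1, 1], 0, False)  # hardware pads
assert needs_software_padding([1, 0, 1, 0], 0, True)       # uneven borders
assert needs_software_padding([2, 2, 2, 2], -3, False)     # non-zero pad value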
- padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] - padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 # Apply tensorization sch[conv2d_stage].tensorize( x_bo_i, - env.conv2d_cisc( + ENV.conv2d_cisc( pad_data.shape, kernel.shape, bias.shape, conv2d_stage.shape, conv2d_stage.op.attrs["strides"], - padding_for_C_code, - padding_value_for_C_code, + padding, + padding_value, conv2d_stage.op.attrs["activation"], conv2d_stage.op.attrs["scale"], conv2d_stage.op.attrs["pool_size"], diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d37e1922027d..d52557d8b703 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -31,7 +31,7 @@ from tvm.contrib.gemmini.environment import Environment from tvm.contrib.gemmini.helpers import get_greater_div -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.gemm") @@ -66,7 +66,7 @@ def gemm( bias_stage = te.compute( oshape, - lambda x_o, y_o: bias[y_o].astype(env.inp_dtype), + lambda x_o, y_o: bias[y_o].astype(ENV.inp_dtype), name="bias.local.accumulator", tag="bias_add", ) @@ -74,8 +74,8 @@ def gemm( res = te.compute( oshape, lambda x_o, y_o: te.sum( - data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) - + bias_stage[x_o, y_o].astype(env.inp_dtype), + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias_stage[x_o, y_o].astype(ENV.inp_dtype), axis=[k_o], ), name="res", @@ -127,8 +127,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(data.shape[0])) - if (data.shape[0] >= env.DIM) - else ax.size[-1] <= env.DIM + if (data.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -139,8 +139,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(weight.shape[1])) - if (weight.shape[1] >= env.DIM) - else ax.size[-1] <= env.DIM + if (weight.shape[1] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -151,8 +151,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(weight.shape[0])) - if (weight.shape[0] >= env.DIM) - else ax.size[-1] <= env.DIM + if (weight.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -167,7 +167,7 @@ def schedule_gemm( # WS/OS # 0: Gemmini will be configured as output stationary # 1: Gemmini will be configured as weight stationary - cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) # mvout_big_block # False: generate mvout instructions moving as maximum DIM columns # True: generate mvout instructions moving more than DIM columns @@ -180,14 +180,14 @@ def schedule_gemm( cfg["accumulate_multiple_patches"] = OtherOptionEntity(0) cfg["exchange_axis"] = OtherOptionEntity(False) cfg["mvout_big_block"] = OtherOptionEntity(True) - cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) ###### space definition end ###### - cdata = sch.cache_read(data, env.scr_scope, [dense_stage]) - cweight = sch.cache_read(weight, env.scr_wgt_scope, 
[dense_stage]) - dense_stage_acc = sch.cache_write(output, env.acc_scope) - sch[bias_op].set_scope(env.acc_scope) + cdata = sch.cache_read(data, ENV.scr_scope, [dense_stage]) + cweight = sch.cache_read(weight, ENV.scr_wgt_scope, [dense_stage]) + dense_stage_acc = sch.cache_write(output, ENV.acc_scope) + sch[bias_op].set_scope(ENV.acc_scope) (x_, y_) = sch[dense_stage_acc].op.axis (z_,) = sch[dense_stage_acc].op.reduce_axis @@ -215,8 +215,8 @@ def schedule_gemm( sch[dense_stage_acc].compute_at(sch[output], axis_for_output) # # Split loops to generate the inner dimensions specified by knob tile_zo - xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=env.DIM) - yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=env.DIM) + xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=ENV.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=ENV.DIM) b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_) # Apply the exchange_axis knob @@ -242,20 +242,20 @@ def schedule_gemm( if cfg["axis_for_cdata"].val == 0: assert ( cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] - <= env.INP_SCR_ROWS * env.DIM + <= ENV.INP_SCR_ROWS * ENV.DIM ), "Data matrix will not fit in scratchpad!" elif cfg["axis_for_cdata"].val == 1: assert ( - cfg["tile_xo"].size[2] * data.shape[1] <= env.INP_SCR_ROWS * env.DIM + cfg["tile_xo"].size[2] * data.shape[1] <= ENV.INP_SCR_ROWS * ENV.DIM ), "Data matrix will not fit in scratchpad!" if cfg["axis_for_cweight"].val == 0: assert ( cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] * weight.shape[0] - <= env.WGT_SCR_ROWS * env.DIM + <= ENV.WGT_SCR_ROWS * ENV.DIM ), "Weight matrix will not fit in scratchpad!" elif cfg["axis_for_cweight"].val == 1: assert ( - cfg["tile_yo"].size[2] * weight.shape[0] <= env.WGT_SCR_ROWS * env.DIM + cfg["tile_yo"].size[2] * weight.shape[0] <= ENV.WGT_SCR_ROWS * ENV.DIM ), "Weight matrix will not fit in scratchpad!" # And here we assert that there is enough place available in the accumulator @@ -265,12 +265,12 @@ def schedule_gemm( * cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] - <= env.ACC_ROWS * env.DIM + <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" elif cfg["accumulate_multiple_patches"].val == 1: assert ( cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] - <= env.ACC_ROWS * env.DIM + <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" 
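# Illustrative sketch (not part of the patch) of the scratchpad-fit assertions
# above, restated in element counts. The tile sizes, INP_SCR_ROWS and DIM keep
# the meaning they have in the schedule; the function itself is hypothetical.
def data_tile_fits(tile_xo, reduce_len, inp_scr_rows, dim, hoisted_mvin):
    # When the data mvin sits at the outer tiling loop (axis_for_cdata == 0),
    # two tile factors' worth of rows must be resident at once, each row
    # holding reduce_len elements; the scratchpad offers inp_scr_rows rows of
    # dim elements each.
    rows = tile_xo[1] * tile_xo[2] if hoisted_mvin else tile_xo[2]
    return rows * reduce_len <= inp_scr_rows * dim

# e.g. DIM=16 and 256 scratchpad rows: a (1, 4, 16) row tile over a reduction
# of length 64 needs 4*16*64 = 4096 elements, exactly the 256*16 available.
assert data_tile_fits((1, 4, 16), 64, 256, 16, hoisted_mvin=True)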
# Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs @@ -282,21 +282,21 @@ def schedule_gemm( ) # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN - cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=env.DIM) + cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=ENV.DIM) cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( - sch[cdata].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + sch[cdata].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM ) sch[cdata].reorder(cdata_ax_0_1, cdata_ax_1_1, cdata_ax_0_2, cdata_ax_1_2) - cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=env.DIM) + cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=ENV.DIM) cweight_ax_1_1, cweight_ax_1_2 = sch[cweight].split( - sch[cweight].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + sch[cweight].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM ) sch[cweight].reorder(cweight_ax_0_1, cweight_ax_1_1, cweight_ax_0_2, cweight_ax_1_2) - cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=env.DIM) + cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=ENV.DIM) cbias_ax_1_1, cbias_ax_1_2 = sch[bias_op].split( - sch[bias_op].op.axis[1], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[bias_op].op.axis[1], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[bias_op].reorder(cbias_ax_0_1, cbias_ax_1_1, cbias_ax_0_2, cbias_ax_1_2) @@ -319,34 +319,34 @@ def schedule_gemm( fused_x = xi fused_y = yi - fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=env.DIM) + fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=ENV.DIM) fused_y_1, fused_y_2 = sch[output].split( - fused_y, factor=env.MAX_BLOCK_LEN * env.DIM if cfg["mvout_big_block"].val else env.DIM + fused_y, factor=ENV.MAX_BLOCK_LEN * ENV.DIM if cfg["mvout_big_block"].val else ENV.DIM ) sch[output].reorder(fused_x_1, fused_y_1, fused_x_2, fused_y_2) # Tag loops with pragmas, in order to insert the move in and move out instructions - sch[cweight].pragma(cweight_ax_0_2, env.B_mvin) + sch[cweight].pragma(cweight_ax_0_2, ENV.B_mvin) if data.shape[0] == 1 and weight.shape[1] > 1: - sch[cdata].pragma(cdata_ax_0_2, env.A_mvin + "_t") - sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin + "_t") - sch[output].pragma(fused_x_2, env.C_mvout + "_t") + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin + "_t") + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin + "_t") + sch[output].pragma(fused_x_2, ENV.C_mvout + "_t") else: - sch[cdata].pragma(cdata_ax_0_2, env.A_mvin) - sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin) - sch[output].pragma(fused_x_2, env.C_mvout) + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin) + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin) + sch[output].pragma(fused_x_2, ENV.C_mvout) # Apply tensorize - I = data.shape[0] if data.shape[0] < env.DIM else cfg["tile_xo"].size[-1] - K = weight.shape[0] if weight.shape[0] < env.DIM else cfg["tile_zo"].size[-1] - J = weight.shape[1] if weight.shape[1] < env.DIM else cfg["tile_yo"].size[-1] + dim_i = data.shape[0] if data.shape[0] < ENV.DIM else cfg["tile_xo"].size[-1] + dim_k = weight.shape[0] if weight.shape[0] < ENV.DIM else cfg["tile_zo"].size[-1] + dim_j = weight.shape[1] if weight.shape[1] < ENV.DIM else cfg["tile_yo"].size[-1] sch[dense_stage_acc].tensorize( xi_o if cfg["exchange_axis"].val else yi_o, - env.gemm( - I, - K, - J, + ENV.gemm( + dim_i, + dim_k, + 
dim_j, mode=cfg["WS/OS"].val, accum_patch=tvm.tir.IntImm("uint8", 0) if cfg["exchange_axis"].val or cfg["tile_zo"].size[1] != 1 @@ -359,8 +359,8 @@ def schedule_gemm( config_dict["A_size"] = int(data.shape[1]) config_dict["B_size"] = int(weight.shape[1]) config_dict["C_size"] = int(output.shape[1]) - config_dict["A_private_stride"] = env.DIM - config_dict["B_private_stride"] = env.DIM + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM config_dict["execution_stride"] = 1 config_dict["activation"] = 0 config_dict["mode"] = cfg["WS/OS"].val diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 09097a003ce2..a3978fe5b63d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -30,7 +30,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.gemm_cisc") @@ -66,8 +66,8 @@ def gemm_cisc( res = te.compute( oshape, lambda x_o, y_o: te.sum( - data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) - + bias[y_o].astype(env.inp_dtype), + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias[y_o].astype(ENV.inp_dtype), axis=[k_o], ), name="res", @@ -108,11 +108,11 @@ def schedule_gemm_cisc( # WS/OS # 0: Gemmini will be configured as output stationary # 1: Gemmini will be configured as weight stationary - cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) if cfg.is_fallback: - cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) - x_, y_ = sch[dense_stage].op.axis + x_, _ = sch[dense_stage].op.axis x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0]) @@ -121,7 +121,7 @@ def schedule_gemm_cisc( # Apply tensorization sch[dense_stage].tensorize( x_i, - env.gemm_cisc( + ENV.gemm_cisc( data.shape, weight.shape, bias.shape, dense_stage.op.attrs["scale"], cfg["WS/OS"].val ), ) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index eedbc6b052b0..d15392efeb32 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -29,7 +29,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.depthwiseconv2d_cisc") @@ -68,58 +68,58 @@ def depthwise_conv2d_cisc( orig_data.shape[1] == orig_data.shape[2] ), "GEMMINIs depthwise conv2d CISC schedule only supports square inputs!" 
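# Plain-Python gloss (not part of the patch) of the WS/OS knob used above; the
# encoding follows the comments in the diff (0 = output stationary, 1 = weight
# stationary), and the fallback configuration pins it to weight stationary.
OUTPUT_STATIONARY = 0  # partial sums stay resident in the systolic array
WEIGHT_STATIONARY = 1  # weights stay resident in the systolic array

def describe_dataflow(knob_val):
    return {OUTPUT_STATIONARY: "output-stationary",
            WEIGHT_STATIONARY: "weight-stationary"}[knob_val]

assert describe_dataflow(WEIGHT_STATIONARY) == "weight-stationary"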
- OC = orig_kernel.shape[0] - KH = orig_kernel.shape[1] - KW = orig_kernel.shape[2] + o_c = orig_kernel.shape[0] + k_h = orig_kernel.shape[1] + k_w = orig_kernel.shape[2] kernel = orig_kernel - N = orig_data.shape[0] - IH = orig_data.shape[1] - IW = orig_data.shape[2] + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] orig_data.shape[3] - HSTR = strides[0] - WSTR = strides[1] - TOP_PAD = padding[0] - LEFT_PAD = padding[1] - BOTTOM_PAD = padding[2] - RIGHT_PAD = padding[3] + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] - OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) - OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) - if len(set(padding)) == 1 and env.supports_non_zero_padding: + if len(set(padding)) == 1 and ENV.supports_non_zero_padding: # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions data = topi.nn.pad( orig_data, - [0, TOP_PAD, LEFT_PAD, 0], - [0, BOTTOM_PAD, RIGHT_PAD, 0], + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], pad_value=ifm_offset, name="pad_data", ) - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - oshape = (N, OH, OW, OC) + oshape = (n, o_h, o_w, o_c) res = te.compute( oshape, lambda b_o, i, j, c_o: te.sum( - data[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) - * kernel[c_o, rkh, rkw].astype(env.inp_dtype) - + bias[c_o].astype(env.inp_dtype), + data[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(ENV.inp_dtype) + * kernel[c_o, rkh, rkw].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), axis=[rkh, rkw], ), name="res", tag="conv2d", attrs={ "activation": activation, - "strides": [HSTR, WSTR], + "strides": [hstr, wstr], "padding": padding, "padding_value": ifm_offset, "scale": gemmini_scale, @@ -127,9 +127,9 @@ def depthwise_conv2d_cisc( ) cfg.add_flop( - np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w + np.prod(topi.utils.get_const_tuple(oshape)) - * (KH * KW - 1) # Multiplications and additions needed + * (k_h * k_w - 1) # Multiplications and additions needed + np.prod(topi.utils.get_const_tuple(oshape)) # Output scaling factor multiplications ) @@ -188,28 +188,27 @@ def _traverse(op): else: pad_data = data - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis - rkh, rkw = sch[conv2d_stage].op.reduce_axis + x_bo, _, _, _ = sch[conv2d_stage].op.axis x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) axis_for_start = x_bo_o # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
- padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] - padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 # Apply tensorization sch[conv2d_stage].tensorize( x_bo_i, - env.dw_conv2d_cisc( + ENV.dw_conv2d_cisc( pad_data.shape, kernel.shape, bias.shape, conv2d_stage.shape, conv2d_stage.op.attrs["strides"], - padding_for_C_code, - padding_value_for_C_code, + padding, + padding_value, conv2d_stage.op.attrs["activation"], conv2d_stage.op.attrs["scale"], ), diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 292743eff78c..c1c83f8956f7 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -27,7 +27,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.max_pool2d") @@ -61,7 +61,7 @@ def max_pool2d( def irb_builder_func(ins, outs): irb = tvm.tir.ir_builder.create() - if env.supports_non_zero_padding: + if ENV.supports_non_zero_padding: irb.emit( tvm.tir.call_extern( "", diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py index 6ca41c66d139..a37ef10428bf 100644 --- a/python/tvm/relay/backend/contrib/gemmini/op.py +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -24,24 +24,16 @@ from __future__ import absolute_import as _abs import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -from tvm.relay.op import op as reg from tvm.relay.op import strategy as _strategy -from tvm.relay.op.op import OpPattern, OpStrategy - +from tvm.relay.op.op import OpStrategy +from tvm.contrib.gemmini.environment import Environment from .gemmini_dense import gemm, schedule_gemm from .gemmini_dense_cisc import gemm_cisc, schedule_gemm_cisc from .gemmini_conv2d_cisc import conv2d_cisc, schedule_conv2d_cisc from .gemmini_depthwise_conv2d_cisc import depthwise_conv2d_cisc, schedule_depthwise_conv2d_cisc from .gemmini_add import add, schedule_add from .gemmini_max_pool2d import max_pool2d, schedule_max_pool2d -from tvm.contrib.gemmini.environment import Environment - -from tvm.topi.utils import const_vector, get_const_int, get_const_float -import numpy as np ENV = Environment.instance() From 83050da8606e1915928b211a5256952bf2cc77c3 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 16:32:41 +0100 Subject: [PATCH 015/286] More lint improvements --- python/tvm/contrib/gemmini/build_module.py | 3 +- python/tvm/contrib/gemmini/environment.py | 12 ++--- python/tvm/contrib/gemmini/helpers.py | 30 ++++++------ python/tvm/contrib/gemmini/intrin.py | 14 +----- python/tvm/contrib/gemmini/pattern_table.py | 3 -- python/tvm/contrib/gemmini/transform.py | 25 ---------- .../backend/contrib/gemmini/gemmini_add.py | 1 - .../backend/contrib/gemmini/gemmini_dense.py | 47 +++++++++---------- .../contrib/gemmini/gemmini_dense_cisc.py | 4 +- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 1 - .../contrib/gemmini/gemmini_max_pool2d.py | 2 +- 11 files changed, 51 insertions(+), 91 deletions(-) diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index 
fc72a6b03af8..bf2ff9832309 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -76,7 +76,6 @@ def internal_build_configs(usmp_alg=""): Returns: dict: configurations """ - enable_usmp = False if usmp_alg == "" else True pass_list = [ (0, tvm.tir.transform.StorageFlatten(16)), (1, InjectAMVINIntrin()), @@ -101,7 +100,7 @@ def internal_build_configs(usmp_alg=""): "tir.add_lower_pass": pass_list, "tir.disable_vectorize": True, # "tir.CorrectGemminisScratchpadAndAccumulatorPointers": {"dim": env.DIM} - "tir.usmp.enable": enable_usmp, + "tir.usmp.enable": bool(usmp_alg), "tir.usmp.algorithm": usmp_alg, } diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 56e8e61b646d..37e18987883b 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -58,7 +58,7 @@ def init_overwrite( bank_rows=8192, bank_num=4, debug=False, - enabled_counters: Dict = {}, + enabled_counters: Dict = None, supports_non_zero_padding: bool = False, use_experimental_qnn_add: bool = False, ): @@ -75,7 +75,7 @@ def init_overwrite( bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. """ @@ -120,7 +120,7 @@ def init( bank_rows=4096, bank_num=4, debug=False, - enabled_counters: Dict = {}, + enabled_counters: Dict = None, supports_non_zero_padding: bool = False, use_experimental_qnn_add: bool = False, ): @@ -137,7 +137,7 @@ def init( bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. """ @@ -212,11 +212,11 @@ def init( self.supports_non_zero_padding = supports_non_zero_padding self.use_experimental_qnn_add = use_experimental_qnn_add - self.enabled_counters = enabled_counters if bool(enabled_counters) else counters + self.enabled_counters = enabled_counters if enabled_counters is not None else counters # Check that all enabled counters exist in the actual counters from Gemmini for key, value in self.enabled_counters.items(): assert ( - self.enabled_counters[key] == counters[key] + value == counters[key] ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!" 
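# Why the hunk above replaces `enabled_counters: Dict = {}` with `= None`: a
# mutable default is built once at definition time and shared by every call.
# Minimal stand-alone demonstration, independent of the Gemmini code:
def broken(counters={}):  # pylint: disable=dangerous-default-value
    counters["calls"] = counters.get("calls", 0) + 1
    return counters

def fixed(counters=None):
    counters = {} if counters is None else counters
    counters["calls"] = counters.get("calls", 0) + 1
    return counters

assert broken() == {"calls": 1} and broken() == {"calls": 2}  # state leaks
assert fixed() == {"calls": 1} and fixed() == {"calls": 1}    # fresh dict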
def gemm( diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index df3a9bfe9bce..0bc3b4f8f386 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -50,6 +50,9 @@ def create_header_file( debug (bool, optional): enable debug. Defaults to False. weights (bool, optional): For debug purposes. Defaults to None. """ + if debug: + assert weights is not None, "When passing the debug flag as True, the weights parameter must be given!" + file_path = pathlib.Path(f"{output_path}/" + name).resolve() # Create header file with npy_data as a C array raw_header_path = file_path.with_suffix(".h").resolve() @@ -70,16 +73,16 @@ def create_header_file( else: assert False, f"Type {tensor_data.dtype} is not supported!" - with open(raw_header_path, "a+") as header_file: + with open(raw_header_path, "a+", encoding="utf8") as header_file: header_file.write( f"#define {tensor_name}_len {tensor_data.size}\n" + f"extern {datatype} {tensor_name}[{tensor_name}_len];\n" ) if not raw_source_path.is_file(): - with open(raw_source_path, "a+") as source_file: + with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write("#include \n") - with open(raw_source_path, "a+") as source_file: + with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write( f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' @@ -118,17 +121,16 @@ def create_header_file( source_file.write("\n") source_file.write("*/\n") - if weights is not None: - source_file.write("/*\n") - for o_ch in range(weights.shape[3]): - source_file.write(f"Output channel {o_ch}:\n") - for i_ch in range(weights.shape[2]): - source_file.write(f"Input channel {i_ch}:\n") - for row in range(weights.shape[0]): - for col in range(weights.shape[1]): - source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") - source_file.write("\n") - source_file.write("*/\n") + source_file.write("/*\n") + for o_ch in range(weights.shape[3]): + source_file.write(f"Output channel {o_ch}:\n") + for i_ch in range(weights.shape[2]): + source_file.write(f"Input channel {i_ch}:\n") + for row in range(weights.shape[0]): + for col in range(weights.shape[1]): + source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") + source_file.write("\n") + source_file.write("*/\n") def get_divisors(x: int) -> List[int]: diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index d8809726555a..65c27caf119c 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -392,20 +392,16 @@ def conv2d_cisc( Returns: TensorIntrin: CONV2D CISC tensor intrinsic """ - + _ = pool_dilation # TODO (FP): add assertions here for the supported parameters? 
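# Stand-alone sketch (not the helper itself) of the C-array emission pattern
# create_header_file uses in the hunks above; the dtype map, section handling
# and alignment are simplified, and the #include target is an assumption.
import numpy as np

def emit_c_array(tensor_name, tensor_data, header_path, source_path, align=16):
    ctype = {"int8": "int8_t", "int32": "int32_t"}[str(tensor_data.dtype)]
    with open(header_path, "a+", encoding="utf8") as header:
        header.write(f"#define {tensor_name}_len {tensor_data.size}\n"
                     f"extern {ctype} {tensor_name}[{tensor_name}_len];\n")
    with open(source_path, "a+", encoding="utf8") as source:
        source.write("#include <stdint.h>\n")  # assumed include
        values = ", ".join(str(v) for v in tensor_data.reshape(-1))
        source.write(f"{ctype} {tensor_name}[] "
                     f"__attribute__((aligned({align}))) = {{{values}}};\n")

emit_c_array("input_0", np.arange(8, dtype=np.int8), "inputs.h", "inputs.c")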
wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - wgt.shape[3] k_h = wgt.shape[0] k_w = wgt.shape[1] - inp.shape[0] - inp.shape[1] - inp.shape[2] i_c = inp.shape[3] ric = te.reduce_axis((0, i_c), name="ric") @@ -571,15 +567,9 @@ def dw_conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - wgt.shape[0] k_h = wgt.shape[1] k_w = wgt.shape[2] - inp.shape[0] - inp.shape[1] - inp.shape[2] - inp.shape[3] - rkh = te.reduce_axis((0, k_h), name="rkh") rkw = te.reduce_axis((0, k_w), name="rkw") @@ -751,7 +741,7 @@ def add_tensorize(env, oshape: Tuple[int, ...]): def intrin_func(ins, outs): """Add intrinsic function""" difm1, difm2 = ins - outs[0] + _ = outs def _body(): irb = tvm.tir.ir_builder.create() diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 46e29ad6ffa6..37a93b8a51bb 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -248,9 +248,6 @@ class DepthwiseCONV2DParams(CONV2DParams): composite_name = "gemmini.depthwiseconv2d" activation_map = {"clip": "CLIP"} - def __init__(self, func_body: tvm.relay.Function): - super().__init__(func_body) - class MaxPoolParams: """ diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index eddd9012ae07..41455bb8d283 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -277,7 +277,6 @@ def _do_fold(stmt): return None def _ftransform(f, mod, ctx): - f.attrs["global_symbol"] return f.with_body( tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) ) @@ -295,8 +294,6 @@ def InjectAMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -339,8 +336,6 @@ def InjectAMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -382,8 +377,6 @@ def InjectBMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -425,8 +418,6 @@ def InjectBMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -467,8 +458,6 @@ def InjectDMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -510,8 +499,6 @@ def InjectDMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -553,8 +540,6 @@ def InjectCMVOUTIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... 
@@ -599,8 +584,6 @@ def InjectCMVOUTIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -646,8 +629,6 @@ def InjectCMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -689,8 +670,6 @@ def InjectCMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -732,8 +711,6 @@ def InjectCMVINAccumIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -774,8 +751,6 @@ def InjectCMVINAccumIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index a561a01d6c32..f324b8f9732d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -128,7 +128,6 @@ def schedule_add( ifm1, ifm2_op = add_stage.op.input_tensors ifm2, ofm_offset_op = ifm2_op.op.input_tensors - ofm_offset_op.op.input_tensors[0] # Prepare the scope of each buffer cifm1 = sch.cache_read(ifm1, ENV.acc_scope, [add_stage]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d52557d8b703..d43bdc8fc5b7 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -113,11 +113,10 @@ def schedule_gemm( sch = te.create_schedule([x.op for x in outs]) data, weight, bias_op = dense_stage.op.input_tensors - bias_op.op.input_tensors[0] ##### space definition begin ##### x, y = sch[dense_stage].op.axis - (z,) = sch[dense_stage].op.reduce_axis + (z_axis,) = sch[dense_stage].op.reduce_axis # TODO (FP): add limits for scratchpad and accumulator sizes perhaps? 
cfg.define_split( @@ -146,7 +145,7 @@ def schedule_gemm( cfg.define_split( "tile_zo", - z, + z_axis, num_outputs=3, policy="power2", filter=lambda ax: ( @@ -188,26 +187,26 @@ def schedule_gemm( cweight = sch.cache_read(weight, ENV.scr_wgt_scope, [dense_stage]) dense_stage_acc = sch.cache_write(output, ENV.acc_scope) sch[bias_op].set_scope(ENV.acc_scope) - (x_, y_) = sch[dense_stage_acc].op.axis - (z_,) = sch[dense_stage_acc].op.reduce_axis + (x_axis, y_axis) = sch[dense_stage_acc].op.axis + (z_axis_int,) = sch[dense_stage_acc].op.reduce_axis # Split loops to generate the inner dimensions specified by knobs tile_xo and tile_yo - b_y, yo, yi = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) - b_x, xo, xi = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) + b_y, yo_axis, yi_axis = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) + b_x, xo_axis, xi_axis = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) # Apply the exchange_axis knob if cfg["exchange_axis"].val: - sch[output].reorder(b_y, b_x, yo, xo, yi, xi) + sch[output].reorder(b_y, b_x, yo_axis, xo_axis, yi_axis, xi_axis) else: - sch[output].reorder(b_x, b_y, xo, yo, xi, yi) + sch[output].reorder(b_x, b_y, xo_axis, yo_axis, xi_axis, yi_axis) # Apply the accumulate_multiple_patches knob if cfg["accumulate_multiple_patches"].val == 0: axis_for_output = b_x if cfg["exchange_axis"].val else b_y elif cfg["accumulate_multiple_patches"].val == 1: - axis_for_output = yo if cfg["exchange_axis"].val else xo + axis_for_output = yo_axis if cfg["exchange_axis"].val else xo_axis else: - axis_for_output = xo if cfg["exchange_axis"].val else yo + axis_for_output = xo_axis if cfg["exchange_axis"].val else yo_axis axis_gemm_start = b_y if cfg["exchange_axis"].val else b_x @@ -215,9 +214,9 @@ def schedule_gemm( sch[dense_stage_acc].compute_at(sch[output], axis_for_output) # # Split loops to generate the inner dimensions specified by knob tile_zo - xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=ENV.DIM) - yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=ENV.DIM) - b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_) + xo_o, xi_o = sch[dense_stage_acc].split(x_axis, factor=ENV.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_axis, factor=ENV.DIM) + b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_axis_int) # Apply the exchange_axis knob if cfg["exchange_axis"].val: @@ -302,22 +301,22 @@ def schedule_gemm( # Mvout preparation if cfg["exchange_axis"].val: - sch[output].reorder(yo, yi, xo, xi) + sch[output].reorder(yo_axis, yi_axis, xo_axis, xi_axis) else: - sch[output].reorder(xo, xi, yo, yi) + sch[output].reorder(xo_axis, xi_axis, yo_axis, yi_axis) if cfg["accumulate_multiple_patches"].val == 0: - fused_x = sch[output].fuse(xo, xi) - fused_y = sch[output].fuse(yo, yi) + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = sch[output].fuse(yo_axis, yi_axis) elif cfg["accumulate_multiple_patches"].val == 1: if cfg["exchange_axis"].val: - fused_x = sch[output].fuse(xo, xi) - fused_y = yi + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = yi_axis else: - fused_x = xi - fused_y = sch[output].fuse(yo, yi) + fused_x = xi_axis + fused_y = sch[output].fuse(yo_axis, yi_axis) else: - fused_x = xi - fused_y = yi + fused_x = xi_axis + fused_y = yi_axis fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=ENV.DIM) fused_y_1, fused_y_2 = sch[output].split( diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py 
b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index a3978fe5b63d..8fdc12e5d8d2 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -112,9 +112,9 @@ def schedule_gemm_cisc( if cfg.is_fallback: cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) - x_, _ = sch[dense_stage].op.axis + x_axis, _ = sch[dense_stage].op.axis - x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0]) + x_o, x_i = sch[dense_stage].split(x_axis, factor=data.shape[0]) axis_for_start = x_o diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index d15392efeb32..b25893bc9bd0 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -77,7 +77,6 @@ def depthwise_conv2d_cisc( n = orig_data.shape[0] i_h = orig_data.shape[1] i_w = orig_data.shape[2] - orig_data.shape[3] hstr = strides[0] wstr = strides[1] diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index c1c83f8956f7..bd71705be711 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -115,7 +115,7 @@ def irb_builder_func(ins, outs): return irb.get() res = te.extern( - (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" + (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" # pylint: disable=W0108 ) # TODO (FP): add correct FLOPS From e3ca02c82e8741c5de32a55d8a711e35bb502e76 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 16:52:41 +0100 Subject: [PATCH 016/286] Fixed ALL pylint "Line too long" --- python/tvm/contrib/gemmini/environment.py | 33 ++++++++++++------- python/tvm/contrib/gemmini/helpers.py | 7 ++-- python/tvm/contrib/gemmini/intrin.py | 15 ++++++--- python/tvm/contrib/gemmini/legalize.py | 12 ++++--- python/tvm/contrib/gemmini/pattern_table.py | 3 +- .../backend/contrib/gemmini/gemmini_add.py | 6 ++-- .../backend/contrib/gemmini/gemmini_dense.py | 12 ++++--- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 3 +- .../contrib/gemmini/gemmini_max_pool2d.py | 10 ++++-- 9 files changed, 69 insertions(+), 32 deletions(-) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 37e18987883b..1fa94acd9efe 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -75,9 +75,12 @@ def init_overwrite( bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. - supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. 
+ use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. """ inst = Environment.instance() inst.init( @@ -131,15 +134,20 @@ def init( dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". - wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. + Defaults to "int8". acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. - bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. + bank_rows (int, optional): Amount of rows of each bank in the scratchpad. + Defaults to 8192. bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. - supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. """ assert batch == 1, "Only batch size of 1 is currently supported" @@ -191,7 +199,8 @@ def init( self.scr_scope = "local.scratchpad" self.acc_scope = "local.accumulator" # Actually, only one scratchpad should exist. - # But we do this logical partition to correctly manage the pointers to the buffers stored in this memories. + # But we do this logical partition to correctly manage the pointers + # to the buffers stored in this memories. # Should see how we can fix this in the future. self.scr_wgt_scope = "local.scratchpad_weight" @@ -217,7 +226,8 @@ def init( for key, value in self.enabled_counters.items(): assert ( value == counters[key] - ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!" + ), f"Enabled counter with key {key} does not exist \ + or has a different name in the actual counters dict!" def gemm( self, @@ -236,7 +246,8 @@ def gemm( K (int): reduction axis dimension J (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. - is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a + GEMM for a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. accum_patch (_type_, optional): Var of the reduction axis loop. Defaults to None. diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 0bc3b4f8f386..5ebf4c719a06 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -51,7 +51,9 @@ def create_header_file( weights (bool, optional): For debug purposes. Defaults to None. 
""" if debug: - assert weights is not None, "When passing the debug flag as True, the weights parameter must be given!" + assert ( + weights is not None + ), "When passing the debug flag as True, the weights parameter must be given!" file_path = pathlib.Path(f"{output_path}/" + name).resolve() # Create header file with npy_data as a C array @@ -85,7 +87,8 @@ def create_header_file( with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write( - f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' + f'{datatype} {tensor_name}[] __attribute__((section("{section}"), \ + aligned({align}))) = {{' if section else f"{datatype} {tensor_name}[] __attribute__((aligned({align}))) = {{" ) diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 65c27caf119c..6aa20c2c8198 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -45,7 +45,8 @@ def gemm( dim_k (int): reduction axis dimension dim_j (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. - is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for + a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. accum_patch (tvm.tir.Var, optional): Var of the reduction axis loop. Defaults to None. @@ -137,7 +138,8 @@ def intrin_func(ins, outs): garbage = tvm.runtime.const(0xFFFFFFFF, "uint32") def _body(): - """Generate matrix-matrix multiply Gemmini instruction, without accumulate (garbage address in compute_preloaded)""" + """Generate matrix-matrix multiply Gemmini instruction, + without accumulate (garbage address in compute_preloaded)""" irb = tvm.tir.ir_builder.create() inp_access_ptr = dinp.access_ptr("r", "uint32") @@ -238,7 +240,8 @@ def gemm_cisc( scale: float, matmul_type: int, ): - """Matrix-matrix multiply intrinsic, inserts the calls to the function provided by the Gemmini developers to run matrix multiplication using the loop instructions + """Matrix-matrix multiply intrinsic, inserts the calls to the function + provided by the Gemmini developers to run matrix multiplication using the loop instructions Args: env (Environment): Environment with configurations @@ -371,7 +374,8 @@ def conv2d_cisc( pool_dilation: List[int], pool_padding: List[int], ): - """2D convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D convolution using the loop instructions + """2D convolution intrinsic, inserts the calls to the function provided + by the Gemmini developers to run a 2D convolution using the loop instructions Args: env (Environment): Environment with configurations @@ -543,7 +547,8 @@ def dw_conv2d_cisc( activation: int, scale: float, ): - """2D depthwise convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions + """2D depthwise convolution intrinsic, inserts the calls to the function + provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions Args: env (Environment): Environment with configurations diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index f924f1dfe716..4d74707e5acf 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ 
b/python/tvm/contrib/gemmini/legalize.py @@ -156,10 +156,14 @@ def gemmini_conv2d( pool_padding (tvm.relay.Expr): Pooling padding in each direction input_req_offset_out (tvm.relay.Expr): Requantize layer output offset has_activation (bool): Has activation? - activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input scaling factor - activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input offset - activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output scaling factor - activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output offset + activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input scaling factor + activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input offset + activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output scaling factor + activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output offset Returns: tvm.relay.Call: Call to the contrib.gemmini.conv2d operator diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 37a93b8a51bb..ddb4b69acef9 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -420,7 +420,8 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal """Declares Gemminis pattern table Returns: - List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: List of pattern, callable tuples + List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + List of pattern, callable tuples """ pattern_table_filters = [] diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index f324b8f9732d..d019fe4cbc3e 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -134,7 +134,8 @@ def schedule_add( sch[ifm2_op].set_scope(ENV.acc_scope) sch[ofm_offset_op].set_scope(ENV.acc_scope) - # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) x_factor = get_greater_div(int(sch[add_stage].op.axis[2].dom.extent)) y_o, y_i = sch[add_stage].split(sch[add_stage].op.axis[3], factor=y_factor) @@ -146,7 +147,8 @@ def schedule_add( sch[ifm2_op].compute_at(sch[add_stage], y_o) sch[ofm_offset_op].compute_at(sch[add_stage], y_o) - # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=ENV.DIM) cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( sch[cifm1].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM diff --git 
a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d43bdc8fc5b7..dbb7d12f7da5 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -237,7 +237,8 @@ def schedule_gemm( # Compute the move of the bias in the correct loop sch[bias_op].compute_at(sch[output], axis_for_output) - # We assert here that the mvin of data does not use more space than the available one in the scratchpad + # We assert here that the mvin of data does not use more space + # than the available one in the scratchpad if cfg["axis_for_cdata"].val == 0: assert ( cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] @@ -272,7 +273,8 @@ def schedule_gemm( <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" - # Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs + # Move the data and weight move instructions into the correct loops selected + # by the axis_for_cdata and axis_for_cweight knobs axis_for_cdata = axis_to_input_data[cfg["axis_for_cdata"].val] axis_for_cweight = axis_to_input_weights[cfg["axis_for_cweight"].val] sch[cdata].compute_at(sch[stages_to_input_data[cfg["axis_for_cdata"].val]], axis_for_cdata) @@ -280,7 +282,8 @@ def schedule_gemm( sch[stages_to_input_data[cfg["axis_for_cweight"].val]], axis_for_cweight ) - # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN + # Split input moves because Gemmini's mvin only supports mvins with + # rows <= DIM and cols <= MAX_BLOCK_LEN cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=ENV.DIM) cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( sch[cdata].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM @@ -353,7 +356,8 @@ def schedule_gemm( ), ) - # Generate configuration dictionary, in order to correctly generate the calls to the configuration instructions + # Generate configuration dictionary, in order to correctly generate + # the calls to the configuration instructions config_dict = {} config_dict["A_size"] = int(data.shape[1]) config_dict["B_size"] = int(weight.shape[1]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index b25893bc9bd0..d33749823268 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -89,7 +89,8 @@ def depthwise_conv2d_cisc( o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) if len(set(padding)) == 1 and ENV.supports_non_zero_padding: - # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index bd71705be711..2e7880bcbdfe 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -41,7 +41,10 @@ def max_pool2d( pool_dilation: 
tvm.ir.container.Array, pool_padding: tvm.ir.container.Array, ) -> tvm.te.tensor.Tensor: - """Computation definition to run a max pooling layer on Gemmini. Uses a trick: we call a dw convolution + max pooling, but all weights are 1. So the depthwise convolution does nothing, and the Gemmini accelerator takes care internally of applying the max pooling. + """Computation definition to run a max pooling layer on Gemmini. + Uses a trick: we call a dw convolution + max pooling, but all weights are 1. + So the depthwise convolution does nothing, and the Gemmini accelerator takes care + internally of applying the max pooling. Args: cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity @@ -115,7 +118,10 @@ def irb_builder_func(ins, outs): return irb.get() res = te.extern( - (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" # pylint: disable=W0108 + (1,), + [data, weights], + lambda ins, outs: irb_builder_func(ins, outs), # pylint: disable=W0108 + dtype="int8", ) # TODO (FP): add correct FLOPS From 75a7749f4eba68490a885d87dba1d1f9846684ba Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 17:14:22 +0100 Subject: [PATCH 017/286] Pending pylint fixes --- python/tvm/contrib/gemmini/environment.py | 6 +++--- python/tvm/contrib/gemmini/helpers.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 1fa94acd9efe..565c1db13f11 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -32,7 +32,7 @@ add_tensorize, add_mvout_tensorize, ) -from .utils import counters +from .utils import COUNTERS class Environment(object): @@ -221,11 +221,11 @@ def init( self.supports_non_zero_padding = supports_non_zero_padding self.use_experimental_qnn_add = use_experimental_qnn_add - self.enabled_counters = enabled_counters if enabled_counters is not None else counters + self.enabled_counters = enabled_counters if enabled_counters is not None else COUNTERS # Check that all enabled counters exist in the actual counters from Gemmini for key, value in self.enabled_counters.items(): assert ( - value == counters[key] + value == COUNTERS[key] ), f"Enabled counter with key {key} does not exist \ or has a different name in the actual counters dict!" 
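The hunk above pairs with the init() docstring shown earlier in the series: enabled_counters
must be a subset of the canonical COUNTERS dict, or init() asserts. A minimal usage sketch
(assuming Environment is re-exported by the tvm.contrib.gemmini package __init__, and with
illustrative hardware values only):

    from tvm.contrib.gemmini import Environment

    env = Environment.instance()
    env.init(
        batch=1,  # only batch size 1 is currently supported
        dim=16,  # systolic array dimension (DIM) of the target Gemmini
        acc_rows=1024,  # accumulator rows of the target configuration
        bank_rows=4096,  # scratchpad rows per bank
        bank_num=4,
        debug=True,
        # Keys and names must match the COUNTERS dict from utils.py exactly,
        # otherwise the assertion in the hunk above fires.
        enabled_counters={1: "MAIN_LD_CYCLES", 2: "MAIN_ST_CYCLES"},
    )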
diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 5ebf4c719a06..69dca3a6b0de 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -22,8 +22,8 @@ import pathlib from typing import List -import numpy as np from six.moves import range +import numpy as np from .environment import Environment From 2ebd85acccc3c68e6212d0495e1dd3c3c9cf312b Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 17:14:34 +0100 Subject: [PATCH 018/286] Pending pylint fixes --- python/tvm/contrib/gemmini/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py index 1f9d6b26134f..e43d2a7a23f9 100644 --- a/python/tvm/contrib/gemmini/utils.py +++ b/python/tvm/contrib/gemmini/utils.py @@ -22,7 +22,7 @@ from enum import Enum -counters = { +COUNTERS = { 1: "MAIN_LD_CYCLES", 2: "MAIN_ST_CYCLES", 3: "MAIN_EX_CYCLES", From 292c6e1d511fdccd84737e56ff945c323fffacc3 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 08:35:54 +0100 Subject: [PATCH 019/286] Docs fix --- gallery/tutorial/micro_gemmini_dwconv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py index 6030d14ea024..3fbd41a1c21b 100644 --- a/gallery/tutorial/micro_gemmini_dwconv2d.py +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -16,7 +16,7 @@ # under the License. """ Running TVM on the Gemmini accelerator - A single 2d depthwise convolutional layer example -====================================================================================== +=========================================================================================== **Author**: `Federico Peccia `_ From a0d4ba0729f3e824decb5f9769714cb5b40553fb Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 13:10:29 +0100 Subject: [PATCH 020/286] Added missing license text --- apps/microtvm/gemmini/README.md | 17 +++++++++++++++++ .../gemmini/template_project/src/Makefile | 17 +++++++++++++++++ .../gemmini/template_project/src/Makefrag.mk | 17 +++++++++++++++++ cmake/modules/contrib/Gemmini.cmake | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md index 9b4c45716062..2691844797f5 100644 --- a/apps/microtvm/gemmini/README.md +++ b/apps/microtvm/gemmini/README.md @@ -1,3 +1,20 @@ + + + + + + + + + + + + + + + + + This directory contains code to create code for the Gemmini accelerator using microTVM. These tests are then executed on the Spike RISC-V ISA simulator. In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the [Chipyard](https://chipyard.readthedocs.io/en/stable/) repository. The instructions to also install the patch of the Spike simulator that adds the Gemmini functional simulator can be found in the [Gemmini](https://github.com/ucb-bar/gemmini) repository. diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index b8da778d7eec..c1badcf1816c 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + include $(abs_top_srcdir)/Makefrag tests_baremetal = $(tests:=-baremetal) diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag.mk b/apps/microtvm/gemmini/template_project/src/Makefrag.mk index a60184526081..cb4e5ee72da9 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefrag.mk +++ b/apps/microtvm/gemmini/template_project/src/Makefrag.mk @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + XLEN ?= 64 CC_BAREMETAL := riscv$(XLEN)-unknown-elf-gcc diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake index 757a99217510..2e5a76bcc06c 100644 --- a/cmake/modules/contrib/Gemmini.cmake +++ b/cmake/modules/contrib/Gemmini.cmake @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
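+# Like the other contrib modules included from CMakeLists.txt, this module is
+# gated on the USE_GEMMINI option; a config.cmake entry such as
+# set(USE_GEMMINI ON) pulls the block below into the build.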
+ if(USE_GEMMINI) message(STATUS "Add Gemmini for microTVM") From 626fd6b3f074cc4721542546dc651c868ae81a67 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 13:54:12 +0100 Subject: [PATCH 021/286] Small lint fixes --- src/relay/op/contrib/gemmini/convolution.cc | 1 - src/relay/op/contrib/gemmini/depthwise_convolution.cc | 1 - src/relay/op/contrib/gemmini/gemm.cc | 1 - src/relay/op/contrib/gemmini/max_pool2d.cc | 1 - src/tir/ir/stmt.cc | 2 +- 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/relay/op/contrib/gemmini/convolution.cc b/src/relay/op/contrib/gemmini/convolution.cc index 1ac0a3ad0df5..78c7f249c51c 100644 --- a/src/relay/op/contrib/gemmini/convolution.cc +++ b/src/relay/op/contrib/gemmini/convolution.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/depthwise_convolution.cc b/src/relay/op/contrib/gemmini/depthwise_convolution.cc index d9cb264fb514..c956c5e1b815 100644 --- a/src/relay/op/contrib/gemmini/depthwise_convolution.cc +++ b/src/relay/op/contrib/gemmini/depthwise_convolution.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/gemm.cc b/src/relay/op/contrib/gemmini/gemm.cc index 6002e72aaa41..eacbabafdc77 100644 --- a/src/relay/op/contrib/gemmini/gemm.cc +++ b/src/relay/op/contrib/gemmini/gemm.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/max_pool2d.cc b/src/relay/op/contrib/gemmini/max_pool2d.cc index 2e435ceea875..082a4492547b 100644 --- a/src/relay/op/contrib/gemmini/max_pool2d.cc +++ b/src/relay/op/contrib/gemmini/max_pool2d.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index ff28121db27d..250465257301 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -38,7 +38,7 @@ LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { - // TODO (FP): Is this check really necessary? + // TODO(FP): Is this check really necessary? 
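+    // (The Gemmini lowering can bind plain integer scratchpad offsets to
+    //  pointer-typed vars (see inject_gemmini_pointer_correction.cc), which
+    //  is presumably why the stricter handle check stays disabled here.)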
     // auto vdtype = value.dtype();
     // ICHECK(vdtype.is_handle());
   } else {

From 0ed4093bdd19bd2e4228c733b45d03fa99477dee Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Tue, 6 Dec 2022 14:30:46 +0100
Subject: [PATCH 022/286] Added integration to generate C code able to execute neural networks on the Gemmini accelerator

---

diff --git a/.gitmodules b/.gitmodules
index 66fd0390cf35..64c1a30050bc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,3 +19,6 @@
 [submodule "3rdparty/OpenCL-Headers"]
 	path = 3rdparty/OpenCL-Headers
 	url = https://github.com/KhronosGroup/OpenCL-Headers.git
+[submodule 
"3rdparty/gemmini"] + path = 3rdparty/gemmini + url = https://github.com/ucb-bar/gemmini diff --git a/3rdparty/gemmini b/3rdparty/gemmini new file mode 160000 index 000000000000..b6bdad59cbd6 --- /dev/null +++ b/3rdparty/gemmini @@ -0,0 +1 @@ +Subproject commit b6bdad59cbd6313f1ea4c93d3493db3d59b9e418 diff --git a/CMakeLists.txt b/CMakeLists.txt index 736d516fa1f6..47499ff90356 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -492,6 +492,7 @@ include(cmake/modules/Micro.cmake) include(cmake/modules/contrib/EthosN.cmake) include(cmake/modules/contrib/CMSISNN.cmake) include(cmake/modules/contrib/EthosU.cmake) +include(cmake/modules/contrib/Gemmini.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/CODEGENC.cmake) include(cmake/modules/contrib/DNNL.cmake) @@ -574,6 +575,7 @@ if(USE_MICRO) # Unix Makefiles generator, need to add these explicit target-level dependency) add_dependencies(tvm_runtime zephyr) add_dependencies(tvm_runtime arduino) + add_dependencies(tvm_runtime gemmini) if(MSVC) target_link_libraries(tvm PRIVATE host_standalone_crt ) target_link_libraries(tvm_runtime PRIVATE host_standalone_crt) diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md new file mode 100644 index 000000000000..11fea3415b70 --- /dev/null +++ b/apps/microtvm/gemmini/README.md @@ -0,0 +1,3 @@ +This directory contains code to create code for the Gemmini accelerator using microTVM. These tests are then executed on the Spike RISC-V ISA simulator. + +In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the Chipyard repository. diff --git a/apps/microtvm/gemmini/template_project/crt_config/crt_config.h b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h new file mode 100644 index 000000000000..b3126cfac920 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \brief CRT configuration for the host-linked CRT. + */ +#ifndef TVM_RUNTIME_MICRO_CRT_CONFIG_H_ +#define TVM_RUNTIME_MICRO_CRT_CONFIG_H_ + +/*! Log level of the CRT runtime */ +#define TVM_CRT_LOG_LEVEL TVM_CRT_LOG_LEVEL_DEBUG + +/*! Support low-level debugging in MISRA-C runtime */ +#define TVM_CRT_DEBUG 0 + +/*! Maximum supported dimension in NDArray */ +#define TVM_CRT_MAX_NDIM 6 +/*! Maximum supported arguments in generated functions */ +#define TVM_CRT_MAX_ARGS 10 +/*! Maximum supported string length in dltype, e.g. "int8", "int16", "float32" */ +#define TVM_CRT_MAX_STRLEN_DLTYPE 10 +/*! Maximum supported string length in function names */ +#define TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120 +/*! 
Maximum supported string length in parameter names */ +#define TVM_CRT_MAX_STRLEN_PARAM_NAME 80 + +/*! Maximum number of registered modules. */ +#define TVM_CRT_MAX_REGISTERED_MODULES 2 + +/*! Size of the global function registry, in bytes. */ +#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512 + +/*! Maximum packet size, in bytes, including the length header. */ +#define TVM_CRT_MAX_PACKET_SIZE_BYTES 8 * 1024 + +/*! \brief Maximum length of a PackedFunc function name. */ +#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 + +// #define TVM_CRT_FRAMER_ENABLE_LOGS + +#endif // TVM_RUNTIME_MICRO_CRT_CONFIG_H_ diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py new file mode 100644 index 000000000000..f4d4f7eb5e89 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -0,0 +1,386 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +MicroTVM API Server for Gemmini baremetal tests on the Spike simulator +===================== +**Author**: `Federico Peccia `_ +""" + +import atexit +import collections +import functools +import json +import logging +import os +import os.path +import pathlib +import re +import shlex +import shutil +import shlex, subprocess +import sys +import tarfile +import tempfile +import time +from string import Template +import re +from distutils.dir_util import copy_tree +import subprocess +import serial + +# import serial.tools.list_ports +from tvm.micro.project_api import server + +from subprocess import PIPE + +_LOG = logging.getLogger(__name__) + +MODEL_LIBRARY_FORMAT_RELPATH = pathlib.Path("src") / "model" / "model.tar" +API_SERVER_DIR = pathlib.Path(os.path.dirname(__file__) or os.path.getcwd()) +BUILD_DIR = API_SERVER_DIR / "build" +MODEL_LIBRARY_FORMAT_PATH = API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH + +IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists() + +PROJECT_TYPES = [ + "dense_example", + "conv2d_example", + "dwconv2d_example", + "add_example", + "maxpool2d_example", + "mobilenet_example", +] + +PROJECT_OPTIONS = [ + server.ProjectOption( + "project_type", + required=["generate_project"], + choices=tuple(PROJECT_TYPES), + type="str", + help="Type of project to generate.", + ) +] + + +class Handler(server.ProjectAPIHandler): + def __init__(self): + super(Handler, self).__init__() + self._proc = None + self._port = None + self._transport = None + self._project_dir = None + self._qemu_instance = None + + def server_info_query(self, tvm_version): + return server.ServerInfo( + platform_name="gemmini", + is_template=IS_TEMPLATE, + model_library_format_path="" if IS_TEMPLATE else MODEL_LIBRARY_FORMAT_PATH, + project_options=PROJECT_OPTIONS, + ) + + def _copy_project_files(self, 
api_server_dir, project_dir, project_type):
+        """Copies the files for project_type into project_dir.
+
+        Notes
+        -----
+        template_dir is NOT a project type, and that directory is never copied
+        in this function. template_dir only holds this file and its unit tests,
+        so this file is copied separately in generate_project.
+
+        """
+        for item in (API_SERVER_DIR / "src" / project_type).iterdir():
+            dest = project_dir / "src" / item.name
+            if item.is_dir():
+                shutil.copytree(item, dest)
+            else:
+                shutil.copy2(item, dest)
+
+    CRT_COPY_ITEMS = ("include", "src")
+
+    def _copy_standalone_crt(self, source_dir, standalone_crt_dir):
+        output_crt_dir = source_dir / "standalone_crt"
+        for item in self.CRT_COPY_ITEMS:
+            src_path = os.path.join(standalone_crt_dir, item)
+            dst_path = output_crt_dir / item
+            if os.path.isdir(src_path):
+                shutil.copytree(src_path, dst_path)
+            else:
+                shutil.copy2(src_path, dst_path)
+
+    # Example project is the "minimum viable project",
+    # and doesn't need a fancy RPC server
+    EXAMPLE_PROJECT_UNUSED_COMPONENTS = []
+
+    def _remove_unused_components(self, source_dir, project_type):
+        unused_components = []
+        if project_type == "example_project":
+            unused_components = self.EXAMPLE_PROJECT_UNUSED_COMPONENTS
+
+        for component in unused_components:
+            shutil.rmtree(source_dir / "standalone_crt" / component)
+
+    def _disassemble_mlf(self, mlf_tar_path, source_dir):
+        with tempfile.TemporaryDirectory() as mlf_unpacking_dir_str:
+            mlf_unpacking_dir = pathlib.Path(mlf_unpacking_dir_str)
+            with tarfile.open(mlf_tar_path, "r:") as tar:
+                tar.extractall(mlf_unpacking_dir)
+
+            model_dir = source_dir / "model"
+            model_dir.mkdir()
+
+            # Copy C files from model. The filenames and quantity
+            # depend on the target string, so we just copy all C files
+            source_dir = mlf_unpacking_dir / "codegen" / "host" / "src"
+            for file in source_dir.rglob(f"*.c"):
+                shutil.copy(file, model_dir)
+
+            source_dir = mlf_unpacking_dir / "codegen" / "host" / "include"
+            for file in source_dir.rglob(f"*.h"):
+                shutil.copy(file, model_dir)
+
+            # Return metadata.json for use in templating
+            with open(os.path.join(mlf_unpacking_dir, "metadata.json")) as f:
+                metadata = json.load(f)
+            return metadata
+
+    def _template_model_header(self, source_dir, metadata):
+        with open(source_dir / "model.h", "r") as f:
+            model_h_template = Template(f.read())
+
+        assert (
+            metadata["style"] == "full-model"
+        ), "when generating AOT, expect only full-model Model Library Format"
+
+        template_values = {
+            "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][
+                "workspace_size_bytes"
+            ],
+        }
+
+        with open(source_dir / "model.h", "w") as f:
+            f.write(model_h_template.substitute(template_values))
+
+    # Arduino ONLY recognizes .ino, .cpp, .c, .h
+
+    CPP_FILE_EXTENSION_SYNONYMS = ("cc", "cxx")
+
+    def _change_cpp_file_extensions(self, source_dir):
+        for ext in self.CPP_FILE_EXTENSION_SYNONYMS:
+            for filename in source_dir.rglob(f"*.{ext}"):
+                filename.rename(filename.with_suffix(".cpp"))
+
+        for filename in source_dir.rglob(f"*.inc"):
+            filename.rename(filename.with_suffix(".h"))
+
+    def _convert_includes(self, project_dir, source_dir):
+        """Changes all #include statements in project_dir to be relative to their
+        containing file's location.
+
+        Arduino only supports includes relative to a file's location, so this
+        function finds each time we #include a file and changes the path to
+        be relative to the file location. Does not do this for standard C
+        libraries. Also changes angle brackets syntax to double quotes syntax.
+
+        See Also
+        -----
+        https://www.arduino.cc/reference/en/language/structure/further-syntax/include/
+
+        """
+        for ext in ("c", "h", "cpp"):
+            for filename in source_dir.rglob(f"*.{ext}"):
+                with filename.open("rb") as src_file:
+                    lines = src_file.readlines()
+                with filename.open("wb") as dst_file:
+                    for i, line in enumerate(lines):
+                        line_str = str(line, "utf-8")
+                        # Check if line has an include
+                        result = re.search(r"#include\s*[<\"]([^>]*)[>\"]", line_str)
+                        if not result:
+                            dst_file.write(line)
+                        else:
+                            new_include = self._find_modified_include_path(
+                                project_dir, filename, result.groups()[0]
+                            )
+                            updated_line = f'#include "{new_include}"\n'
+                            dst_file.write(updated_line.encode("utf-8"))
+
+    # Most of the files we used to be able to point to directly are under "src/standalone_crt/include/".
+    # However, crt_config.h lives under "src/standalone_crt/crt_config/", and more exceptions might
+    # be added in the future.
+    POSSIBLE_BASE_PATHS = ["src/standalone_crt/include/", "src/standalone_crt/crt_config/"]
+
+    def _find_modified_include_path(self, project_dir, file_path, include_path):
+        """Takes a single #include path, and returns the location it should point to.
+
+        Examples
+        --------
+        >>> _find_modified_include_path(
+        ...     "/path/to/project/dir",
+        ...     "/path/to/project/dir/src/standalone_crt/src/runtime/crt/common/ndarray.c",
+        ...     "tvm/runtime/crt/platform.h",
+        ... )
+        "../../../../../../src/standalone_crt/include/tvm/runtime/crt/platform.h"
+
+        """
+        if include_path.endswith(".inc"):
+            include_path = re.sub(r"\.[a-z]+$", ".h", include_path)
+
+        # Change includes referencing .cc and .cxx files to point to the renamed .cpp file
+        if include_path.endswith(self.CPP_FILE_EXTENSION_SYNONYMS):
+            include_path = re.sub(r"\.[a-z]+$", ".cpp", include_path)
+
+        # If the include already works, don't modify it
+        if (file_path.parents[0] / include_path).exists():
+            return include_path
+
+        relative_path = file_path.relative_to(project_dir)
+        up_dirs_path = "../" * str(relative_path).count("/")
+
+        for base_path in self.POSSIBLE_BASE_PATHS:
+            full_potential_path = project_dir / base_path / include_path
+            if full_potential_path.exists():
+                return up_dirs_path + base_path + include_path
+
+        # If we can't find the file, just leave it untouched
+        # It's probably a standard C/C++ header
+        return include_path
+
+    def _copy_standalone_crt_makefiles(self, api_server_dir, source_dir):
+        print(source_dir)
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefile",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefile.in",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/Makefrag",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/build.sh",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/configure.ac",
+            source_dir,
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini_nn.h",
+            source_dir / "include/gemmini_nn.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini_testutils.h",
+            source_dir / "include/gemmini_testutils.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/include/gemmini.h",
+            source_dir / "include/gemmini.h",
+        )
+        shutil.copy2(
+            api_server_dir / "src/example_project/rocc-software/src/xcustom.h",
+            source_dir / "rocc-software/src/xcustom.h",
+        )
+
+    def _copy_debug_data_files(self, project_dir):
+        if os.path.isdir(str(project_dir / ".." / "include")):
+            copy_tree(str(project_dir / ".." 
/ "include"), str(project_dir / "src" / "model")) + + def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): + + # Reference key directories with pathlib + project_dir = pathlib.Path(project_dir) + project_dir.mkdir() + source_dir = project_dir / "src" + source_dir.mkdir() + + # Copies files from the template folder to project_dir + shutil.copy2(API_SERVER_DIR / "microtvm_api_server.py", project_dir) + self._copy_project_files(API_SERVER_DIR, project_dir, options["project_type"]) + + # Copy standalone_crt into src folder + self._copy_standalone_crt(source_dir, standalone_crt_dir) + self._remove_unused_components(source_dir, options["project_type"]) + + # Populate crt-config.h + crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config" + crt_config_dir.mkdir() + shutil.copy2( + API_SERVER_DIR / "crt_config" / "crt_config.h", crt_config_dir / "crt_config.h" + ) + + # Unpack the MLF and copy the relevant files + # extract_path = os.path.splitext(model_library_format_path)[0] + # with tarfile.TarFile(model_library_format_path) as tf: + # os.makedirs(project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + # tf.extractall(path=project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + metadata = self._disassemble_mlf(model_library_format_path, source_dir) + shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + + self._copy_debug_data_files(project_dir) + # For AOT, template model.h with metadata to minimize space usage + # if options["project_type"] == "example_project": + # self._template_model_header(source_dir, metadata) + + # Copy makefiles to treat standalone crt code as RIOT modules + # self._copy_standalone_crt_makefiles(API_SERVER_DIR, source_dir) + + self._change_cpp_file_extensions(source_dir) + + # Recursively change includes + self._convert_includes(project_dir, source_dir) + + def build(self, options): + subprocess.call( + "source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",), + shell=True, + executable="/bin/bash", + ) + # os.system("source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",)) + + def flash(self, options): + test_name = options["project_type"].split("_")[0] + subprocess.call( + "source %s && cd src/build && spike --extension=gemmini %s" + % (os.environ["CHIPYARD_HOME"] + "/env.sh", test_name + "-baremetal"), + shell=True, + executable="/bin/bash", + ) + # os.system("source %s && cd src/build && spike --extension=gemmini %s" % (os.environ["CHIPYARD_HOME"] + "/env.sh",test_name + "-baremetal",)) + # if logging.root.level == logging.DEBUG: + # os.system("cd src/build && spike --extension=gemmini ") + # else: + # os.system("cd src && make flash -s > /dev/null") + + def open_transport(self, options): + pass + + def close_transport(self): + pass + + def read_transport(self, n, timeout_sec): + pass + + def write_transport(self, data, timeout_sec): + pass + + +if __name__ == "__main__": + server.main(Handler()) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile.in b/apps/microtvm/gemmini/template_project/src/Makefile.in new file mode 100644 index 000000000000..ed017cc918ce --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefile.in @@ -0,0 +1,34 @@ +prefix := @prefix@ +abs_top_srcdir := @abs_top_srcdir@ +XLEN := @XLEN@ +RISCVTOOLS := @RISCVTOOLS@ +ROCC = examples +RUNNER := "spike --extension=gemmini " + +.PHONY: all bareMetalC clean +all: bareMetalC + +vars = \ + abs_top_srcdir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-$@ 
\ + src_dir=$(abs_top_srcdir) \ + RISCVTOOLS=$(RISCVTOOLS) + +bareMetalC: + $(MAKE) -f $(abs_top_srcdir)/Makefile $(vars) + +clean: + $(MAKE) -f $(abs_top_srcdir)/Makefile abs_top_srcdir=$(abs_top_srcdir) PREFIX=$(ROCC)-bareMetalC clean + +test-baremetal-bareMetalC: + make \ + -f $(abs_top_srcdir)/Makefile \ + TARGET_MAKEFILE=$(abs_top_srcdir)/Makefile \ + abs_top_srcdir=$(abs_top_srcdir) \ + src_dir=$(abs_top_srcdir) \ + XLEN=$(XLEN) \ + PREFIX=$(ROCC)-bareMetalC \ + RISCVTOOLS=$(RISCVTOOLS) \ + RUNNER=$(RUNNER) \ + run-baremetal diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag b/apps/microtvm/gemmini/template_project/src/Makefrag new file mode 100644 index 000000000000..a60184526081 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/Makefrag @@ -0,0 +1,25 @@ +XLEN ?= 64 + +CC_BAREMETAL := riscv$(XLEN)-unknown-elf-gcc + +CC_LINUX_PRESENT := $(shell command -v riscv$(XLEN)-unknown-linux-gnu-gcc 2> /dev/null) + +# Support Linux gcc from riscv-gnu-toolchain and from system packages +# riscv64-unknown-linux-gnu-gcc is built from riscv-gnu-toolchain, comes with Firesim's tools +# riscv64-linux-gnu-gcc comes from a system package +ifdef CC_LINUX_PRESENT + CC_LINUX := riscv$(XLEN)-unknown-linux-gnu-gcc +else + CC_LINUX := riscv$(XLEN)-linux-gnu-gcc +endif + +ENV_P = $(abs_top_srcdir)/riscv-tests/env/p +ENV_V = $(abs_top_srcdir)/riscv-tests/env/v + +.PHONY: all clean default + +default: all +src_dir = . + +clean: + rm -rf $(junk) diff --git a/apps/microtvm/gemmini/template_project/src/add.c b/apps/microtvm/gemmini/template_project/src/add.c new file mode 100644 index 000000000000..13aeb1a80e3f --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/add.c @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +int8_t output_add[output_len]; + +int main() { + printf("Starting add test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + struct tvmgen_default_inputs inputs; + inputs.serving_default_x_0 = input_1; + inputs.serving_default_y_0 = input_2; + struct tvmgen_default_outputs outputs; + outputs.PartitionedCall_0 = output_add; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + for (int i = 0; i < output_len; i++) { + if (output_add[i] != output[i]) { + error_counter += 1; + printf("ERROR IN ADD EXAMPLE! 
output_add[%d] (%d) != output[%d] (%d)\r\n", i, output_add[i],
+             i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow for a very small percentage of errors, which could be caused by rounding
+  float error_perc = ((float)error_counter / output_len) * 100;
+  if (error_perc < 1)
+    printf("SUCCESS! (error_counter = %d)\r\n", error_counter);
+  else
+    printf("FAIL! (error_counter = %d)\r\n", error_counter);
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d.c
new file mode 100644
index 000000000000..22f1bcb1d281
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/conv2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_conv[output_len];
+
+int main() {
+  printf("Starting conv2d test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_conv2d_input_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_conv;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_conv[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_conv[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow for a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/dense.c b/apps/microtvm/gemmini/template_project/src/dense.c
new file mode 100644
index 000000000000..414eeac88020
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/dense.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_gemm[output_len];
+
+int main() {
+  printf("Starting dense test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_x_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_gemm;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_gemm[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN DENSE EXAMPLE! output_gemm[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_gemm[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow for a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d.c
new file mode 100644
index 000000000000..ee125e2fdc25
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/dwconv2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_conv[output_len];
+
+int main() {
+  printf("Starting dw conv2d test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_depthwise_conv2d_input_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.StatefulPartitionedCall_0 = output_conv;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
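+  // As in the other single-operator tests, compare the accelerator output
+  // element-wise against the pre-computed reference from model/outputs.h,
+  // counting mismatches instead of aborting on the first difference.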
+  for (int i = 0; i < output_len; i++) {
+    if (output_conv[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN DW CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_conv[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // We allow for a very small percentage of errors, which could be caused by rounding
+  if ((((float)error_counter / output_len) * 100) < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile
new file mode 100644
index 000000000000..2c997cea1a80
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile
@@ -0,0 +1,68 @@
+include $(abs_top_srcdir)/Makefrag
+
+tests = \
+	add \
+
+tests_baremetal = $(tests:=-baremetal)
+
+ifeq ($(findstring spike,$(RUNNER)),spike)
+# Currently don't support conv or conv-with-pool on spike
+runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal)))
+else
+# Don't run very long benchmarks for RTL sim
+runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal)))
+endif
+
+RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests
+BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common
+GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h
+STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt
+
+CFLAGS := $(CFLAGS) \
+	-DPREALLOCATE=1 \
+	-DMULTITHREAD=1 \
+	-mcmodel=medany \
+	-std=gnu99 \
+	-O2 \
+	-ffast-math \
+	-fno-common \
+	-fno-builtin-printf \
+	-march=rv64gc -Wa,-march=rv64gcxhwacha \
+	-lm \
+	-lgcc \
+	-I${RISCV_TESTS} \
+	-I${RISCV_TESTS}/env \
+	-I$(abs_top_srcdir) \
+	-I$(abs_top_srcdir)/include \
+	-I$(BENCH_COMMON) \
+	-DID_STRING=$(ID_STRING) \
+	-DPRINT_TILE=0 \
+
+CFLAGS_BAREMETAL := \
+	$(CFLAGS) \
+	-nostdlib \
+	-nostartfiles \
+	-static \
+	-T $(BENCH_COMMON)/test.ld \
+	-DBAREMETAL=1 \
+
+all: $(tests_baremetal)
+
+vpath %.c $(src_dir)
+
+%-baremetal: %.c $(GEMMINI_HEADERS)
+	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
+		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
+# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
+	$(LIBS)
+
+run-baremetal: $(runs_baremetal)
+
+%-baremetal.run: %-baremetal
+	$(RUNNER)$(abs_top_srcdir)/build/$^
+
+junk += $(tests_baremetal)
+
diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile
new file mode 100644
index 000000000000..f80da67c3f98
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile
@@ -0,0 +1,68 @@
+include $(abs_top_srcdir)/Makefrag
+
+tests = \
+	conv2d \
+
+tests_baremetal = $(tests:=-baremetal)
+
+ifeq ($(findstring spike,$(RUNNER)),spike)
+# Currently don't support conv or conv-with-pool on spike
+runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal)))
+else
+# Don't run very long benchmarks for RTL sim
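+# (filter-out drops the long-running tiled_matmul variants from the test list,
+#  and addsuffix turns each remaining test into a <name>-baremetal.run target)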
+runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile new file mode 100644 index 000000000000..0b1932ceef91 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + dense \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + 
$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile new file mode 100644 index 000000000000..fa89e5be162d --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + dwconv2d \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile new file mode 100644 index 000000000000..1218e9e67a96 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + maxpool2d \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = 
$(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ + +all: $(tests_baremetal) + +vpath %.c $(src_dir) + +%-baremetal: %.c $(GEMMINI_HEADERS) + $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) +# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ + $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ + $(LIBS) + +run-baremetal: $(runs_baremetal) + +%-baremetal.run: %-baremetal + $(RUNNER)$(abs_top_srcdir)/build/$^ + +junk += $(tests_baremetal) + diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile new file mode 100644 index 000000000000..b6d977550097 --- /dev/null +++ b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile @@ -0,0 +1,68 @@ +include $(abs_top_srcdir)/Makefrag + +tests = \ + mobilenet \ + +tests_baremetal = $(tests:=-baremetal) + +ifeq ($(findstring spike,$(RUNNER)),spike) +# Currently don't support conv or conv-with-pool on spike +runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) +else +# Don't run very long benchmarks for RTL sim +runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) +endif + +RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests +BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common +GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h +STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt + +CFLAGS := $(CFLAGS) \ + -DPREALLOCATE=1 \ + -DMULTITHREAD=1 \ + -mcmodel=medany \ + -std=gnu99 \ + -O2 \ + -ffast-math \ + -fno-common \ + -fno-builtin-printf \ + -march=rv64gc -Wa,-march=rv64gcxhwacha \ + -lm \ + -lgcc \ + -I${RISCV_TESTS} \ + -I${RISCV_TESTS}/env \ + -I$(abs_top_srcdir) \ + -I$(abs_top_srcdir)/include \ + -I$(BENCH_COMMON) \ + -DID_STRING=$(ID_STRING) \ + -DPRINT_TILE=0 \ + +CFLAGS_BAREMETAL := \ + $(CFLAGS) \ + -nostdlib \ + -nostartfiles \ + -static \ + -T $(BENCH_COMMON)/test.ld \ + -DBAREMETAL=1 \ 
+
+all: $(tests_baremetal)
+
+vpath %.c $(src_dir)
+
+%-baremetal: %.c $(GEMMINI_HEADERS)
+	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
+		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
+# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
+	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
+	$(LIBS)
+
+run-baremetal: $(runs_baremetal)
+
+%-baremetal.run: %-baremetal
+	$(RUNNER)$(abs_top_srcdir)/build/$^
+
+junk += $(tests_baremetal)
+
diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d.c
new file mode 100644
index 000000000000..8f508333c492
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/maxpool2d.c
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "assert.h"
+#include "stddef.h"
+#include "stdint.h"
+#include "stdio.h"
+#include "stdlib.h"
+#ifndef BAREMETAL
+#include "sys/mman.h"
+#endif
+#include "model/inputs.h"
+#include "model/outputs.h"
+#include "model/tvmgen_default.h"
+
+int8_t output_maxpool2d[output_len];
+
+int main() {
+  printf("Starting max pooling 2D test...\r\n");
+#ifndef BAREMETAL
+  if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+    perror("mlockall failed");
+    exit(1);
+  }
+#endif
+
+  struct tvmgen_default_inputs inputs;
+  inputs.serving_default_x_0 = input;
+  struct tvmgen_default_outputs outputs;
+  outputs.PartitionedCall_0 = output_maxpool2d;
+  int error_counter = 0;
+
+  tvmgen_default_run(&inputs, &outputs);
+
+  // Look for errors!
+  for (int i = 0; i < output_len; i++) {
+    if (output_maxpool2d[i] != output[i]) {
+      error_counter += 1;
+      printf("ERROR IN MAX POOL 2D EXAMPLE! output_maxpool2d[%d] (%d) != output[%d] (%d)\r\n", i,
+             output_maxpool2d[i], i, output[i]);
+      // exit(1);
+    }
+  }
+
+  // Allow a very small percentage of mismatches, which can be caused by rounding.
+  if (((float)error_counter / output_len) * 100 < 1)
+    printf("SUCCESS!\r\n");
+  else
+    printf("FAIL!\r\n");
+  exit(0);
+}
diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet.c
new file mode 100644
index 000000000000..45b606004653
--- /dev/null
+++ b/apps/microtvm/gemmini/template_project/src/mobilenet.c
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "assert.h" +#include "stddef.h" +#include "stdint.h" +#include "stdio.h" +#include "stdlib.h" +#ifndef BAREMETAL +#include "sys/mman.h" +#endif +#include "model/inputs.h" +#include "model/outputs.h" +#include "model/tvmgen_default.h" + +uint8_t output_pred[1001]; + +int argmax(uint8_t* vec) { + int idx = 0; + uint8_t max_value = 0; + for (int i = 0; i < 1001; i++) { + if (vec[i] > max_value) { + idx = i; + max_value = vec[i]; + } + } + return idx; +} + +void get_top_5_labels(int* top_5, uint8_t* predicted_output) { + uint8_t prev_max_value = (uint8_t)255; + uint8_t current_max_value = 0; + int idx = 0; + for (int i = 0; i < 5; i++) { + current_max_value = 0; + idx = 0; + for (int j = 0; j < 1001; j++) { + if ((predicted_output[j] > current_max_value) && (predicted_output[j] < prev_max_value)) { + current_max_value = predicted_output[j]; + idx = j; + } + } + top_5[i] = idx; + prev_max_value = current_max_value; + } +} + +int main() { + printf("Starting MobileNet test...\r\n"); +#ifndef BAREMETAL + if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + perror("mlockall failed"); + exit(1); + } +#endif + + int top_5_labels[5]; + + struct tvmgen_default_inputs inputs; + inputs.input = input; + struct tvmgen_default_outputs outputs; + outputs.MobilenetV2_Predictions_Reshape = output_pred; + int error_counter = 0; + + tvmgen_default_run(&inputs, &outputs); + + // Look for errors! + /*for(int i = 0; i < output_len; i++) + { + if(output_pred[i] != output[i]) +{ +error_counter += 1; +printf("ERROR IN MOBILENET EXAMPLE! output_pred[%d] (%d) != output[%d] +(%d)\r\n",i,(int)output_pred[i],i,(int)output[i]); +//exit(1); +} + }*/ + + get_top_5_labels(top_5_labels, output_pred); + + printf("Real Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)top_5_labels[i]); + printf("]\r\n"); + + printf("Expected Top-5 output labels: [ "); + for (int i = 0; i < 5; i++) printf("%d ", (int)output[i]); + printf("]\r\n"); + + /*for(int i = 0; i < 5; i++) + { + if(top_5_labels[i] != output[i]) + { + error_counter += 1; + printf("ERROR IN MOBILENET EXAMPLE! 
top_5_labels[%d] (%d) != output[%d] + (%d)\r\n",i,(int)top_5_labels[i],i,(int)output[i]); + //exit(1); + } + }*/ + + // printf("SUCCESS!\r\n"); + exit(0); + + // Take the argmax to get the predicted label, and the expected label + /*int predicted_label = argmax(output_pred); + int expected_label = argmax(output); + printf("Expected label = %d\r\n",expected_label); + printf("Predicted label = %d\r\n",predicted_label); + if(expected_label == predicted_label) printf("SUCCESS!\r\n"); + else printf("FAILED!\r\n"); + exit(0);*/ +} diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake new file mode 100644 index 000000000000..4b73d183ddc1 --- /dev/null +++ b/cmake/modules/contrib/Gemmini.cmake @@ -0,0 +1,117 @@ +if(USE_MICRO) + message(STATUS "Add Gemmini for microTVM") + + function(microtvm_add_gemmini) + list( + APPEND + GEMMINI_FILE_COPY_JOBS + "apps/microtvm/gemmini/template_project microtvm_api_server.py -> gemmini" + "apps/microtvm/gemmini/template_project/crt_config *.h -> gemmini/crt_config" + + # Dense example project generation + "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src/makefiles/dense Makefile -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dense_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dense_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dense_example/rocc-software/src" + + # CONV2D example project generation + "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/conv2d Makefile -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/conv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/conv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/conv2d_example/rocc-software/src" + + # DW CONV2D example project generation + "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d Makefile -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dwconv2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dwconv2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dwconv2d_example/rocc-software/src" + + # ADD example project generation + "apps/microtvm/gemmini/template_project/src add.c -> 
gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src/makefiles/add Makefile -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/add_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/add_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/add_example/rocc-software/src" + + # Max pooling 2d example project generation + "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d Makefile -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/maxpool2d_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/maxpool2d_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/maxpool2d_example/rocc-software/src" + + # Mobilenet example project generation + "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src/makefiles/mobilenet Makefile -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/mobilenet_example" + "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/mobilenet_example/include" + "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/mobilenet_example/rocc-software/src" + ) + + foreach(job_spec IN LISTS GEMMINI_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message( + FATAL_ERROR + "Gemmini copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" + ) + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + file( + GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message( + FATAL_ERROR + "Gemmini copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" + ) + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component( + dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" + 
ABSOLUTE) + tvm_micro_add_copy_file(gemmini_template_deps + ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(gemmini DEPENDS ${gemmini_template_deps}) + endfunction() + + microtvm_add_gemmini() + +endif(USE_MICRO) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f1c14c3cd914..413daf430ed5 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -500,9 +500,21 @@ def _build_func_common(measure_input, runtime=None, checks=None, build_option=No target, task, config = measure_input target, task.target_host = Target.canon_target_and_host(target, task.target_host) checks = checks or {} + with target: s, args = task.instantiate(config) + # if target is gemmini, we need to use gemmini build + if ( + hasattr(measure_input.target, "device_name") + and measure_input.target.device_name == "gemmini" + ): + # pylint: disable=import-outside-toplevel + import tvm.contrib.gemmini as gemmini + + func = gemmini.build(s, args, target=measure_input.target, runtime=runtime) + return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) + # check invalidity of template and code hash consistency if not config.valid(): raise InstantiationError(config.errors) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..9515769fd641 --- /dev/null +++ b/python/tvm/contrib/gemmini/__init__.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini package is a TVM backend extension to support the Gemmini hardware accelerator +===================== +**Author**: `Federico Peccia `_ +""" + +import sys +import tvm._ffi.base + +from .environment import Environment +from .build_module import build_config, lower, build, preprocess_pass +from tvm.relay.backend.contrib.gemmini import * +from .helpers import create_header_file +from .utils import * + +__version__ = "0.1.0" diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py new file mode 100644 index 000000000000..a094147b7a14 --- /dev/null +++ b/python/tvm/contrib/gemmini/build_module.py @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Helpers and functions related to the build process to generate code for the Gemmini accelerator
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import tvm
+
+from .environment import Environment
+from .transform import *
+from tvm import relay
+from .legalize import LegalizeGemmini
+
+
+def preprocess_pass(mod):
+    """Preprocessing pass required to use the Gemmini accelerator: it merges the
+    operator patterns supported by Gemmini into composite functions and legalizes them.
+
+    Args:
+        mod (tvm.ir.IRModule): IRModule to preprocess
+
+    Returns:
+        tvm.ir.IRModule: preprocessed IRModule
+    """
+
+    # First, merge all dw and convs that can be merged!
+    pattern = relay.op.contrib.get_pattern_table("gemmini")
+
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.ConvertLayout({"qnn.conv2d": ["NHWC", "HWIO"]})(mod)
+    mod = relay.transform.SimplifyExpr()(mod)
+    mod = relay.transform.MergeComposite(pattern)(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.SimplifyExpr()(mod)
+    mod = LegalizeGemmini()(mod)
+    mod = relay.transform.InferType()(mod)
+    return mod
+
+
+def internal_build_configs(usmp_alg=""):
+    """Builds the internal configurations for the build process
+
+    Args:
+        usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "".
+
+    Returns:
+        dict: configurations
+    """
+    enable_usmp = usmp_alg != ""
+    pass_list = [
+        (0, tvm.tir.transform.StorageFlatten(16)),
+        (1, InjectAMVINIntrin()),
+        (1, InjectAMVINIntrinTransposed()),
+        (1, InjectBMVINIntrin()),
+        (1, InjectBMVINIntrinTransposed()),
+        (1, InjectCMVOUTIntrin()),
+        (1, InjectCMVOUTIntrinTransposed()),
+        (1, InjectDMVINIntrin()),
+        (1, InjectDMVINIntrinTransposed()),
+        (1, InjectCMVINIntrin()),
+        (1, InjectCMVINIntrinTransposed()),
+        (1, InjectCMVINAccumIntrin()),
+        (1, InjectCMVINAccumIntrinTransposed()),
+        (1, tvm.tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers()),
+        (2, tvm.tir.transform.LowerDeviceStorageAccessInfo()),
+        (4, InsertGemminiHeaderOperators()),
+        (5, InsertGemminiFenceOperator()),
+    ]
+
+    return {
+        "tir.add_lower_pass": pass_list,
+        "tir.disable_vectorize": True,
+        # "tir.CorrectGemminisScratchpadAndAccumulatorPointers": {"dim": env.DIM}
+        "tir.usmp.enable": enable_usmp,
+        "tir.usmp.algorithm": usmp_alg,
+    }
+
+
+def build_config(usmp_alg="", **kwargs):
+    """Creates the PassContext needed by the build process to correctly build the Gemmini operators
+
+    Args:
+        usmp_alg (str, optional): Which USMP algorithm to use. Defaults to "".
+
+    Returns:
+        tvm.transform.PassContext: PassContext with specific configurations
+    """
+
+    config = internal_build_configs(usmp_alg)
+    if kwargs.get("config"):
+        config.update(kwargs["config"])
+        del kwargs["config"]
+
+    return tvm.transform.PassContext(config=config, **kwargs)
+
+
+def lower(*args, **kwargs):
+    """Thin wrapper of tvm.lower
+
+    This wrapper automatically applies Gemmini's build_config
+    if there is no user specified build_config in context.
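+
+    Example
+    -------
+    A minimal sketch with hypothetical shapes (illustrative only)::
+
+        import tvm
+        from tvm import te
+        from tvm.contrib import gemmini
+
+        A = te.placeholder((16, 16), dtype="int8", name="A")
+        B = te.placeholder((16, 16), dtype="int8", name="B")
+        k = te.reduce_axis((0, 16), name="k")
+        C = te.compute((16, 16), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k))
+        s = te.create_schedule(C.op)
+        # Gemmini's build_config is applied automatically because no
+        # user build_config is active in this context:
+        mod = gemmini.lower(s, [A, B, C])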
+
+    See Also
+    --------
+    tvm.lower : The original TVM's lower function
+    """
+    pass_ctx = tvm.transform.PassContext.current()
+    if not pass_ctx.config.get("tir.add_lower_pass"):
+        with build_config():
+            return tvm.lower(*args, **kwargs)
+    return tvm.lower(*args, **kwargs)
+
+
+def build(*args, **kwargs):
+    """Thin wrapper of tvm.build
+
+    This wrapper automatically applies Gemmini's build_config
+    if there is no user specified build_config in context.
+
+    See Also
+    --------
+    tvm.build : The original TVM's build function
+    """
+    pass_ctx = tvm.transform.PassContext.current()
+    if not pass_ctx.config.get("tir.add_lower_pass"):
+        with build_config():
+            return tvm.build(*args, **kwargs)
+    return tvm.build(*args, **kwargs)
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_scope)
+def mem_info_inp_buffer():
+    """Creates the information about the local.scratchpad memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.inp_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.INP_SCR_ROWS * spec.DIM * spec.inp_bits),
+        # head_address=tvm.runtime.const(spec.INP_SCR_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_wgt_scope)
+def mem_info_wgt_buffer():
+    """Creates the information about the local.scratchpad_weight memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.wgt_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.WGT_SCR_ROWS * spec.DIM * spec.wgt_bits),
+        # head_address=tvm.runtime.const(spec.WGT_SCR_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
+
+
+# The memory information for the compiler
+@tvm.register_func("tvm.info.mem.%s" % Environment.instance().acc_scope)
+def mem_info_acc_buffer():
+    """Creates the information about the local.accumulator memory node
+
+    Returns:
+        node: The corresponding MemoryInfo node
+    """
+    spec = Environment.instance()
+    return tvm.ir.make_node(
+        "MemoryInfo",
+        unit_bits=spec.inp_bits,
+        max_simd_bits=spec.DIM,
+        max_num_bits=int(spec.ACC_ROWS * spec.DIM * spec.inp_bits),
+        # head_address=tvm.runtime.const(spec.OUT_ACC_BASE_ADDRESS, "uint32"),
+        head_address=None,
+    )
diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py
new file mode 100644
index 000000000000..7d6350d1ebb9
--- /dev/null
+++ b/python/tvm/contrib/gemmini/environment.py
@@ -0,0 +1,386 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, exec-used
+"""
+Environment declaration.
Contains Gemminis hardware parameters. +===================== +**Author**: `Federico Peccia `_ +""" + +from __future__ import absolute_import as _abs +from .intrin import ( + gemm, + gemm_cisc, + conv2d_cisc, + dw_conv2d_cisc, + add_tensorize, + add_mvout_tensorize, +) +import re +from pydevicetree import Devicetree +import os +import tvm +import sys +from typing import List, Tuple, Dict, Callable +from .utils import counters + + +class Environment(object): + """Hardware configuration object. + + This object contains all the information + needed for compiling to a specific Gemmini backend. + + """ + + _instance = None + + @classmethod + def init_overwrite( + cls, + batch=1, + dim=32, + max_bytes=64, + inp_dtype="int8", + wgt_dtype="int8", + acc_dtype="int32", + acc_rows=4096, + bank_rows=8192, + bank_num=4, + debug=False, + enabled_counters: Dict = {}, + supports_non_zero_padding: bool = False, + use_experimental_qnn_add: bool = False, + ): + """Overwrites the init function + + Args: + batch (int, optional): Batch size. Defaults to 1. + dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. + max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64. + inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8". + acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. + bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False. + enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty. + supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False. + """ + inst = Environment.instance() + inst.init( + batch=batch, + dim=dim, + max_bytes=max_bytes, + inp_dtype=inp_dtype, + wgt_dtype=wgt_dtype, + acc_dtype=acc_dtype, + acc_rows=acc_rows, + bank_rows=bank_rows, + bank_num=bank_num, + debug=debug, + enabled_counters=enabled_counters, + supports_non_zero_padding=supports_non_zero_padding, + use_experimental_qnn_add=use_experimental_qnn_add, + ) + + @classmethod + def instance(cls): + """Returns the current instance + + Returns: + _type_: _description_ + """ + if cls._instance is None: + cls._instance = cls.__new__(cls) + cls._instance.init() + return cls._instance + + def init( + self, + batch=1, + dim=16, + max_bytes=64, + inp_dtype="int8", + wgt_dtype="int8", + acc_dtype="int32", + acc_rows=1024, + bank_rows=4096, + bank_num=4, + debug=False, + enabled_counters: Dict = {}, + supports_non_zero_padding: bool = False, + use_experimental_qnn_add: bool = False, + ): + """_summary_ + + Args: + batch (int, optional): Batch size. Defaults to 1. + dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. + max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64. + inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8". 
+ wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8". + acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. + bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False. + enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty. + supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False. + """ + + assert batch == 1, "Only batch size of 1 is currently supported" + self.debug = debug + + self.BATCH = batch + self.DIM = dim + self.MAX_BYTES = max_bytes + + self.inp_dtype = inp_dtype + self.wgt_dtype = wgt_dtype + self.acc_dtype = acc_dtype + + self.inp_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.inp_dtype).group( + "width_bits" + ) + ) + self.wgt_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.wgt_dtype).group( + "width_bits" + ) + ) + self.acc_bits = int( + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", self.acc_dtype).group( + "width_bits" + ) + ) + + self.size_elem = int(self.inp_bits / 8) + self.size_acc = int(self.acc_bits / 8) + + self.ACC_ROWS = acc_rows + self.BANK_ROWS = bank_rows + self.BANK_NUM = bank_num + + self.WGT_SCR_BASE_ADDRESS = int(self.BANK_ROWS * self.BANK_NUM * 2 / 4) + self.WGT_SCR_ROWS = self.BANK_ROWS * self.BANK_NUM - self.WGT_SCR_BASE_ADDRESS + self.INP_SCR_BASE_ADDRESS = 0 + self.INP_SCR_ROWS = self.WGT_SCR_BASE_ADDRESS + self.OUT_ACC_BASE_ADDRESS = 0xC0000000 + + self.MAX_BLOCK_LEN = int(self.MAX_BYTES / self.DIM) + if self.DIM * self.size_acc <= self.MAX_BYTES: + self.MAX_BLOCK_LEN_ACC = int(self.MAX_BYTES / (self.DIM * self.size_acc)) + else: + self.MAX_BLOCK_LEN_ACC = 1 + + self.scr_scope = "local.scratchpad" + self.acc_scope = "local.accumulator" + # TODO (FP): check this scratchpad_weight. Actually, only one scratchpad should exist, but we do this logical partition to correctly manage the pointers to the buffers stored in this memories. Should see how we can fix this in the future. + self.scr_wgt_scope = "local.scratchpad_weight" + + self.A_mvin = "A_mvin" + self.B_mvin = "B_mvin" + self.D_mvin = "D_mvin" + self.C_mvin = "C_mvin" + self.C_mvin_accum = "C_mvin_accum" + self.C_mvout = "C_mvout" + self.C_mvout_acc_dtype = "C_mvout_acc_dtype" + + self.WEIGHT_STATIONARY = 1 + self.OUTPUT_STATIONARY = 0 + + self.mvin_scale_identity = 1.0 + self.max_matrix = 64 + + self.supports_non_zero_padding = supports_non_zero_padding + self.use_experimental_qnn_add = use_experimental_qnn_add + + self.enabled_counters = enabled_counters if bool(enabled_counters) else counters + # Check that all enabled counters exist in the actual counters from Gemmini + for key, value in self.enabled_counters.items(): + assert ( + self.enabled_counters[key] == counters[key] + ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!" 
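+
+    # Example (hypothetical values): the singleton can be re-configured before
+    # lowering, e.g. for a 16x16 systolic array:
+    #
+    #   Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+    #   env = Environment.instance()
+    #   assert env.DIM == 16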
+ + def gemm( + self, + I: int, + K: int, + J: int, + stride: int = 1, + is_depthwise_conv2d: bool = False, + mode: int = 1, + accum_patch=None, + ) -> Callable: + """Wrapper to expose the gemm intrinsic + + Args: + I (int): output first axis dimension + K (int): reduction axis dimension + J (int): output second axis dimension + stride (int, optional): Stride, useful for convolutions. Defaults to 1. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. + accum_patch (_type_, optional): Var of the reduction axis loop. Defaults to None. + + Returns: + Callable: gemm instrinsic + """ + return gemm(self, I, K, J, stride, is_depthwise_conv2d, mode, accum_patch) + + def gemm_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + scale: float, + matmul_type: int, + ) -> Callable: + """Wrapper to expose the gemm_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + scale (float): Output scaling factor + matmul_type (int): Systolic array mode (WS=1,OS=0) + + Returns: + Callable: gemm cisc intrinsic + """ + return gemm_cisc(self, inp_shape, wgt_shape, bias_shape, scale, matmul_type) + + def conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], + ) -> Callable: + """Wrapper to expose the conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window + pool_padding (List[int]): Padding for the output pooling + + Returns: + Callable: conv2d cisc intrinsic + """ + return conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + ) + + def dw_conv2d_cisc( + self, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + ) -> Callable: + """Wrapper to expose the dw_conv2d_cisc intrinsic + + Args: + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? 
+ scale (float): Output scaling factor + + Returns: + Callable: dw conv2d cisc intrinsic + """ + return dw_conv2d_cisc( + self, + inp_shape, + wgt_shape, + bias_shape, + out_shape, + strides, + padding, + padding_value, + activation, + scale, + ) + + def add_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add intrinsic + """ + return add_tensorize(self, oshape) + + def add_mvout_tensorize(self, oshape: Tuple[int, ...]) -> Callable: + """Wrapper to expose the add_mvout_tensorize intrinsic + + Args: + oshape (Tuple[int,...]): Output feature map shape + + Returns: + Callable: add mvout intrinsic + """ + return add_mvout_tensorize(self, oshape) diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py new file mode 100644 index 000000000000..84c028b3d33c --- /dev/null +++ b/python/tvm/contrib/gemmini/helpers.py @@ -0,0 +1,188 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Miscellaneous helpers +===================== +**Author**: `Federico Peccia `_ +""" + +import numpy as np +import pathlib +from .environment import Environment + +import abc +import collections +import matplotlib +import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top +import PIL.Image as Image +import PIL.ImageColor as ImageColor +import PIL.ImageDraw as ImageDraw +import PIL.ImageFont as ImageFont +import six +from six.moves import range +from six.moves import zip +import tensorflow.compat.v1 as tf +from typing import List, Tuple + + +env = Environment.instance() + + +def create_header_file( + name: str, + section: str, + tensor_name: str, + tensor_data: np.ndarray, + output_path: str, + debug: bool = False, + weights: bool = None, +): + """This function generates a header file containing the data from the numpy array provided. + + Args: + name (str): Header file name + section (str): section to assign the generated variable + tensor_name (str): name for the generated variable + tensor_data (np.ndarray): data to fill the variable with + output_path (str): output path where the header file will be generated + debug (bool, optional): enable debug. Defaults to False. + weights (bool, optional): For debug purposes. Defaults to None. 
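+
+    Example:
+        A minimal sketch with a hypothetical tensor (illustrative only)::
+
+            import numpy as np
+
+            data = np.zeros((1, 8, 8, 3), dtype=np.int8)
+            create_header_file("inputs", ".data", "input", data, "./model")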
+ """ + file_path = pathlib.Path(f"{output_path}/" + name).resolve() + # Create header file with npy_data as a C array + raw_header_path = file_path.with_suffix(".h").resolve() + raw_source_path = file_path.with_suffix(".c").resolve() + + if tensor_data.dtype == np.float32: + type = "float" + align = 32 + elif tensor_data.dtype == np.int8: + type = "int8_t" + align = 16 + elif tensor_data.dtype == np.uint8: + type = "uint8_t" + align = 16 + elif tensor_data.dtype == np.uint32: + type = "uint32_t" + align = 16 + else: + assert False, "Type %s is not supported!" % tensor_data.dtype + + with open(raw_header_path, "a+") as header_file: + header_file.write( + f"#define {tensor_name}_len {tensor_data.size}\n" + + f"extern {type} {tensor_name}[{tensor_name}_len];\n" + ) + + if not raw_source_path.is_file(): + with open(raw_source_path, "a+") as source_file: + source_file.write(f"#include \n") + with open(raw_source_path, "a+") as source_file: + + source_file.write( + f'{type} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' + if section + else f"{type} {tensor_name}[] __attribute__((aligned({align}))) = {{" + ) + data_hexstr = tensor_data.tobytes().hex() + flatten = tensor_data.flatten() + + if tensor_data.dtype == np.float32 or tensor_data.dtype == np.uint32: + for i in range(0, len(flatten)): + source_file.write(f"{flatten[i]},") + source_file.write("};\n\n") + else: + for i in range(0, len(data_hexstr), 2): + if flatten[int(i / 2)] < 0: + # Special treatment to generate negative numbers correctly! + data_hexstr_2comp = ( + (~int(flatten[int(i / 2)]) + 1).to_bytes(length=1, byteorder="big").hex() + ) + source_file.write(f"-0x{data_hexstr_2comp}") + else: + source_file.write(f"+0x{data_hexstr[i:i+2]}") + if i != (len(flatten) - 1) * 2: + source_file.write(",") + source_file.write("};\n\n") + + if debug: + source_file.write("/*\n") + for n in range(tensor_data.shape[0]): + for ch in range(tensor_data.shape[3]): + source_file.write("Channel %i:\n" % ch) + for row in range(tensor_data.shape[1]): + for col in range(tensor_data.shape[2]): + source_file.write(f"{tensor_data[n][row][col][ch]}\t") + source_file.write("\n") + source_file.write("*/\n") + + if weights is not None: + source_file.write("/*\n") + for o_ch in range(weights.shape[3]): + source_file.write("Output channel %i:\n" % o_ch) + for i_ch in range(weights.shape[2]): + source_file.write("Input channel %i:\n" % i_ch) + for row in range(weights.shape[0]): + for col in range(weights.shape[1]): + source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") + source_file.write("\n") + source_file.write("*/\n") + + +def get_divisors(x: int) -> List[int]: + """Gets all the numbers that perfectly divide x + + Args: + x (int): Number to divide + + Returns: + List[int]: list of divisors + """ + divs = [] + for i in range(1, x + 1): + if x % i == 0: + divs.append(i) + return divs + + +def get_greater_div(x, limit: int = None): + """Gets the greater divisor for all x + + Args: + x: _description_ + limit (int, optional): Max greater divisor to return. Defaults to None. + + Returns: + int: Greater divisor + """ + + limit = env.DIM if limit == None else limit + + if isinstance(x, int): + elements = [x] + elif isinstance(x, list): + elements = x + else: + assert False, "type of x not supported!" 
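+
+    # For each element, collect its divisors (capped at `limit`) and return the
+    # largest divisor shared by all elements. For example, get_greater_div([8, 12], 16)
+    # returns 4, since the common divisors of 8 and 12 are {1, 2, 4}.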
+
+    divisors = []
+    for element in elements:
+        divs = get_divisors(element)
+        filtered = filter(lambda d: d <= limit, divs)
+        divisors.append(filtered)
+
+    return max(set.intersection(*map(set, divisors)))
diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py
new file mode 100644
index 000000000000..0909e58a890d
--- /dev/null
+++ b/python/tvm/contrib/gemmini/intrin.py
@@ -0,0 +1,873 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Gemmini-related intrinsics
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+from __future__ import absolute_import as _abs
+
+import tvm
+from tvm import te
+from typing import List, Tuple
+
+
+def gemm(
+    env,
+    I: int,
+    K: int,
+    J: int,
+    stride: int = 1,
+    is_depthwise_conv2d: bool = False,
+    mode: int = 1,
+    accum_patch: tvm.tir.Var = None,
+):
+    """Matrix-matrix multiply intrinsic, inserts the most basic Gemmini instructions
+
+    Args:
+        env (Environment): Environment with configurations
+        I (int): output first axis dimension
+        K (int): reduction axis dimension
+        J (int): output second axis dimension
+        stride (int, optional): Stride, useful for convolutions. Defaults to 1.
+        is_depthwise_conv2d (bool, optional): Whether this GEMM implements a depthwise convolution. Defaults to False.
+        mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1.
+        accum_patch (tvm.tir.Var, optional): Var of the reduction axis loop. Defaults to None.
+
+    Returns:
+        TensorIntrin: gemm tensor intrinsic
+    """
+
+    # TODO (FP): add assertions here for I, K and J?
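+
+    # The intrinsic describes an (I, K) x (K, J) tile multiply on operands that
+    # already live in Gemmini's scratchpad/accumulator. A hypothetical use inside
+    # a schedule (names are illustrative):
+    #
+    #   intrin = env.gemm(I, K, J, accum_patch=k_outer)
+    #   s[out].tensorize(i_inner, intrin)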
+ + wgt_shape = (K, J) + + inp_shape = (I, K) + + out_shape = (I, J) + + wgt = te.placeholder(wgt_shape, dtype=env.wgt_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + + bias = te.placeholder(out_shape, dtype=env.inp_dtype, name=env.scr_scope) + + k = te.reduce_axis((0, wgt_shape[0]), name="k") + + out_dtype = env.inp_dtype + + if is_depthwise_conv2d: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride + k, j].astype(env.inp_dtype) * wgt[0, k].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + else: + out = te.compute( + out_shape, + lambda i, j: te.sum( + inp[i * stride, k].astype(env.inp_dtype) * wgt[k, j].astype(env.inp_dtype) + + bias[i, j].astype(env.inp_dtype), + axis=[k], + ), + name="out", + ) + wgt_layout = tvm.tir.decl_buffer( + wgt.shape, + wgt.dtype, + "wgt_buff", + scope=env.scr_wgt_scope, + strides=[te.var("wgt_k"), te.var("wgt_y")], + offset_factor=env.DIM, + ) + inp_layout = tvm.tir.decl_buffer( + inp.shape, + inp.dtype, + "inp_buff", + scope=env.scr_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + bias_layout = tvm.tir.decl_buffer( + bias.shape, + bias.dtype, + "bias_buff", + scope=env.acc_scope, + strides=[te.var("inp_x"), te.var("inp_k")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + out.shape, + out_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_x"), te.var("out_y")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Matrix-matrix multiply intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + inp_base_address = tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint32") + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + wgt_access_ptr = dwgt.access_ptr("r", "uint32") + out_base_address = tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + out_access_ptr = dout.access_ptr("w", "uint32") + + garbage = tvm.runtime.const(0xFFFFFFFF, "uint32") + + def _body(): + """Generate matrix-matrix multiply Gemmini instruction, without accumulate (garbage address in compute_preloaded)""" + irb = tvm.tir.ir_builder.create() + + inp_access_ptr = dinp.access_ptr("r", "uint32") + + A_access_ptr = inp_base_address + inp_access_ptr + BD_access_ptr = ( + wgt_base_address + wgt_access_ptr if mode == env.WEIGHT_STATIONARY else garbage + ) + C_access_ptr = out_base_address + out_access_ptr + DB_access_ptr = ( + garbage if mode == env.WEIGHT_STATIONARY else wgt_base_address + wgt_access_ptr + ) + + A_cols = dinp.shape[1] + A_rows = dinp.shape[0] + BD_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] + BD_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] + C_cols = dout.shape[1] + C_rows = dout.shape[0] + DB_cols = C_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] + DB_rows = C_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] + + with irb.if_scope(accum_patch == 0): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + BD_access_ptr, + C_access_ptr, + BD_cols, + BD_rows, + C_cols, + C_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_compute_preloaded", + A_access_ptr, + DB_access_ptr, + A_cols, + A_rows, + DB_cols, + DB_rows, + ) + ) + with irb.else_scope(): + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_preload", + garbage, + C_access_ptr, + BD_cols, + BD_rows, + C_cols, + C_rows, + ) + ) + irb.emit( + tvm.tir.call_extern( 
+                        "",
+                        "gemmini_extended_compute_accumulated",
+                        A_access_ptr,
+                        DB_access_ptr,
+                        A_cols,
+                        A_rows,
+                        DB_cols,
+                        DB_rows,
+                    )
+                )
+            return irb.get()
+
+        def _reduce_reset():
+            irb = tvm.tir.ir_builder.create()
+            return irb.get()
+
+        def _reduce_update():
+            return _body()
+
+        # return a triple of normal-set, reset, update
+        return (_body(), _reduce_reset(), _reduce_update())
+
+    return te.decl_tensor_intrin(
+        out.op,
+        intrin_func,
+        name="GEMM",
+        binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout},
+    )
+
+
+def gemm_cisc(
+    env,
+    inp_shape: Tuple[int, ...],
+    wgt_shape: Tuple[int, ...],
+    bias_shape: Tuple[int, ...],
+    scale: float,
+    matmul_type: int,
+):
+    """Matrix-matrix multiply intrinsic, which inserts a call to the tiled_matmul_auto function provided by the Gemmini developers to run the matrix multiplication using the CISC loop instructions
+
+    Args:
+        env (Environment): Environment with configurations
+        inp_shape (Tuple[int,...]): Input feature map shape
+        wgt_shape (Tuple[int,...]): Weights shape
+        bias_shape (Tuple[int,...]): Bias shape
+        scale (float): Output scaling factor
+        matmul_type (int): Systolic array mode (WS=1, OS=0)
+
+    Returns:
+        TensorIntrin: GEMM CISC tensor intrinsic
+    """
+
+    # TODO (FP): add assertions here for inp_shape, wgt_shape and bias_shape?
+
+    wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope)
+    inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope)
+    bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope)
+
+    K = wgt.shape[0]
+    J = wgt.shape[1]
+    I = inp.shape[0]
+
+    k_ = te.reduce_axis((0, K), name="K")
+
+    output_shape = (I, J)
+
+    out = te.compute(
+        output_shape,
+        lambda x_, y_: te.sum(
+            inp[x_, k_].astype(env.inp_dtype) * wgt[k_, y_].astype(env.inp_dtype)
+            + bias[y_].astype(env.inp_dtype),
+            axis=[k_],
+        ),
+    )
+
+    wgt_layout = tvm.tir.decl_buffer(
+        wgt_shape,
+        env.inp_dtype,
+        "wgt_buff",
+    )
+    inp_layout = tvm.tir.decl_buffer(
+        inp_shape,
+        env.inp_dtype,
+        "inp_buff",
+        strides=[te.var("inp_x"), te.var("inp_y")],
+    )
+    bias_layout = tvm.tir.decl_buffer(
+        bias_shape,
+        env.acc_dtype,
+        "bias_buff",
+    )
+    out_layout = tvm.tir.decl_buffer(
+        output_shape,
+        env.inp_dtype,
+        "out_buff",
+    )
+
+    def intrin_func(ins, outs):
+        """Matrix-matrix multiply intrinsic function"""
+        dinp, dwgt, dbias = ins
+        dout = outs[0]
+
+        def _body():
+            irb = tvm.tir.ir_builder.create()
+            irb.emit(
+                tvm.tir.call_extern(
+                    "",
+                    "tiled_matmul_auto",
+                    dinp.shape[0],  # dim_I,
+                    dwgt.shape[1],  # dim_J,
+                    dinp.shape[1],  # dim_K,
+                    dinp.access_ptr("r"),
+                    dwgt.access_ptr("r"),
+                    dbias.access_ptr("r"),
+                    dout.access_ptr("w"),
+                    dinp.shape[0],  # stride_A
+                    dwgt.shape[1],  # stride_B
+                    dwgt.shape[1],  # stride_C
+                    dwgt.shape[1],  # stride_D
+                    1.0,  # A_scale_factor
+                    1.0,  # B_scale_factor
+                    1.0,  # D_scale_factor
+                    0,  # act
+                    scale,
+                    0,  # relu6_shift
+                    1,  # repeating_bias
+                    0,  # transpose_A
+                    0,  # transpose_B
+                    0,  # full_C
+                    0,  # low_D
+                    # 0,
+                    0,  # weightA
+                    matmul_type,
+                )
+            )
+            return irb.get()
+
+        def _reduce_reset():
+            irb = tvm.tir.ir_builder.create()
+            return irb.get()
+
+        def _reduce_update():
+            return _body()
+
+        # return a triple of normal-set, reset, update
+        return (_body(), _reduce_reset(), _reduce_update())
+
+    return te.decl_tensor_intrin(
+        out.op,
+        intrin_func,
+        name="GEMM_CISC",
+        binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout},
+    )
+
+
+def conv2d_cisc(
+    env,
+    inp_shape: Tuple[int, ...],
+    wgt_shape: Tuple[int, ...],
+    bias_shape: Tuple[int, ...],
+    out_shape: 
Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, + pool_size: List[int], + pool_strides: List[int], + pool_dilation: List[int], + pool_padding: List[int], +): + """2D convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + pool_size (List[int]): Size of the output pooling window + pool_strides (List[int]): Strides for the output pooling window + pool_dilation (List[int]): Dilation for the output pooling window. Not used for now. + pool_padding (List[int]): Padding for the output pooling + + Returns: + TensorIntrin: CONV2D CISC tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + OC = wgt.shape[3] + KH = wgt.shape[0] + KW = wgt.shape[1] + + N = inp.shape[0] + IH = inp.shape[1] + IW = inp.shape[2] + IC = inp.shape[3] + + ric = te.reduce_axis((0, IC), name="ric") + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + HSTR = strides[0] + WSTR = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + * wgt[rkh, rkw, ric, c_o].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw, ric], + ), + ) + + wgt_layout = tvm.tir.decl_buffer(wgt_shape, env.inp_dtype, "wgt_buff") + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + padding_value, + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + dout.shape[3], # OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + 1, + 1, + padding[2], + dwgt.shape[0], + 0, + 0, + 0, + 0, + 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + 
dout.access_ptr("w"), + activation, + scale, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="CONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def dw_conv2d_cisc( + env, + inp_shape: Tuple[int, ...], + wgt_shape: Tuple[int, ...], + bias_shape: Tuple[int, ...], + out_shape: Tuple[int, ...], + strides: int, + padding: List[int], + padding_value: int, + activation: int, + scale: float, +): + """2D depthwise convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions + + Args: + env (Environment): Environment with configurations + inp_shape (Tuple[int,...]): Input feature map shape + wgt_shape (Tuple[int,...]): Weights shape + bias_shape (Tuple[int,...]): Bias shape + out_shape (Tuple[int,...]): Output feature map shape + strides (int): Convolution stride + padding (List[int]): Pixels to pad in each direction + padding_value (int): Value to use for padding + activation (int): Has activation? + scale (float): Output scaling factor + + Returns: + TensorIntrin: depthwise convolution 2d tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) + inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) + bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) + + OC = wgt.shape[0] + KH = wgt.shape[1] + KW = wgt.shape[2] + + N = inp.shape[0] + IH = inp.shape[1] + IW = inp.shape[2] + IC = inp.shape[3] + + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + HSTR = strides[0] + WSTR = strides[1] + + out = te.compute( + out_shape, + lambda b_o, i, j, c_o: te.sum( + inp[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) + * wgt[c_o, rkh, rkw].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw], + ), + ) + + wgt_layout = tvm.tir.decl_buffer( + wgt_shape, + env.inp_dtype, + "wgt_buff", + # strides=[te.var("wgt_i"),te.var("wgt_j")] + ) + inp_layout = tvm.tir.decl_buffer( + inp_shape, + env.inp_dtype, + "inp_buff", + strides=[te.var("inp_x"), te.var("inp_y"), te.var("inp_b"), te.var("inp_k")], + ) + bias_layout = tvm.tir.decl_buffer( + bias_shape, + env.acc_dtype, + "bias_buff", + ) + out_layout = tvm.tir.decl_buffer( + out_shape, + env.inp_dtype, + "out_buff", + ) + + def intrin_func(ins, outs): + """2D depthwise convolution intrinsic function""" + dinp, dwgt, dbias = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + padding_value, + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + dinp.shape[0], # 
BATCH_SIZE, + dinp.shape[1], # IN_DIM, + dinp.shape[3], # IN_CHANNELS, + # dout.shape[3],#OUT_CHANNELS, + dout.shape[1], # OUT_DIM, + strides[0], + # 1, 1, + padding[2], + dwgt.shape[1], + # 0, 0, 0, 0, 0, + dinp.access_ptr("r"), + dwgt.access_ptr("r"), + dbias.access_ptr("r"), + dout.access_ptr("w"), + activation, + scale, + 1, + 0, + 0, + 1, + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="DWCONV2D_CISC", + binds={inp: inp_layout, wgt: wgt_layout, bias: bias_layout, out: out_layout}, + ) + + +def add_tensorize(env, oshape: Tuple[int, ...]): + """Add intrinsic, inserts the most basic Gemmini instructions to support the qnn.add operator + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? + + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add intrinsic function""" + difm1, difm2 = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + difm1.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) + + +def add_mvout_tensorize(env, oshape: Tuple[int, ...]): + """Helper for the add intrinsic + + Args: + env (Environment): Environment with configurations + oshape (Tuple[int,...]): Output feature map shape + + Returns: + TensorIntrin: add mvout tensor intrinsic + """ + + # TODO (FP): add assertions here for the supported parameters? 
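+    # Rough usage sketch (schedule and tensor names assumed, not part of this
+    # file): after the ADD intrinsic above has accumulated both inputs, a
+    # schedule would tensorize the copy back to main memory with, e.g.:
+    #   s[out].tensorize(s[out].op.axis[0], add_mvout_tensorize(env, (env.DIM, env.DIM)))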
+ + ifm1 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + ifm2 = te.placeholder(oshape, dtype=env.inp_dtype, name=env.acc_scope) + + out = te.compute( + oshape, lambda i, j: ifm1[i, j].astype(env.inp_dtype) + ifm2[i, j].astype(env.inp_dtype) + ) + + ifm1_dtype = env.inp_dtype + + ifm1_layout = tvm.tir.decl_buffer( + oshape, + ifm1_dtype, + "ifm1_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + ifm2_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "ifm2_buff", + scope=env.acc_scope, + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + out_layout = tvm.tir.decl_buffer( + oshape, + env.inp_dtype, + "out_buff", + strides=[te.var("out_b"), te.var("out_x")], + offset_factor=env.DIM, + ) + + def intrin_func(ins, outs): + """Add mvout intrinsic function""" + difm1, difm2 = ins + dout = outs[0] + + def _body(): + irb = tvm.tir.ir_builder.create() + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dout.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + difm2.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + difm1.shape[1], + difm1.shape[0], + ) + ) + + return irb.get() + + def _reduce_reset(): + irb = tvm.tir.ir_builder.create() + return irb.get() + + def _reduce_update(): + return _body() + + # return a triple of normal-set, reset, update + return (_body(), _reduce_reset(), _reduce_update()) + + return te.decl_tensor_intrin( + out.op, + intrin_func, + name="ADD_MVOUT", + binds={ifm1: ifm1_layout, ifm2: ifm2_layout, out: out_layout}, + ) diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py new file mode 100644 index 000000000000..6f279bb512b3 --- /dev/null +++ b/python/tvm/contrib/gemmini/legalize.py @@ -0,0 +1,595 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +A set of passes to legalize the Gemmini operators +===================== +**Author**: `Federico Peccia `_ +""" + +from typing import List, Type, Callable +import math + +import numpy as np # type: ignore + +import tvm # type: ignore +from tvm import te +from tvm import relay +from tvm import ir +from tvm.relay.dataflow_pattern import DFPatternCallback # type: ignore +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.dataflow_pattern import CallPattern +from tvm.relay.frontend.common import infer_shape as _infer_shape +from tvm.relay.frontend.common import infer_type as _infer_type +from tvm.relay.expr_functor import ExprMutator, ExprVisitor + +from tvm.relay.op import _make # type: ignore + +from .pattern_table import * # type: ignore + +from .environment import Environment + +env = Environment.instance() + + +def gemmini_gemm( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + bias: tvm.relay.Expr, + ifm_scale: float, + ifm_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.gemm operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 (weights) + bias (tvm.relay.Expr): Biases + ifm_scale (float): Input feature map scaling factor + ifm_offset (float): Input feature map offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.gemm operator + """ + return _make.gemmini_gemm( + ifm1, ifm2, bias, ifm_scale, ifm_offset, bias_scale, bias_offset, ofm_scale, ofm_offset + ) + + +def gemmini_add( + ifm1: tvm.relay.Expr, + ifm2: tvm.relay.Expr, + ifm1_scale: float, + ifm1_offset: float, + ifm2_scale: float, + ifm2_offset: float, + ofm_scale: float, + ofm_offset: float, + shape: Tuple[int, ...], +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.add operator + + Args: + ifm1 (tvm.relay.Expr): Input feature map 1 + ifm2 (tvm.relay.Expr): Input feature map 2 + ifm1_scale (float): Input feature map 1 scaling factor + ifm1_offset (float): Input feature map 1 offset + ifm2_scale (float): Input feature map 2 scaling factor + ifm2_offset (float): Input feature map 2 offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + shape (Tuple[int,...]): Shape of the input feature maps and the output feature map + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.add operator + """ + return _make.gemmini_add( + ifm1, + ifm2, + ifm1_scale, + ifm1_offset, + ifm2_scale, + ifm2_offset, + ofm_scale, + ofm_offset, + shape, + ) + + +def gemmini_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, + has_pool: bool, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + input_req_offset_out: tvm.relay.Expr, + has_activation: bool, + activation_scale_in: tvm.relay.Expr, + activation_offset_in: tvm.relay.Expr, + activation_scale_out: tvm.relay.Expr, + 
activation_offset_out: tvm.relay.Expr, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.conv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling factor + ifm_offset (float): Input feature map offset + weights_scale (float): Weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Biases scaling factor + bias_offset (float): Biases weights offset + ofm_scale (float): Output feature map scaling factor + ofm_offset (float): Output feature map offset + activation (bool): TODO (FP): see if this can be deleted! Has activation? + has_pool (bool): Has pooling layer after the output of the convolution? + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + input_req_offset_out (tvm.relay.Expr): Requantize layer output offset + has_activation (bool): Has activation? + activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input scaling factor + activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input offset + activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output scaling factor + activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output offset + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.conv2d operator + """ + return _make.gemmini_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + has_pool, + pool_size, + pool_strides, + pool_dilation, + pool_padding, + input_req_offset_out, + has_activation, + activation_scale_in, + activation_offset_in, + activation_scale_out, + activation_offset_out, + ) + + +def gemmini_depthwise_conv2d( + data: tvm.relay.Expr, + weights: tvm.relay.Expr, + bias: tvm.relay.Expr, + strides: tuple, + padding: tuple, + ifm_scale: float, + ifm_offset: float, + weights_scale: float, + weights_offset: float, + bias_scale: float, + bias_offset: float, + ofm_scale: float, + ofm_offset: float, + activation: bool, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.depthwiseconv2d operator + + Args: + data (tvm.relay.Expr): Input feature map + weights (tvm.relay.Expr): Convolution weights matrix + bias (tvm.relay.Expr): Convolution biases matrix + strides (tuple): Convolution strides + padding (tuple): Convolution paddings in each direction + ifm_scale (float): Input feature map scaling + ifm_offset (float): Input feature map offset + weights_scale (float): Convolution weights scaling factor + weights_offset (float): Convolution weights offset + bias_scale (float): Convolution biases scaling factor + bias_offset (float): Convolution biases offset + ofm_scale (float): Output feature map scaling + ofm_offset (float): Output feature map offset + activation (bool): Has activation? 
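+
+    Example (illustrative only; real calls are generated by the legalization
+    pass, and the scale/offset values here are placeholders):
+        gemmini_depthwise_conv2d(data, weights, bias, (1, 1), (1, 1, 1, 1),
+                                 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, False)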
+ + Returns: + tvm.relay.Call: Call to the contrib.gemmini.depthwiseconv2d operator + """ + return _make.gemmini_depthwise_conv2d( + data, + weights, + bias, + strides, + padding, + ifm_scale, + ifm_offset, + weights_scale, + weights_offset, + bias_scale, + bias_offset, + ofm_scale, + ofm_offset, + activation, + ) + + +def gemmini_max_pool2d( + ifm: tvm.relay.Expr, + pool_size: tvm.relay.Expr, + pool_strides: tvm.relay.Expr, + pool_dilation: tvm.relay.Expr, + pool_padding: tvm.relay.Expr, + shape: tuple, +) -> tvm.relay.Call: + """Generates the call to the contrib.gemmini.max_pool2d operator + + Args: + ifm (tvm.relay.Expr): Input feature map + pool_size (tvm.relay.Expr): Pooling window size + pool_strides (tvm.relay.Expr): Pooling window strides + pool_dilation (tvm.relay.Expr): Pooling window dilation + pool_padding (tvm.relay.Expr): Pooling padding in each direction + shape (tuple): Input shape + + Returns: + tvm.relay.Call: Call to the contrib.gemmini.max_pool2d operator + """ + return _make.gemmini_max_pool2d( + ifm, pool_size, pool_strides, pool_dilation, pool_padding, shape + ) + + +class AddRewriter(DFPatternCallback): + """Convert add related composite functions into contrib.gemmini.add operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": AddParams.composite_name}))( + wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = AddParams(post.op.body) + gemmini_add_op = gemmini_add( + post.args[0], + post.args[1], + params.ifm1_scale, + params.ifm1_offset, + params.ifm2_scale, + params.ifm2_offset, + params.ofm_scale, + params.ofm_offset, + params.output_shape, + ) + return gemmini_add_op + + +class GEMMRewriter(DFPatternCallback): + """Convert gemm related composite functions into contrib.gemmini.gemm operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": GEMMParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = GEMMParams(post.op.body) + gemmini_gemm_op = gemmini_gemm( + post.args[0], + post.args[1], + post.args[2], + params.ifm_scale, + params.ifm_offset, + params.bias_scale, + params.bias_offset, + params.ofm_scale, + params.ofm_offset, + ) + return gemmini_gemm_op + + +class CONV2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into contrib.gemmini.conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.weights_index = 1 + self.bias_index = 2 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = CONV2DParams(post.op.body) + if params.has_external_pad: + self.weights_index = 2 + self.bias_index = 3 + else: + self.weights_index = 1 + self.bias_index = 2 + + bias = post.args[self.bias_index] + + if params.has_input_requantize: + data = relay.cast(post.args[self.data_index], "int8") + else: + data = post.args[self.data_index] + + if params.is_depthwise: + reshaped_weights = relay.squeeze( + relay.transpose(post.args[self.weights_index], [3, 0, 1, 2]), axis=[3] + ) + gemmini_depthwise_conv2d_op = 
gemmini_depthwise_conv2d( + data=data, + weights=reshaped_weights, + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + ) + return gemmini_depthwise_conv2d_op + else: + gemmini_conv2d_op = gemmini_conv2d( + data=data, + weights=post.args[self.weights_index], + bias=bias, + strides=params.strides, + padding=params.padding, + ifm_scale=params.ifm_scale, + ifm_offset=params.ifm_offset, + weights_scale=params.weights_scale, + weights_offset=params.weights_offset, + bias_scale=params.bias_scale, + bias_offset=params.bias_offset, + ofm_scale=params.ofm_scale, + ofm_offset=params.ofm_offset, + activation=params.activation, + has_pool=params.has_pool, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + input_req_offset_out=params.input_offset_out, + has_activation=params.has_activation, + activation_scale_in=params.activation_scale_in, + activation_offset_in=params.activation_offset_in, + activation_scale_out=params.activation_scale_out, + activation_offset_out=params.activation_offset_out, + ) + return gemmini_conv2d_op + + +class CONV2DExternalPadRewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + + +class CONV2DExternalPadAndRelu6Rewriter(CONV2DRewriter): + def __init__(self): + super().__init__() + self.pattern = (wildcard().has_attr({"Composite": CONV2DParams.composite_name}))( + wildcard(), wildcard(), wildcard(), wildcard(), wildcard() + ) + self.data_index = 0 + self.min_index = 4 + + +class MAXPOOL2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions into gemmini_max_pool2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": MaxPoolParams.composite_name}))( + wildcard() + ) + self.data_index = 0 + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = MaxPoolParams(post.op.body) + + data = post.args[self.data_index] + + gemmini_max_pool2d_op = gemmini_max_pool2d( + ifm=data, + pool_size=params.pool_size, + pool_strides=params.pool_strides, + pool_dilation=params.pool_dilation, + pool_padding=params.pool_padding, + shape=params.shape, + ) + return gemmini_max_pool2d_op + + +@ir.transform.module_pass(opt_level=1) +class LegalizeAdd: + """This is the pass that wraps the AddRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(AddRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, *args, **kwargs): + pass + + +@ir.transform.module_pass(opt_level=1) +class LegalizeMaxPool2D: + """This is the pass that wraps the MAXPOOL2DRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(MAXPOOL2DRewriter(), func) + mod.update_func(global_var, func) + 
return mod
+
+    def __call__(self, *args, **kwargs):
+        pass
+
+
+@ir.transform.module_pass(opt_level=1)
+class LegalizeGEMM:
+    """This is the pass that wraps the GEMMRewriter"""
+
+    def transform_module(
+        self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext
+    ) -> tvm.ir.IRModule:
+        for global_var, func in mod.functions.items():
+            func = rewrite(GEMMRewriter(), func)
+            mod.update_func(global_var, func)
+        return mod
+
+    def __call__(self, *args, **kwargs):
+        pass
+
+
+@ir.transform.module_pass(opt_level=1)
+class LegalizeCONV2D:
+    """This is the pass that wraps the CONV2DRewriter"""
+
+    def transform_module(
+        self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext
+    ) -> tvm.ir.IRModule:
+        for global_var, func in mod.functions.items():
+            func = rewrite(CONV2DRewriter(), func)
+            mod.update_func(global_var, func)
+        return mod
+
+    def __call__(self, *args, **kwargs):
+        pass
+
+
+@ir.transform.module_pass(opt_level=1)
+class LegalizeCONV2DExternalPad:
+    """This is the pass that wraps the CONV2DExternalPadRewriter"""
+
+    def transform_module(
+        self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext
+    ) -> tvm.ir.IRModule:
+        for global_var, func in mod.functions.items():
+            func = rewrite(CONV2DExternalPadRewriter(), func)
+            mod.update_func(global_var, func)
+        return mod
+
+    def __call__(self, *args, **kwargs):
+        pass
+
+
+@ir.transform.module_pass(opt_level=1)
+class LegalizeCONV2DExternalPadAndRelu6:
+    """This is the pass that wraps the CONV2DExternalPadAndRelu6Rewriter"""
+
+    def transform_module(
+        self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext
+    ) -> tvm.ir.IRModule:
+        for global_var, func in mod.functions.items():
+            func = rewrite(CONV2DExternalPadAndRelu6Rewriter(), func)
+            mod.update_func(global_var, func)
+        return mod
+
+    def __call__(self, *args, **kwargs):
+        pass
+
+
+@ir.transform.module_pass(opt_level=1)
+class LegalizeGemmini:
+    """This is the pass that calls the graph rewrites above to transform the
+    graph so that the supported operations are replaced with their Gemmini
+    hardware/codegen equivalents.
+    """
+
+    def transform_module(
+        self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext
+    ) -> tvm.ir.IRModule:
+        """This is the method that applies the single-operator legalization
+        passes in order.
+        """
+        mod = LegalizeCONV2DExternalPadAndRelu6()(mod)
+        mod = LegalizeCONV2DExternalPad()(mod)
+        mod = LegalizeAdd()(mod)
+        mod = LegalizeCONV2D()(mod)
+        mod = LegalizeGEMM()(mod)
+        mod = LegalizeMaxPool2D()(mod)
+        return mod
+
+    def __call__(self, *args, **kwargs):
+        # pylint is unable to figure out that the
+        # decorated class is callable, thus adding
+        # this to suppress the warning.
+        pass
diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py
new file mode 100644
index 000000000000..a43f10699c75
--- /dev/null
+++ b/python/tvm/contrib/gemmini/pattern_table.py
@@ -0,0 +1,469 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Pattern table declaring the supported Gemmini operators +===================== +**Author**: `Federico Peccia `_ +""" + +from typing import Dict, List, Tuple, Callable, Optional + +import tvm # type: ignore +from tvm import relay +from tvm.target import Target +from tvm.relay.build_module import bind_params_by_name # type: ignore +from tvm.relay.op.contrib.register import register_pattern_table # type: ignore +from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op +from .utils import * + +from tvm.topi.utils import const_vector, get_const_int, get_const_float +from tvm.relay.frontend.common import infer_shape as _infer_shape +from tvm.relay.frontend.common import infer_type as _infer_type + +from .environment import Environment + +env = Environment.instance() + + +class GEMMParams: + """ + This class will parse a Call to a gemmini.gemm composite function + """ + + composite_name = "gemmini.gemm" + + def __init__(self, func_body: tvm.relay.Function): + + dense_op = func_body.args[0] + self.weights = func_body.args[1] + requantize_op = func_body + + bias_add = requantize_op.args[0] + self.bias = bias_add.args[1] + dense_op = bias_add.args[0] + self.ifm_scale = dense_op.args[QDenseArgs.IFM_SCALE.value] + self.ifm_offset = dense_op.args[QDenseArgs.IFM_ZERO_POINT.value] + + if requantize_op.op.name == "qnn.requantize": + self.merge_requantize = True + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = requantize_op.args[RequantArgs.OFM_SCALE.value] + self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value] + else: + self.merge_requantize = False + self.bias_scale = tvm.relay.const([1.0], "float") + self.bias_offset = tvm.relay.const(0, "int32") + self.ofm_scale = tvm.relay.const(1.0, "float") + self.ofm_offset = tvm.relay.const(0, "int32") + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.gemm has compatible attributes with the Gemmini + """ + # TODO (FP): complete this validation + return True + + +class AddParams: + """ + This class will parse a Call to a gemmini.add composite function + """ + + composite_name = "gemmini.add" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + if str(func_body.op) in self.activation_map.keys(): + activation = func_body + add_op = func_body.args[0] + else: + add_op = func_body + + self.ifm1_scale = add_op.args[BinaryElementwiseArgs.IFM1_SCALE.value] + self.ifm1_offset = add_op.args[BinaryElementwiseArgs.IFM1_ZERO_POINT.value] + self.ifm2_scale = add_op.args[BinaryElementwiseArgs.IFM2_SCALE.value] + self.ifm2_offset = add_op.args[BinaryElementwiseArgs.IFM2_ZERO_POINT.value] + self.ofm_scale = add_op.args[BinaryElementwiseArgs.OFM_SCALE.value] + self.ofm_offset = add_op.args[BinaryElementwiseArgs.OFM_ZERO_POINT.value] + self.output_shape = _infer_shape(add_op) + self.ifm1_shape = _infer_shape(add_op.args[0]) + self.ifm2_shape = _infer_shape(add_op.args[1]) + + def is_valid(self) -> bool: + """ + This function checks whether gemmini.add has compatible 
attributes with the Gemmini + """ + # TODO (FP): complete this validation + # We only support 4 dimensions add operators... for now + if len(self.output_shape) != 4: + return False + if self.ifm1_shape != self.ifm2_shape: + return False + return True + + +class CONV2DParams: + """ + This class will parse a Call to a gemmini.conv2d composite function + """ + + composite_name = "gemmini.conv2d" + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body: tvm.relay.Function): + activation = None + self.pool_size = [0, 0] + self.pool_strides = [0, 0] + self.pool_padding = [0, 0, 0, 0] + self.pool_dilation = [0, 0] + self.has_pool = False + self.has_activation = False + self.a_min = None + self.a_max = None + self.has_external_pad = False + self.activation_scale_in = tvm.relay.const(1.0, "float") + self.activation_offset_in = tvm.relay.const(0, "int32") + self.activation_scale_out = tvm.relay.const(1.0, "float") + self.activation_offset_out = tvm.relay.const(0, "int32") + + _op = func_body + + if _op.args[0].op.name != "nn.bias_add": + + if _op.op.name == "clip": + _op = _op.args[0] + else: + + if _op.op.name == "nn.max_pool2d": + max_pool = _op + self.pool_size = max_pool.attrs.pool_size + self.pool_strides = max_pool.attrs.strides + self.pool_padding = max_pool.attrs.padding + self.pool_dilation = max_pool.attrs.dilation + self.has_pool = True + _op = max_pool.args[0] + + if _op.op.name == "clip": + _op = _op.args[0] + elif _op.args[0].op.name == "clip": + self.activation_scale_in = _op.args[RequantArgs.IFM_SCALE.value] + self.activation_offset_in = _op.args[RequantArgs.IFM_ZERO_POINT.value] + self.activation_scale_out = _op.args[RequantArgs.OFM_SCALE.value] + self.activation_offset_out = _op.args[RequantArgs.OFM_ZERO_POINT.value] + clip = _op.args[0] + self.has_activation = True + _min = clip.args[0] + self.a_min = clip.attrs.a_min + self.a_max = clip.attrs.a_max + _op = _min.args[0] + + requantize_op = _op + + bias_add = requantize_op.args[0] + + conv2d_op = bias_add.args[0] + + self.has_input_requantize = False + self.input_scale_in = tvm.relay.const(1.0, "float") + self.input_offset_in = tvm.relay.const(0, "int32") + self.input_scale_out = tvm.relay.const(1.0, "float") + self.input_offset_out = tvm.relay.const(0, "int32") + + self.output_shape = _infer_shape(conv2d_op) + self.strides = conv2d_op.attrs.strides + self.padding = conv2d_op.attrs.padding + self.groups = conv2d_op.attrs.groups + self.is_depthwise = self.groups == conv2d_op.attrs.channels and self.groups != 1 + self.data = conv2d_op.args[0] + self.input_shape = _infer_shape(self.data) + if ( + not isinstance(self.data, relay.expr.Var) + and not isinstance(self.data.op, relay.function.Function) + and self.data.op.name == "nn.pad" + ): + padding = self.data.attrs.pad_width + self.padding = [padding[1][0], padding[1][1], padding[2][0], padding[2][1]] + self.has_external_pad = True + self.weights = conv2d_op.args[1] + self.weights_shape = _infer_shape(self.weights) + self.bias = bias_add.args[1] + self.ifm_scale = float(conv2d_op.args[QConv2DArgs.IFM_SCALE.value].data.numpy()) + self.ifm_offset = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.ifm_offset_const = conv2d_op.args[QConv2DArgs.IFM_ZERO_POINT.value] + self.weights_scale = 1.0 + self.weights_offset = 0.0 + + if requantize_op.op.name == "qnn.requantize": + self.bias_scale = requantize_op.args[RequantArgs.IFM_SCALE.value] + self.bias_offset = requantize_op.args[RequantArgs.IFM_ZERO_POINT.value] + self.ofm_scale = 
requantize_op.args[RequantArgs.OFM_SCALE.value]
+            self.ofm_offset = requantize_op.args[RequantArgs.OFM_ZERO_POINT.value]
+        else:
+            self.bias_scale = tvm.relay.const([1.0], "float")
+            self.bias_offset = tvm.relay.const(0, "int32")
+            self.ofm_scale = tvm.relay.const(1.0, "float")
+            self.ofm_offset = tvm.relay.const(0, "int32")
+
+        # NOTE: `activation` is never reassigned above, so both branches of the
+        # original if/else set the same value; the fused activation is tracked
+        # through has_activation instead.
+        self.activation = False
+
+    def is_valid(self) -> bool:
+        """
+        This function checks whether gemmini.conv2d has attributes compatible with the Gemmini accelerator
+        """
+        # TODO (FP): complete this validation
+        if len(set(self.pool_padding)) != 1 or len(set(self.pool_strides)) != 1:
+            return False
+
+        if self.has_input_requantize:
+            if (
+                self.input_scale_in.data.numpy() != self.input_scale_out.data.numpy()
+                or self.input_offset_in.data.numpy() != 0
+            ):
+                # Only these specific cases are supported, for now...
+                return False
+
+        if self.a_max is not None and self.a_max != 127:
+            return False
+
+        return True
+
+
+class DepthwiseCONV2DParams(CONV2DParams):
+    """
+    This class will parse a Call to a gemmini.depthwiseconv2d composite function
+    """
+
+    composite_name = "gemmini.depthwiseconv2d"
+    activation_map = {"clip": "CLIP"}
+
+    def __init__(self, func_body: tvm.relay.Function):
+        super().__init__(func_body)
+
+
+class MaxPoolParams:
+    """
+    This class will parse a Call to a gemmini.max_pool2d composite function
+    """
+
+    composite_name = "gemmini.max_pool2d"
+
+    def __init__(self, func_body: tvm.relay.Function):
+        self.pool_size = func_body.attrs.pool_size
+        self.pool_strides = func_body.attrs.strides
+        self.pool_padding = func_body.attrs.padding
+        self.pool_dilation = func_body.attrs.dilation
+        self.shape = _infer_shape(func_body)
+
+    def is_valid(self) -> bool:
+        """
+        This function checks whether max_pool2d has attributes compatible with the Gemmini accelerator
+        """
+        # TODO (FP): complete this validation?
+        if len(set(self.pool_padding)) != 1:
+            return False
+        if (self.shape[1] != self.shape[2]) or self.shape[1] == 1:
+            return False
+        return True
+
+
+def make_dense_pattern() -> tvm.relay.dataflow_pattern.DFPattern:
+    """Create patterns related to qnn.dense.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+    dense_out : CallPattern
+        Call node sequence.
+    """
+    data = wildcard()
+    weight = wildcard()
+    bias = wildcard()
+    dense = is_op("qnn.dense")(
+        data, weight, is_constant(), is_constant(), is_constant(), is_constant()
+    )
+    bias_add = is_op("nn.bias_add")(
+        dense,
+        bias,
+    )
+    req = is_op("qnn.requantize")(
+        bias_add, is_constant(), is_constant(), is_constant(), is_constant()
+    )
+    return req
+
+
+def make_add_pattern() -> tvm.relay.dataflow_pattern.DFPattern:
+    """Create patterns related to qnn.add.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+    add_out : CallPattern
+        Call node sequence.
+    """
+    ifm1 = wildcard()
+    ifm2 = wildcard()
+    add_out = is_op("qnn.add")(
+        ifm1,
+        ifm2,
+        is_constant(),
+        is_constant(),
+        is_constant(),
+        is_constant(),
+        is_constant(),
+        is_constant(),
+    )
+    clip_or_req = add_out.optional(is_op("clip"))
+    return clip_or_req
+
+
+def make_conv2d_pattern(
+    with_padded_input: bool = False, with_maxpool: bool = False, with_relu_6: bool = False
+) -> tvm.relay.dataflow_pattern.DFPattern:
+    """Create patterns related to qnn.conv2d.
+
+    Parameters
+    ----------
+
+    Returns
+    -------
+    conv2d_out : CallPattern
+        Call node sequence.
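+
+    The matched sequence is qnn.conv2d -> nn.bias_add -> qnn.requantize,
+    optionally preceded by nn.pad (with_padded_input) and optionally followed
+    by clip, or by minimum + clip + qnn.requantize for ReLU6 (with_relu_6),
+    and by nn.max_pool2d (with_maxpool).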
+ """ + data = wildcard() + if with_padded_input: + data = is_op("nn.pad")(data, wildcard()) + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + if with_relu_6: + output = is_op("minimum")(output, wildcard()) + output = is_op("clip")(output) + output = is_op("qnn.requantize")( + output, is_constant(), is_constant(), is_constant(), is_constant() + ) + else: + output = output.optional(is_op("clip")) + if with_maxpool: + output = output.optional(is_op("nn.max_pool2d")) + return output + else: + return output + + +def make_depthwiseconv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to qnn.conv2d, but only if it is a depthwise convolution. + + Parameters + ---------- + + Returns + ------- + conv2d_out : CallPattern + Call node sequence. + """ + data = wildcard() + weight = wildcard() + bias = wildcard() + conv2d_out = is_op("qnn.conv2d")( + data, weight, is_constant(), is_constant(), is_constant(), is_constant() + ).has_attr({"kernel_layout": "HWOI"}) + bias_add = is_op("nn.bias_add")( + conv2d_out, + bias, + ) + output = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + clip_or_req = output.optional(is_op("clip")) + return clip_or_req + + +def make_maxpool_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """Create patterns related to nn.max_pool2d. + + Parameters + ---------- + + Returns + ------- + max_pool2d : CallPattern + Call node sequence. + """ + max_pool2d = is_op("nn.max_pool2d")(wildcard()) + return max_pool2d + + +@register_pattern_table("gemmini") +def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + + pattern_table_filters = [] + pattern_table_filters.append( + ( + GEMMParams.composite_name, + make_dense_pattern(), + lambda pat: GEMMParams(pat).is_valid(), + ) + ) + + for pad in [True, False]: + for max_pool in [True, False]: + for relu6 in [True, False]: + pattern_table_filters.append( + ( + CONV2DParams.composite_name, + make_conv2d_pattern( + with_padded_input=pad, with_maxpool=max_pool, with_relu_6=relu6 + ), + lambda pat: CONV2DParams(pat).is_valid(), + ) + ) + + pattern_table_filters.append( + ( + MaxPoolParams.composite_name, + make_maxpool_pattern(), + lambda pat: MaxPoolParams(pat).is_valid(), + ) + ) + + if env.use_experimental_qnn_add: + pattern_table_filters.append( + ( + AddParams.composite_name, + make_add_pattern(), + lambda pat: AddParams(pat).is_valid(), + ) + ) + + return pattern_table_filters diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py new file mode 100644 index 000000000000..312217cc8210 --- /dev/null +++ b/python/tvm/contrib/gemmini/transform.py @@ -0,0 +1,816 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name +""" +Transformation passes for Gemmini +===================== +**Author**: `Federico Peccia `_ +""" + +from numpy import isin +import tvm +from tvm import te +from tvm.topi import utils +import numpy as np +from copy import deepcopy +import itertools +import ast +from tvm.tir.ir_builder import IRBuilder +from typing import Dict + +from .environment import Environment + +env = Environment.instance() + + +def _get_counters(irb: IRBuilder): + """Generates calls to print the values of the configured timers + + Args: + irb (IRBuilder): IRBuilder + """ + irb.emit(tvm.tir.call_extern("", "counter_snapshot_take")) + irb.emit(tvm.tir.call_extern("", "printf", "Counter values:\\r\\n")) + counter_vars = [] + for i, (key, value) in enumerate(env.enabled_counters.items()): + counter_var = irb.let( + value.lower() + "_var", tvm.tir.call_extern("uint32", "counter_read", i) + ) + counter_vars.append(counter_var) + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%s," % value))) + irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) + for c in counter_vars: + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%lu,"), c)) + irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) + + +def _configure_timers(irb: IRBuilder): + """Generates calls to configure the enabled counters + + Args: + irb (IRBuilder): IRBuilder + """ + for i, (key, value) in enumerate(env.enabled_counters.items()): + irb.emit(tvm.tir.call_extern("", "counter_configure", i, key)) + + +def _reset_counters(irb: IRBuilder): + """Generates calls to reset all Gemmini counters + + Args: + irb (IRBuilder): IRBuilder + """ + irb.emit(tvm.tir.call_extern("", "counter_reset")) + irb.emit(tvm.tir.call_extern("", "counter_snapshot_reset")) + + +def _match_pragma(stmt, key): + """Internal helper to match stmt to pragma stmt. 
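+    Matches both the "pragma_<key>" attribute key and the generic
+    "pragma_scope" attribute whose value equals the given key.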
+ + Parameters + ---------- + stmt : Stmt + The AttrStmt + + key : str + The pragma key + """ + return (stmt.attr_key == "pragma_" + key) or ( + stmt.attr_key == "pragma_scope" and stmt.value.value == key + ) + + +def _get_config_dict_from_str(str_value: str) -> Dict: + """Returns a configuration dictionary from its string representation + + Args: + str_value (str): Dictionary encoded in a string + + Returns: + Dict: Configuration dictionary + """ + return ast.literal_eval(str(str_value).replace("'", '"').replace('"{', "{").replace('}"', "}")) + + +def _gen_debug_header(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug headers for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _configure_timers(irb) + _reset_counters(irb) + + +def _gen_debug_tail(irb: IRBuilder): + """If the debug flag is activated in the environment, generate the debug tails for the code + + Args: + irb (IRBuilder): _description_ + """ + if env.debug: + _get_counters(irb) + + +def InsertGemminiHeaderOperators(): + """Pass to generate the calls to the Gemmini configuration instructions""" + + def _do_fold(stmt): + if _match_pragma(stmt, "add_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + ifm1_scale = config_dict["ifm1_scale"] + ifm2_scale = config_dict["ifm2_scale"] + scale = config_dict["scale"] + act = 1 if activation else 0 + + shrunk = 1 + irb.emit(tvm.tir.call_extern("", "gemmini_config_ex", mode, act, 0)) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + A_size, + ifm1_scale, + shrunk, + A_private_stride, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended4_config_ld", + B_size, + ifm2_scale, + shrunk, + B_private_stride, + 1, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended4_config_ld", C_size * 4, scale, 0, env.DIM, 2 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + + config_dict = _get_config_dict_from_str(stmt.body.value) + A_size = config_dict["A_size"] + B_size = config_dict["B_size"] + C_size = config_dict["C_size"] + A_private_stride = config_dict["A_private_stride"] + B_private_stride = config_dict["B_private_stride"] + execution_stride = config_dict["execution_stride"] + activation = config_dict["activation"] + mode = config_dict["mode"] + max_pixels_per_row = config_dict["max_pixels_per_row"] + scale = config_dict["scale"] + padding_value = config_dict["padding_value"] + act = 1 if activation else 0 + + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended_config_ex", mode, act, 0, execution_stride, 0, 0 + ) + ) + if padding_value == 0: + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended5_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + 0, + ) + ) + else: + 
irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended6_config_ld", + A_size, + 1.0, + 0, + A_private_stride, + max_pixels_per_row, + padding_value, + 0, + ) + ) + irb.emit( + tvm.tir.call_extern( + "", "gemmini_extended5_config_ld", B_size, 1.0, 0, B_private_stride, 1, 1 + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_extended4_config_ld", 0, 1.0, 0, env.DIM, 2)) + irb.emit(tvm.tir.call_extern("", "gemmini_extended_config_st", C_size, act, scale)) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "gemm_cisc_start"): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + irb.emit(tvm.tir.call_extern("", "gemmini_flush", 0)) + return tvm.tir.SeqStmt([irb.get(), stmt]) + elif _match_pragma(stmt, "conv2d_cisc_start") or _match_pragma( + stmt, "dw_conv2d_cisc_start" + ): + irb = tvm.tir.ir_builder.create() + _gen_debug_header(irb) + + return tvm.tir.SeqStmt([irb.get(), stmt]) + return None + + def _ftransform(f, mod, ctx): + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_header_operators" + ) + + +def InsertGemminiFenceOperator(): + """Pass to generate the call to the fence instruction at the end of the operator""" + + func_name = "" + + def _do_fold(stmt): + if _match_pragma(stmt, "gemm_end"): + irb = tvm.tir.ir_builder.create() + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + _gen_debug_tail(irb) + + return tvm.tir.SeqStmt([stmt, irb.get()]) + return None + + def _ftransform(f, mod, ctx): + func_name = f.attrs["global_symbol"] + return f.with_body( + tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) + ) + + return tvm.tir.transform.prim_func_pass( + _ftransform, opt_level=0, name="tir.gemmini.insert_fence_operators" + ) + + +def InjectAMVINIntrin(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin, _inject_copy) + + +def InjectAMVINIntrinTransposed(): + """Pass to inject A mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("A mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! 
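+            # Transposed load: shape[0] of the source buffer is passed as the
+            # column count and shape[1] as the row count of the mvin below.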
+ if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + dst_access_ptr = dst.access_ptr("w", "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint8") + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.A_mvin + "_t", _inject_copy) + + +def InjectBMVINIntrin(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "int32") + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + dst_access_ptr = dst.access_ptr("r", "int32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + wgt_base_address + dst_access_ptr, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin, _inject_copy) + + +def InjectBMVINIntrinTransposed(): + """Pass to inject B mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("B mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin2", + src.access_ptr("r"), + tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.B_mvin + "_t", _inject_copy) + + +def InjectDMVINIntrin(): + """Pass to inject D mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
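+ # Note: mvin3 targets Gemmini's accumulator. Subtracting 0x40000000 below
+ # appears to clear the accumulate bit of the local address encoding, so this
+ # D (bias) load overwrites the accumulator contents instead of adding to them.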
+ _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin, _inject_copy) + + +def InjectDMVINIntrinTransposed(): + """Pass to inject D mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("D mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin3", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.D_mvin + "_t", _inject_copy) + + +def InjectCMVOUTIntrin(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + elif dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + if len(dst.shape) == 1: + cols = 1 + else: + cols = dst.shape[1] + rows = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout, _inject_copy) + + +def InjectCMVOUTIntrinTransposed(): + """Pass to inject C mvout intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if src.scope() == "global": + raise RuntimeError("C mvout should have a local source") + elif dst.scope() == "global": + # Store + irb = tvm.tir.ir_builder.create() + # TODO (FP): check this pointers types again! 
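+ # Note: transposed variant, so rows/cols are taken from the swapped
+ # destination shape (rows = dst.shape[1], cols = dst.shape[0]) before
+ # issuing the mvout.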
+ if len(dst.shape) == 1: + rows = 1 + else: + rows = dst.shape[1] + cols = dst.shape[0] + out_access_ptr = src.access_ptr("w", "uint32") + get_full_width = tvm.runtime.const(0x00000000, "uint32") + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvout", + dst.access_ptr("w"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + out_access_ptr + - tvm.runtime.const(0x40000000, "uint32") + + get_full_width, + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvout + "_t", _inject_copy) + + +def InjectCMVINIntrin(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + cols = 1 + else: + cols = src.shape[1] + rows = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin, _inject_copy) + + +def InjectCMVINIntrinTransposed(): + """Pass to inject C mvin intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... + _ = pad_value + if dst.scope() == "global": + raise RuntimeError("C mvin should have a local destination") + elif src.scope() == "global": + # Load + irb = tvm.tir.ir_builder.create() + if len(src.shape) == 1: + rows = 1 + else: + rows = src.shape[1] + cols = src.shape[0] + irb.emit( + tvm.tir.call_extern( + "", + "gemmini_extended_mvin", + src.access_ptr("r"), + tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32") + + dst.access_ptr("w", "uint32") + - tvm.runtime.const(0x40000000, "uint32"), + cols, + rows, + ) + ) + return irb.get() + else: + raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + + return tvm.tir.transform.InjectCopyIntrin(env.C_mvin + "_t", _inject_copy) + + +def InjectCMVINAccumIntrin(): + """Pass to inject C mvin accum intrinsics. + + Returns + ------- + fpass : tvm.transform.Pass + The pass + """ + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + + def _inject_copy(src, dst, pad_before, pad_after, pad_value): + # TODO (FP): add padding support... 
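+ # Note: unlike the plain C mvin above, no 0x40000000 correction is applied
+ # here, so the accumulate bit of the local address stays set and the loaded
+ # values are added to the existing accumulator contents.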
+ _ = pad_value
+ if dst.scope() == "global":
+ raise RuntimeError("C mvin should have a local destination")
+ elif src.scope() == "global":
+ # Load
+ irb = tvm.tir.ir_builder.create()
+ if len(src.shape) == 1:
+ cols = 1
+ else:
+ cols = src.shape[1]
+ rows = src.shape[0]
+ irb.emit(
+ tvm.tir.call_extern(
+ "",
+ "gemmini_extended_mvin3",
+ src.access_ptr("r"),
+ tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32")
+ + dst.access_ptr("w", "uint32"),
+ cols,
+ rows,
+ )
+ )
+ return irb.get()
+ else:
+ raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope()))
+
+ return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum, _inject_copy)
+
+
+def InjectCMVINAccumIntrinTransposed():
+ """Pass to inject transposed C mvin accum intrinsics.
+
+ Returns
+ -------
+ fpass : tvm.transform.Pass
+ The pass
+ """
+ idxd = tvm.tir.indexdiv
+ idxm = tvm.tir.indexmod
+
+ def _inject_copy(src, dst, pad_before, pad_after, pad_value):
+ # TODO (FP): add padding support...
+ _ = pad_value
+ if dst.scope() == "global":
+ raise RuntimeError("C mvin should have a local destination")
+ elif src.scope() == "global":
+ # Load
+ irb = tvm.tir.ir_builder.create()
+ if len(src.shape) == 1:
+ rows = 1
+ else:
+ rows = src.shape[1]
+ cols = src.shape[0]
+ irb.emit(
+ tvm.tir.call_extern(
+ "",
+ "gemmini_extended_mvin3",
+ src.access_ptr("r"),
+ tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32")
+ + dst.access_ptr("w", "uint32"),
+ cols,
+ rows,
+ )
+ )
+ return irb.get()
+ else:
+ raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope()))
+
+ return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum + "_t", _inject_copy)
diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
new file mode 100644
index 000000000000..2c2527830858
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
@@ -0,0 +1,311 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# MobileNet tutorial\n",
+ "\n",
+ "This tutorial shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import os\n",
+ "import tvm.contrib.gemmini as gemmini\n",
+ "from tvm import relay\n",
+ "import tvm\n",
+ "from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly. A minimal example, assuming a hypothetical Chipyard checkout at `/home/user/chipyard`, would be:\n",
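+ "\n",
+ "```python\n",
+ "# Hypothetical path: point this at your own Chipyard checkout\n",
+ "os.environ[\"CHIPYARD_HOME\"] = \"/home/user/chipyard\"\n",
+ "```"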
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We clean and prepare the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "os.system(\"mkdir -p include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tflite_model_dir = generate_mobilenet_tflite_model()\n", + "\n", + "input_image = get_real_image(224, 224)\n", + "\n", + "tflite_model_file = os.path.join(tflite_model_dir)\n", + "tflite_model_buf = open(tflite_model_file, \"rb\").read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "tflite_res = run_tflite_model(tflite_model_buf, input_image)\n", + "tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]\n", + "print(\"Expected argmax = %i\" % (tflite_pred[0],))\n", + "print(\"Expected max labels = %s\" % (tflite_pred,))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_image, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", tflite_pred.astype(np.uint32), \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dtype_dict = {\"input\": input_image.dtype.name}\n",
+ "shape_dict = {\"input\": input_image.shape}\n",
+ "\n",
+ "mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)\n",
+ "mod = relay.transform.InferType()(mod)\n",
+ "mod[\"main\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mod = gemmini.preprocess_pass(mod)\n",
+ "mod[\"main\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n",
+ "\n",
+ "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+ "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+ "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+ "\n",
+ "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+ " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pathlib\n",
+ "\n",
+ "os.system(\"mkdir dev\")\n",
+ "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+ "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+ "\n",
+ "import tarfile\n",
+ "\n",
+ "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+ " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", + "project_options = {\n", + " \"project_type\": \"mobilenet_example\"\n", + "} \n", + "\n", + "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", + "generated_project = tvm.micro.generate_project(\n", + " template_project_path, module, generated_project_dir, project_options\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We build the project. This will generate an executable we can run on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.build()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we execute the compiled baremetal project on the Spike simulator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "generated_project.flash()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('tvm': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py new file mode 100644 index 000000000000..51e75fdd7022 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Utils to help generate the MobileNet TFLite model +===================== +**Author**: `Federico Peccia `_ +""" + +import os +from tvm.contrib.download import download_testdata +import numpy as np +import tensorflow as tf + + +def get_real_image(im_height, im_width): + from PIL import Image + + repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" + img_name = "elephant-299.jpg" + image_url = os.path.join(repo_base, img_name) + img_path = download_testdata(image_url, img_name, module="data") + image = Image.open(img_path).resize((im_height, im_width)) + x = np.array(image).astype("uint8") + data = np.reshape(x, (1, im_height, im_width, 3)) + return data + + +def run_tflite_model(tflite_model_buf, input_data): + """Generic function to execute TFLite""" + try: + from tensorflow import lite as interpreter_wrapper + except ImportError: + from tensorflow.contrib import lite as interpreter_wrapper + + input_data = input_data if isinstance(input_data, list) else [input_data] + + interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + # set input + assert len(input_data) == len(input_details) + for i in range(len(input_details)): + interpreter.set_tensor(input_details[i]["index"], input_data[i]) + + # Run + interpreter.invoke() + + # get output + tflite_output = list() + for i in range(len(output_details)): + tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) + + return tflite_output + + +def download_model(): + model_url = ( + "https://storage.googleapis.com/download.tensorflow.org/models/" + "tflite_11_05_08/mobilenet_v2_1.0_224.tgz" + ) + + # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite + model_path = download_testdata( + model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"] + ) + model_dir = os.path.dirname(model_path) + + return model_dir, model_path + + +def extract(path): + import tarfile + + if path.endswith("tgz") or path.endswith("gz"): + dir_path = os.path.dirname(path) + tar = tarfile.open(path) + tar.extractall(path=dir_path) + tar.close() + else: + raise RuntimeError("Could not decompress the file: " + path) + + +def create_tflite_model(model_dir: str): + # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0] + # return f"{model_dir}/{tflite_model_name}" + def representative_data_gen(): + dataset = [ + np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32) + for s in range(100) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0] + converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( + f"{model_dir}/{pb_file}", + input_arrays=["input"], + input_shapes={"input": [1, 224, 224, 3]}, + output_arrays=["MobilenetV2/Predictions/Reshape"], + ) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS] + converter.inference_input_type = tf.uint8 + converter.inference_output_type = tf.uint8 + converter.representative_dataset = representative_data_gen + converter._experimental_disable_per_channel = True + + tflite_model = converter.convert() + tflite_model_name = 
pb_file.replace(".pb", ".tflite") + with open(f"{model_dir}/{tflite_model_name}", "wb") as f: + f.write(tflite_model) + + return f"{model_dir}/{tflite_model_name}" + + +def generate_mobilenet_tflite_model(): + model_dir, model_path = download_model() + extract(model_path) + return create_tflite_model(model_dir) diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb new file mode 100644 index 000000000000..3bb2fa5788e9 --- /dev/null +++ b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb @@ -0,0 +1,395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add layer tutorial\n", + "\n", + "This tutorials shows how a quantized add layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.\n", + "\n", + "Note: This is an **experimental** layer!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.keras import layers\n", + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we define the parameters of the layer we want to test. In this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_height = 16\n", + "input_width = 16\n", + "input_channels = 16\n", + "activation = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " ),\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " ),\n", + " ]\n", + " )\n", + " def add(self, x, y):\n", + " if activation == 0:\n", + " return x + y\n", + " else:\n", + " return layers.Activation(\"relu\")(x + y)\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " (\n", + " np.array(\n", + " np.random.randint(\n", + " -127, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " ),\n", + " np.array(\n", + " np.random.randint(\n", + " 0, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " ),\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " yield [input_value[0], input_value[1]]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"add.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./add.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", + "interpreter.set_tensor(input_details[1][\"index\"], input_matrix_2)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input_1\", input_matrix_2, \"./include\")\n", + "gemmini.create_header_file(\"inputs\", \"data\", \"input_2\", input_matrix_1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mod, params = relay.frontend.from_tflite(\n",
+ " tflite_model,\n",
+ " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels), \"serving_default_y\": (1, input_height, input_width, input_channels)},\n",
+ " dtype_dict={\"serving_default_x\": input_dtype, \"serving_default_y\": input_dtype},\n",
+ ")\n",
+ "mod = relay.transform.InferType()(mod)\n",
+ "mod[\"main\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mod = gemmini.preprocess_pass(mod)\n",
+ "mod[\"main\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n",
+ "\n",
+ "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+ "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+ "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+ "\n",
+ "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+ " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pathlib\n",
+ "\n",
+ "os.system(\"mkdir dev\")\n",
+ "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+ "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+ "\n",
+ "import tarfile\n",
+ "\n",
+ "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+ " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
+ "project_options = {\n",
+ " \"project_type\": \"add_example\"\n",
+ "} \n",
+ "\n",
+ "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
+ "generated_project = tvm.micro.generate_project(\n",
+ " template_project_path, module, generated_project_dir, project_options\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We build the project. This will generate an executable we can run on the Spike simulator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_project.build()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
+ "\n",
+ "Note: if there are mismatches, they may be caused by rounding differences in the quantized arithmetic."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_project.flash()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.10 ('tvm': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb
new file mode 100644
index 000000000000..c7512586b809
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2D convolution layer tutorial\n",
+ "\n",
+ "This tutorial shows how a quantized 2D convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf\n",
+ "from tensorflow import keras\n",
+ "from tensorflow.keras import layers\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import tvm.contrib.gemmini as gemmini\n",
+ "from tvm import relay\n",
+ "import tvm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.environ[\"CHIPYARD_HOME\"] = \"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then we define the parameters of the layer we want to test. In this case:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_height = 16\n",
+ "input_width = 16\n",
+ "input_channels = 16\n",
+ "output_channels = 16\n",
+ "kernel_size = 3\n",
+ "stride = 1\n",
+ "padding = 'valid'\n",
+ "activation = None\n",
+ "bias = True\n",
+ "\n",
+ "# We can add a max pooling layer after the convolution. The integration can merge it with the convolution and execute both together on the Gemmini accelerator.\n",
+ "pool_size = 1\n",
+ "pool_stride = 1\n",
+ "pool_padding = 'valid'\n",
+ "use_pool = False"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "layer_sequence = [\n",
+ " layers.Conv2D(\n",
+ " output_channels,\n",
+ " kernel_size=kernel_size,\n",
+ " padding=padding,\n",
+ " activation=activation,\n",
+ " use_bias=True,\n",
+ " bias_initializer=\"ones\",\n",
+ " input_shape=(input_height, input_width, input_channels),\n",
+ " strides=stride,\n",
+ " )\n",
+ "]\n",
+ "if use_pool:\n",
+ " layer_sequence.append(\n",
+ " layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)\n",
+ " )\n",
+ "\n",
+ "model = keras.Sequential(layer_sequence)\n",
+ "\n",
+ "# Convert the concrete functions using TFLiteConverter\n",
+ "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+ "\n",
+ "def representative_data_gen():\n",
+ " dataset = [\n",
+ " np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)\n",
+ " for s in range(10)\n",
+ " ]\n",
+ " for input_value in dataset:\n",
+ " # Model has only one input so each data point has one element.\n",
+ " yield [input_value]\n",
+ "\n",
+ "\n",
+ "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+ "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+ "converter.inference_input_type = tf.uint8\n",
+ "converter.inference_output_type = tf.int8\n",
+ "converter.representative_dataset = representative_data_gen\n",
+ "converter._experimental_disable_per_channel = True\n",
+ "\n",
+ "tflite_model = converter.convert()\n",
+ "\n",
+ "# Save the model.\n",
+ "with open(\"conv.tflite\", \"wb\") as f:\n",
+ " f.write(tflite_model)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./conv.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=\"./conv.tflite\")\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix)\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n",
+ "\n",
+ "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+ "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+ "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+ "\n",
+ "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+ " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pathlib\n",
+ "\n",
+ "os.system(\"mkdir dev\")\n",
+ "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+ "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+ "\n",
+ "import tarfile\n",
+ "\n",
+ "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+ " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
+ "project_options = {\n",
+ " \"project_type\": \"conv2d_example\"\n",
+ "} \n",
+ "\n",
+ "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
+ "generated_project = tvm.micro.generate_project(\n",
+ " template_project_path, module, generated_project_dir, project_options\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We build the project. This will generate an executable we can run on the Spike simulator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_project.build()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
+ "\n",
+ "Note: if there are mismatches, they may be caused by rounding differences in the quantized arithmetic."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_project.flash()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.8.10 ('tvm': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb
new file mode 100644
index 000000000000..d1959f66b72a
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Dense layer tutorial\n",
+ "\n",
+ "This tutorial shows how a quantized dense (fully connected) layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import tvm.contrib.gemmini as gemmini\n",
+ "from tvm import relay\n",
+ "import tvm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.environ[\"CHIPYARD_HOME\"] = \"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then we define the parameters of the layer we want to test. In this case:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_height = 32\n",
+ "input_width = 32\n",
+ "output_width = 32"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + " self.w = tf.Variable(tf.random.normal([input_width, output_width]), name=\"w\")\n", + " self.b = tf.Variable(tf.random.normal([output_width]), name=\"b\")\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),\n", + " ]\n", + " )\n", + " def matmul(self, x):\n", + " return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " (\n", + " np.array(\n", + " np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32\n", + " ),\n", + " np.array(\n", + " np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32\n", + " ),\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " # Model has only one input so each data point has one element.\n", + " yield [input_value[0]]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"matmul.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./matmul.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)\n", + "interpreter.set_tensor(input_details[0][\"index\"], input1)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model,\n", + " shape_dict={\n", + " \"serving_default_x:0\": (input_height, input_width),\n", + " },\n", + " dtype_dict={\n", + " \"serving_default_x:0\": input_dtype,\n", + " },\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n",
+ "\n",
+ "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+ "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+ "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+ "\n",
+ "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+ " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pathlib\n",
+ "\n",
+ "os.system(\"mkdir dev\")\n",
+ "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+ "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+ "\n",
+ "import tarfile\n",
+ "\n",
+ "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+ " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
+ "project_options = {\n",
+ " \"project_type\": \"dense_example\"\n",
+ "} \n",
+ "\n",
+ "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
+ "generated_project = tvm.micro.generate_project(\n",
+ " template_project_path, module, generated_project_dir, project_options\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We build the project. This will generate an executable we can run on the Spike simulator."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_project.build()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
+ "\n",
+ "Note: if there are mismatches, they may be caused by rounding differences in the quantized arithmetic."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.flash()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 ('tvm': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
new file mode 100644
index 000000000000..b5753a300401
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
@@ -0,0 +1,373 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2D depthwise convolution layer tutorial\n",
+    "\n",
+    "This tutorial shows how a quantized 2D depthwise convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow import keras\n",
+    "from tensorflow.keras import layers\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import tvm.contrib.gemmini as gemmini\n",
+    "from tvm import relay\n",
+    "import tvm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\"CHIPYARD_HOME\"] = \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we define the parameters of the layer we want to test. In this case:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_height = 112\n",
+    "input_width = 112\n",
+    "input_channels = 32\n",
+    "kernel_size = 3\n",
+    "stride = 1\n",
+    "padding = 'same'\n",
+    "activation = None\n",
+    "bias = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.Sequential(\n",
+    "    [\n",
+    "        layers.DepthwiseConv2D(\n",
+    "            kernel_size=kernel_size,\n",
+    "            padding=padding,\n",
+    "            activation=activation,\n",
+    "            use_bias=True,\n",
+    "            bias_initializer=\"ones\",\n",
+    "            input_shape=(input_height, input_width, input_channels),\n",
+    "            strides=stride,\n",
+    "        )\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "# Convert the Keras model using TFLiteConverter\n",
+    "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n",
+    "\n",
+    "def representative_data_gen():\n",
+    "    dataset = [\n",
+    "        np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32)\n",
+    "        for s in range(10)\n",
+    "    ]\n",
+    "    for input_value in dataset:\n",
+    "        # Model has only one input, so each data point has one element.\n",
+    "        yield [input_value]\n",
+    "\n",
+    "\n",
+    "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n",
+    "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n",
+    "converter.inference_input_type = tf.uint8\n",
+    "converter.inference_output_type = tf.int8\n",
+    "converter.representative_dataset = representative_data_gen\n",
+    "converter._experimental_disable_per_channel = True\n",
+    "\n",
+    "tflite_model = converter.convert()\n",
+    "\n",
+    "# Save the model.\n",
+    "with open(\"dwconv.tflite\", \"wb\") as f:\n",
+    "    f.write(tflite_model)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n",
+    "\n",
+    "tflite_file = \"./dwconv.tflite\"\n",
+    "tflite_model_buf = open(tflite_file, \"rb\").read()\n",
+    "input_tensor = \"layer1_input\"\n",
+    "input_dtype = \"uint8\"\n",
+    "\n",
+    "os.system(\"mkdir -p include\")\n",
+    "\n",
+    "try:\n",
+    "    import tflite\n",
+    "\n",
+    "    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n",
+    "except AttributeError:\n",
+    "    import tflite.Model\n",
+    "\n",
+    "    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n",
+    "\n",
+    "# Load the TFLite model and allocate tensors.\n",
+    "interpreter = tf.lite.Interpreter(model_path=\"./dwconv.tflite\")\n",
+    "interpreter.allocate_tensors()\n",
+    "input_details = interpreter.get_input_details()\n",
+    "output_details = interpreter.get_output_details()\n",
+    "tensor_details = interpreter.get_tensor_details()\n",
+    "\n",
+    "input_data = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8)\n",
+    "interpreter.set_tensor(input_details[0][\"index\"], input_data)\n",
+    "\n",
+    "interpreter.invoke()\n",
+    "expected_output = interpreter.get_tensor(output_details[0][\"index\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.\n",
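+    "\n",
+    "As a side note, the TFLite interpreter exposes the quantization parameters the converter chose; inspecting them can help when interpreting mismatches later on (standard TFLite API, shown only as an optional aside):\n",
+    "\n",
+    "```python\n",
+    "# Scale and zero point of the quantized output tensor.\n",
+    "out_scale, out_zero_point = output_details[0][\"quantization\"]\n",
+    "print(\"output scale:\", out_scale, \"zero point:\", out_zero_point)\n",
+    "```"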
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_data, \"./include\")\n",
+    "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The TFLite model generated in the previous steps is now imported into TVM."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod, params = relay.frontend.from_tflite(\n",
+    "    tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n",
+    ")\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mod = gemmini.preprocess_pass(mod)\n",
+    "mod[\"main\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, we build the Relay Graph. Notice that we are using the CRT runtime, that the target is C because we want to generate C code (while the device is Gemmini), and that we use the AOT executor and the USMP feature in order to get complete bare-metal C code, without calls to memory allocator APIs.\n",
+    "\n",
+    "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+    "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+    "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+    "\n",
+    "with gemmini.build_config(usmp_alg=\"hill_climb\", opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+    "    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "\n",
+    "os.system(\"mkdir dev\")\n",
+    "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+    "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+    "\n",
+    "import tarfile\n",
+    "\n",
+    "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+    "    print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
+    "project_options = {\n",
+    "    \"project_type\": \"dwconv2d_example\"\n",
+    "}\n",
+    "\n",
+    "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
+    "generated_project = tvm.micro.generate_project(\n",
+    "    template_project_path, module, generated_project_dir, project_options\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We build the project. This will generate an executable we can run on the Spike simulator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.build()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
+    "\n",
+    "Note: if mismatches are reported, they may be caused by rounding differences between the Gemmini result and the TFLite reference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.flash()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 ('tvm': venv)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb
new file mode 100644
index 000000000000..bdee93760f96
--- /dev/null
+++ b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2D max pooling layer tutorial\n",
+    "\n",
+    "This tutorial shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.keras import layers\n", + "import numpy as np\n", + "import os\n", + "import tvm.contrib.gemmini as gemmini\n", + "from tvm import relay\n", + "import tvm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"CHIPYARD_HOME\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we define the parameters of the layer we want to test. In this case:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_height = 16\n", + "input_width = 16\n", + "input_channels = 16\n", + "pool_size = 2\n", + "pool_stride = 1\n", + "pool_padding = 'valid'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Model(tf.Module):\n", + " def __init__(self, name=None):\n", + " super().__init__(name)\n", + "\n", + " @tf.function(\n", + " input_signature=[\n", + " tf.TensorSpec(\n", + " shape=[1, input_height, input_width, input_channels],\n", + " dtype=tf.float32,\n", + " )\n", + " ]\n", + " )\n", + " def maxpool(self, x):\n", + " return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)\n", + "\n", + "model = Model()\n", + "\n", + "# Convert the concrete functions using TFLiteConverter\n", + "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", + "\n", + "\n", + "def representative_data_gen():\n", + " dataset = [\n", + " np.array(\n", + " np.random.randint(\n", + " -127, 128, size=(1, input_height, input_width, input_channels)\n", + " ),\n", + " dtype=np.float32,\n", + " )\n", + " for s in range(100)\n", + " ]\n", + " for input_value in dataset:\n", + " # Model has only one input so each data point has one element.\n", + " yield [input_value]\n", + "\n", + "\n", + "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", + "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", + "converter.inference_input_type = tf.uint8\n", + "converter.inference_output_type = tf.int8\n", + "converter.representative_dataset = representative_data_gen\n", + "converter._experimental_disable_per_channel = True\n", + "\n", + "tflite_model = converter.convert()\n", + "\n", + "# Save the model.\n", + "with open(\"maxpool.tflite\", \"wb\") as f:\n", + " f.write(tflite_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", + "\n", + "tflite_file = \"./maxpool.tflite\"\n", + "tflite_model_buf = open(tflite_file, \"rb\").read()\n", + "input_tensor = \"layer1_input\"\n", + "input_dtype = \"uint8\"\n", + "\n", + "os.system(\"mkdir -p include\")\n", + "\n", + "try:\n", + " import tflite\n", + "\n", + " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "except AttributeError:\n", + " import tflite.Model\n", + "\n", + " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", + "\n", + "# Load the TFLite model and allocate tensors.\n", + "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", + "interpreter.allocate_tensors()\n", + "input_details = interpreter.get_input_details()\n", + "output_details = interpreter.get_output_details()\n", + "tensor_details = interpreter.get_tensor_details()\n", + "\n", + "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", + "\n", + "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", + "\n", + "interpreter.invoke()\n", + "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix_1, \"./include\")\n", + "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The TFLite model generated in the previous steps is now imported into TVM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod, params = relay.frontend.from_tflite(\n", + " tflite_model,\n", + " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels)},\n", + " dtype_dict={\"serving_default_x\": input_dtype},\n", + ")\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mod = gemmini.preprocess_pass(mod)\n", + "mod[\"main\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, that the target is C because we want to generate C code (while the device is Gemmini), and that we use the AOT executor and the USMP feature in order to get complete bare-metal C code, without calls to memory allocator APIs.\n",
+    "\n",
+    "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n",
+    "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n",
+    "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n",
+    "\n",
+    "with gemmini.build_config(usmp_alg=\"hill_climb\", opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n",
+    "    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The built model is exported to the model library format. This will be used in the next steps to generate the baremetal project."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pathlib\n",
+    "\n",
+    "os.system(\"mkdir dev\")\n",
+    "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
+    "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
+    "\n",
+    "import tarfile\n",
+    "\n",
+    "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
+    "    print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
+    "project_options = {\n",
+    "    \"project_type\": \"maxpool2d_example\"\n",
+    "}\n",
+    "\n",
+    "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
+    "generated_project = tvm.micro.generate_project(\n",
+    "    template_project_path, module, generated_project_dir, project_options\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We build the project. This will generate an executable we can run on the Spike simulator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.build()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
+    "\n",
+    "Note: if mismatches are reported, they may be caused by rounding differences between the Gemmini result and the TFLite reference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "generated_project.flash()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.8.10 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py
new file mode 100644
index 000000000000..1f9d6b26134f
--- /dev/null
+++ b/python/tvm/contrib/gemmini/utils.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Useful enumerations and others
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+from enum import Enum
+
+counters = {
+    1: "MAIN_LD_CYCLES",
+    2: "MAIN_ST_CYCLES",
+    3: "MAIN_EX_CYCLES",
+    4: "MAIN_LD_ST_CYCLES",
+    5: "MAIN_LD_EX_CYCLES",
+    6: "MAIN_ST_EX_CYCLES",
+    7: "MAIN_LD_ST_EX_CYCLES",
+    8: "LOAD_DMA_WAIT_CYCLE",
+    9: "LOAD_ACTIVE_CYCLE",
+    10: "LOAD_SCRATCHPAD_WAIT_CYCLE",
+    11: "STORE_DMA_WAIT_CYCLE",
+    12: "STORE_ACTIVE_CYCLE",
+    13: "STORE_POOLING_CYCLE",
+    14: "STORE_SCRATCHPAD_WAIT_CYCLE",
+    15: "DMA_TLB_MISS_CYCLE",
+    16: "DMA_TLB_HIT_REQ",
+    17: "DMA_TLB_TOTAL_REQ",
+    18: "RDMA_ACTIVE_CYCLE",
+    19: "RDMA_TLB_WAIT_CYCLES",
+    20: "RDMA_TL_WAIT_CYCLES",
+    21: "WDMA_ACTIVE_CYCLE",
+    22: "WDMA_TLB_WAIT_CYCLES",
+    23: "WDMA_TL_WAIT_CYCLES",
+    24: "EXE_ACTIVE_CYCLE",
+    25: "EXE_FLUSH_CYCLE",
+    26: "EXE_CONTROL_Q_BLOCK_CYCLE",
+    27: "EXE_PRELOAD_HAZ_CYCLE",
+    28: "EXE_OVERLAP_HAZ_CYCLE",
+    29: "SCRATCHPAD_A_WAIT_CYCLE",
+    30: "SCRATCHPAD_B_WAIT_CYCLE",
+    31: "SCRATCHPAD_D_WAIT_CYCLE",
+    32: "ACC_A_WAIT_CYCLE",
+    33: "ACC_B_WAIT_CYCLE",
+    34: "ACC_D_WAIT_CYCLE",
+    35: "A_GARBAGE_CYCLES",
+    36: "B_GARBAGE_CYCLES",
+    37: "D_GARBAGE_CYCLES",
+    38: "IM2COL_MEM_CYCLES",
+    39: "IM2COL_ACTIVE_CYCLES",
+    40: "IM2COL_TRANSPOSER_WAIT_CYCLE",
+    41: "RESERVATION_STATION_FULL_CYCLES",
+    42: "RESERVATION_STATION_ACTIVE_CYCLES",
+    43: "LOOP_MATMUL_ACTIVE_CYCLES",
+    44: "TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES",
+    45: "RESERVATION_STATION_LD_COUNT",
+    46: "RESERVATION_STATION_ST_COUNT",
+    47: "RESERVATION_STATION_EX_COUNT",
+    48: "RDMA_BYTES_REC",
+    49: "WDMA_BYTES_SENT",
+    50: "RDMA_TOTAL_LATENCY",
+    51: "WDMA_TOTAL_LATENCY",
+}
+
+
+class ClipArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of clip arguments.
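+
+    Index 0 is assumed to be the clipped input tensor itself,
+    which is why the minimum and maximum start at position 1.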
+ """ + + A_MIN = 1 + A_MAX = 2 + + +class BinaryElementwiseArgs(Enum): + """This is a helper enums to access the correct index + of binary elementwise arguments + """ + + IFM1 = 0 + IFM2 = 1 + IFM1_SCALE = 2 + IFM1_ZERO_POINT = 3 + IFM2_SCALE = 4 + IFM2_ZERO_POINT = 5 + OFM_SCALE = 6 + OFM_ZERO_POINT = 7 + + +class QDenseArgs(Enum): + """ + This is a helper enum to access the correct index of + qnn.dense arguments + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class QConv2DArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.conv2d arguments. + """ + + IFM = 0 + WEIGHTS = 1 + IFM_ZERO_POINT = 2 + WEIGHTS_ZERO_POINT = 3 + IFM_SCALE = 4 + WEIGHTS_SCALE = 5 + + +class RequantArgs(Enum): + """ + This is a helper enum to obtain the correct index + of qnn.requantize arguments. + """ + + IFM_SCALE = 1 + IFM_ZERO_POINT = 2 + OFM_SCALE = 3 + OFM_ZERO_POINT = 4 diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 92574ce2f8c2..b05d0d60d47a 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -38,6 +38,7 @@ class MicroTVMTemplateProject(enum.Enum): ZEPHYR = "zephyr" ARDUINO = "arduino" CRT = "crt" + GEMMINI = "gemmini" @classmethod def list(cls): diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 1ba9f5e73395..5f8469463997 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -519,7 +519,7 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, temp """ targets = [] for target in mod.ir_module_by_target.keys(): - if str(target.kind) not in ("llvm", "c"): + if str(target.kind) not in ("llvm", "c", "gemmini"): raise UnsupportedInModelLibraryFormatError( f"Operator has non-DSO-exportable target {target!s}, which is not yet supported in " "Model Library Format" diff --git a/python/tvm/relay/backend/contrib/gemmini/__init__.py b/python/tvm/relay/backend/contrib/gemmini/__init__.py new file mode 100644 index 000000000000..b68c070cbed9 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/__init__.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Gemmini operators compute and schedule declarations +===================== +**Author**: `Federico Peccia `_ +""" + +from . import op diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py new file mode 100644 index 000000000000..9f7837c076e5 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -0,0 +1,214 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Add operator declaration and schedule registration for Gemmini
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.add")
+def add(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    ifm1: tvm.te.tensor.Tensor,
+    ifm2: tvm.te.tensor.Tensor,
+    ofm_offset: tvm.te.tensor.Tensor,
+    ifm1_scale: float,
+    ifm2_scale: float,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's add operator
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        ifm1 (tvm.te.tensor.Tensor): input tensor 1
+        ifm2 (tvm.te.tensor.Tensor): input tensor 2
+        ofm_offset (tvm.te.tensor.Tensor): offset tensor
+        ifm1_scale (float): scaling factor for input tensor 1
+        ifm2_scale (float): scaling factor for input tensor 2
+
+    Raises:
+        topi.InvalidShapeError: if input shapes are not supported
+
+    Returns:
+        tvm.te.tensor.Tensor: add operator result
+    """
+
+    # Make sure that the input shapes make sense
+    if len(ifm1.shape) != 4 or len(ifm2.shape) != 4 or len(ofm_offset.shape) != 4:
+        raise topi.InvalidShapeError()
+
+    # Derive shapes
+    oshape = topi.utils.get_const_tuple(ifm1.shape)
+
+    tensor_type = env.inp_dtype
+
+    ofm_offset_stage = te.compute(
+        oshape,
+        lambda b, x, y, c: ofm_offset[b, x, y, c].astype(tensor_type),
+        name="ofm_offset.local",
+        tag="ofm_offset",
+    )
+    ifm2_stage = te.compute(
+        oshape,
+        lambda b, x, y, c: ifm2[b, x, y, c].astype(tensor_type)
+        + ofm_offset_stage[b, x, y, c].astype(tensor_type),
+        name="ifm2.local",
+        tag="ifm2",
+    )
+    res = te.compute(
+        oshape,
+        lambda b, x, y, c: ifm1[b, x, y, c].astype(tensor_type)
+        + ifm2_stage[b, x, y, c].astype(tensor_type),
+        name="res",
+        tag="add",
+        attrs={
+            "ifm1_scale": ifm1_scale,
+            "ifm2_scale": ifm2_scale,
+        },
+    )
+
+    cfg.add_flop(
+        # element additions needed
+        3 * np.prod(topi.utils.get_const_tuple(oshape))
+        # element multiplications needed (input scaling)
+        + 2 * np.prod(topi.utils.get_const_tuple(oshape))
+    )
+
+    return res
+
+
+@autotvm.register_topi_schedule("contrib.gemmini.add")
+def schedule_add(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array
+) -> tvm.te.schedule.Schedule:
+    """Schedule definition for Gemmini's add operator
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        outs (tvm.ir.container.Array): Output tensors
+
+    Returns:
+        tvm.te.schedule.Schedule: transformed schedule
+    """
+
+    assert len(outs) == 1
+    output = outs[0]
+
+    add_stage = output.op.output(0)
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    sch = te.create_schedule([x.op for x in outs])
+
+    ifm1, ifm2_op = add_stage.op.input_tensors
+    ifm2, ofm_offset_op = ifm2_op.op.input_tensors
+    ofm_offset = ofm_offset_op.op.input_tensors[0]
+
+    b, x, y, c = sch[add_stage].op.axis
+
+    # Prepare the scope of each buffer
+    cifm1 = sch.cache_read(ifm1, env.acc_scope, [add_stage])
+    sch[ifm2_op].set_scope(env.acc_scope)
+    sch[ofm_offset_op].set_scope(env.acc_scope)
+
+    # Split axes, taking into account the maximum number of rows and columns that can be moved into Gemmini's accumulator (DIM)
+    y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent))
+    x_factor = get_greater_div(int(sch[add_stage].op.axis[2].dom.extent))
+    y_o, y_i = sch[add_stage].split(sch[add_stage].op.axis[3], factor=y_factor)
+    x_o, x_i = sch[add_stage].split(sch[add_stage].op.axis[2], factor=x_factor)
+    sch[add_stage].reorder(x_o, y_o, x_i, y_i)
+
+    # Compute the stages in the correct position
+    sch[cifm1].compute_at(sch[add_stage], y_o)
+    sch[ifm2_op].compute_at(sch[add_stage], y_o)
+    sch[ofm_offset_op].compute_at(sch[add_stage], y_o)
+
+    # Split axes, taking into account the maximum number of rows and columns that can be moved into Gemmini's accumulator (DIM)
+    cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=env.DIM)
+    cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split(
+        sch[cifm1].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM
+    )
+    sch[cifm1].reorder(cifm1_ax_0_1, cifm1_ax_1_1, cifm1_ax_0_2, cifm1_ax_1_2)
+
+    cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=env.DIM)
+    cifm2_ax_1_1, cifm2_ax_1_2 = sch[ifm2_op].split(
+        sch[ifm2_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM
+    )
+    sch[ifm2_op].reorder(cifm2_ax_0_1, cifm2_ax_1_1, cifm2_ax_0_2, cifm2_ax_1_2)
+
+    cofm_offset_ax_0_1, cofm_offset_ax_0_2 = sch[ofm_offset_op].split(
+        sch[ofm_offset_op].op.axis[2], factor=env.DIM
+    )
+    cofm_offset_ax_1_1, cofm_offset_ax_1_2 = sch[ofm_offset_op].split(
+        sch[ofm_offset_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM
+    )
+    sch[ofm_offset_op].reorder(
+        cofm_offset_ax_0_1, cofm_offset_ax_1_1, cofm_offset_ax_0_2, cofm_offset_ax_1_2
+    )
+
+    # Set pragmas to insert mvin instructions
+    oshape = (x_factor, y_factor)
+    if x_factor == 1:
+        sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin + "_t")
+        sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum + "_t")
+    else:
+        sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin)
+        sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum)
+
+    # Tensorize
+    sch[ifm2_op].tensorize(cifm2_ax_0_2, env.add_tensorize(oshape))
+    sch[add_stage].tensorize(x_i, env.add_mvout_tensorize(oshape))
+
+    # Create configuration dictionary
+    config_dict = {}
+    config_dict["A_size"] = int(ifm1.shape[3])
+    config_dict["B_size"] = int(ifm2.shape[3])
+    config_dict["C_size"] = int(output.shape[3])
+    config_dict["A_private_stride"] = env.DIM
+    config_dict["B_private_stride"] = env.DIM
+    config_dict["execution_stride"] = 1
+    config_dict["activation"] = 0
+    config_dict["mode"] = env.WEIGHT_STATIONARY
+    config_dict["max_pixels_per_row"] = 1
+    config_dict["ifm1_scale"] = float(add_stage.op.attrs["ifm1_scale"])
+    config_dict["ifm2_scale"] = float(add_stage.op.attrs["ifm2_scale"])
+    config_dict["scale"] = 1.0
+
+    # Set pragmas to configure the start and end of the Gemmini code
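+    # (The "configs" pragma attached below carries the stringified dictionary;
+    # the Gemmini-specific TIR passes are expected to parse it back when they
+    # emit the actual configuration instructions for the accelerator.)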
+    sch[output].pragma(sch[output].op.axis[0], "add_start")
+    sch[output].pragma(sch[output].op.axis[0], "configs", str(config_dict))
+    sch[output].pragma(sch[output].op.axis[0], "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py
new file mode 100644
index 000000000000..6d129a0e8b0f
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py
@@ -0,0 +1,244 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Conv2d operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.conv2d_cisc")
+def conv2d_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    orig_data: tvm.te.tensor.Tensor,
+    kernel: tvm.te.tensor.Tensor,
+    bias: tvm.te.tensor.Tensor,
+    strides: tvm.ir.container.Array,
+    padding: tvm.ir.container.Array,
+    ifm_offset: int,
+    activation: int,
+    gemmini_scale: float,
+    pool_size: tvm.ir.container.Array,
+    pool_strides: tvm.ir.container.Array,
+    pool_dilation: tvm.ir.container.Array,
+    pool_padding: tvm.ir.container.Array,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's conv2d operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        orig_data (tvm.te.tensor.Tensor): Input feature map
+        kernel (tvm.te.tensor.Tensor): Layer weights
+        bias (tvm.te.tensor.Tensor): Layer biases
+        strides (tvm.ir.container.Array): convolution strides
+        padding (tvm.ir.container.Array): input feature map padding
+        ifm_offset (int): input feature map offset (used for the padding of the input feature map)
+        activation (int): whether an activation is applied
+        gemmini_scale (float): output scaling factor
+        pool_size (tvm.ir.container.Array): size of the output pooling window
+        pool_strides (tvm.ir.container.Array): strides for the output pooling window
+        pool_dilation (tvm.ir.container.Array): dilation for the output pooling window (not used!)
+ pool_padding (tvm.ir.container.Array): padding for the output pooling window + + Returns: + tvm.te.tensor.Tensor: conv2d operator result + """ + assert len(orig_data.shape) == 4 + assert len(kernel.shape) == 4 + assert len(bias.shape) == 1 + assert ( + orig_data.shape[1] == orig_data.shape[2] + ), "GEMMINIs Conv2d CISC schedule only supports square inputs!" + + OC = kernel.shape[3] + KH = kernel.shape[0] + KW = kernel.shape[1] + + N = orig_data.shape[0] + IH = orig_data.shape[1] + IW = orig_data.shape[2] + IC = orig_data.shape[3] + + HSTR = strides[0] + WSTR = strides[1] + TOP_PAD = padding[0] + LEFT_PAD = padding[1] + BOTTOM_PAD = padding[2] + RIGHT_PAD = padding[3] + + OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) + OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + + ric = te.reduce_axis((0, IC), name="ric") + rkh = te.reduce_axis((0, KH), name="rkh") + rkw = te.reduce_axis((0, KW), name="rkw") + + oshape = (N, OH, OW, OC) + + if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): + # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + data = orig_data + else: + # If not, then pad before calling Gemminis functions + data = topi.nn.pad( + orig_data, + [0, TOP_PAD, LEFT_PAD, 0], + [0, BOTTOM_PAD, RIGHT_PAD, 0], + pad_value=ifm_offset, + name="pad_data", + ) + + res = te.compute( + oshape, + lambda b_o, i, j, c_o: te.sum( + data[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + * kernel[rkh, rkw, ric, c_o].astype(env.inp_dtype) + + bias[c_o].astype(env.inp_dtype), + axis=[rkh, rkw, ric], + ), + name="res", + tag="conv2d", + attrs={ + "activation": activation, + "strides": [HSTR, WSTR], + "padding": padding, + "padding_value": ifm_offset, + "scale": gemmini_scale, + "pool_size": pool_size, + "pool_strides": pool_strides, + "pool_dilation": pool_dilation, + "pool_padding": pool_padding, + }, + ) + + cfg.add_flop( + np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC + + np.prod(topi.utils.get_const_tuple(oshape)) + * (KH * KW * IC - 1) # Multiplications and additions needed + + np.prod( # Additions needed + topi.utils.get_const_tuple(oshape) + ) # Output scaling multiplications + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.conv2d_cisc") +def schedule_conv2d_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's conv2d operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + const_ops = [] + ewise_inputs = [] + ewise_ops = [] + conv2d_res = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + if op.tag == "conv2d": + conv2d_res.append(op) + else: + for tensor in op.input_tensors: + _traverse(tensor.op) + + _traverse(output.op) + assert len(conv2d_res) == 1 + conv2d_stage = conv2d_res[0].output(0) + sch = te.create_schedule(output.op) + + data, kernel, 
bias = conv2d_stage.op.input_tensors
+
+    if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
+        temp = data.op.input_tensors[0]
+        pad_data = data
+        data = temp
+    else:
+        pad_data = data
+
+    orig_kernel = kernel
+
+    x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis
+    rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis
+
+    x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0])
+
+    axis_for_start = x_bo_o
+
+    # If topi.nn.pad was added, it's because the padding was not equal in all dimensions.
+    padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0]
+    padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0
+
+    # Apply tensorization
+    sch[conv2d_stage].tensorize(
+        x_bo_i,
+        env.conv2d_cisc(
+            pad_data.shape,
+            kernel.shape,
+            bias.shape,
+            conv2d_stage.shape,
+            conv2d_stage.op.attrs["strides"],
+            padding_for_C_code,
+            padding_value_for_C_code,
+            conv2d_stage.op.attrs["activation"],
+            conv2d_stage.op.attrs["scale"],
+            conv2d_stage.op.attrs["pool_size"],
+            conv2d_stage.op.attrs["pool_strides"],
+            conv2d_stage.op.attrs["pool_dilation"],
+            conv2d_stage.op.attrs["pool_padding"],
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini related code
+    sch[conv2d_stage].pragma(axis_for_start, "conv2d_cisc_start")
+    sch[conv2d_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py
new file mode 100644
index 000000000000..03051f193638
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py
@@ -0,0 +1,377 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's intrinsic instructions +===================== +**Author**: `Federico Peccia `_ +""" + +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.build_module import lower +from tvm.contrib.gemmini.helpers import get_greater_div + +env = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.gemm") +def gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weight: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weight (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: dense operator result + """ + + # Derive shapes + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) + oshape = (data.shape[0], weight.shape[1]) + + # Reduction axes (input channel) + assert ishape[1] == wshape[0] + k_o = te.reduce_axis((0, wshape[0]), name="k_o") + + bias_stage = te.compute( + oshape, + lambda x_o, y_o: bias[y_o].astype(env.inp_dtype), + name="bias.local.accumulator", + tag="bias_add", + ) + + res = te.compute( + oshape, + lambda x_o, y_o: te.sum( + data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) + + bias_stage[x_o, y_o].astype(env.inp_dtype), + axis=[k_o], + ), + name="res", + tag="dense", + attrs={"scale": scale}, + ) + + cfg.add_flop( + (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1]) # element multiplications + + np.prod(topi.utils.get_const_tuple(oshape)) # bias additions + ) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.gemm") +def schedule_gemm( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's dense operator using intrinsic instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + + assert len(outs) == 1 + output = outs[0] + + dense_stage = output.op.output(0) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) + + data, weight, bias_op = dense_stage.op.input_tensors + bias = bias_op.op.input_tensors[0] + + ##### space definition begin ##### + x, y = sch[dense_stage].op.axis + (z,) = sch[dense_stage].op.reduce_axis + + # TODO (FP): add limits for scratchpad and accumulator sizes perhaps? 
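+    # The three tile_* splits below constrain the innermost factor to (at most)
+    # Gemmini's systolic array dimension DIM, so that each tensorized inner
+    # block maps onto a single DIM x DIM tile of the accelerator.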
+ cfg.define_split( + "tile_xo", + x, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(data.shape[0])) + if (data.shape[0] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + cfg.define_split( + "tile_yo", + y, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[1])) + if (weight.shape[1] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + cfg.define_split( + "tile_zo", + z, + num_outputs=3, + policy="power2", + filter=lambda ax: ( + ax.size[-1] == get_greater_div(int(weight.shape[0])) + if (weight.shape[0] >= env.DIM) + else ax.size[-1] <= env.DIM + ), + ) + + # accumulate_multiple_patches knob + # 2: only one patch is computed in the accumulator + # 1: More than one patch is computed in the accumulator, depends on tile_yo + # 0: More than one patch is computed in the accumulator, depends on tile_yo AND tile_xo + cfg.define_knob("accumulate_multiple_patches", [0, 1, 2]) + # exchange axis + # exchange the order of axis x and y + cfg.define_knob("exchange_axis", [False, True]) + # WS/OS + # 0: Gemmini will be configured as output stationary + # 1: Gemmini will be configured as weight stationary + cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + # mvout_big_block + # False: generate mvout instructions moving as maximum DIM columns + # True: generate mvout instructions moving more than DIM columns + cfg.define_knob("mvout_big_block", [True, False]) + if cfg.is_fallback: + # Load default split values + cfg["tile_xo"] = SplitEntity([-1, 8, get_greater_div(int(data.shape[0]))]) + cfg["tile_yo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[1]))]) + cfg["tile_zo"] = SplitEntity([-1, 8, get_greater_div(int(weight.shape[0]))]) + cfg["accumulate_multiple_patches"] = OtherOptionEntity(0) + cfg["exchange_axis"] = OtherOptionEntity(False) + cfg["mvout_big_block"] = OtherOptionEntity(True) + cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + + ###### space definition end ###### + + cdata = sch.cache_read(data, env.scr_scope, [dense_stage]) + cweight = sch.cache_read(weight, env.scr_wgt_scope, [dense_stage]) + dense_stage_acc = sch.cache_write(output, env.acc_scope) + sch[bias_op].set_scope(env.acc_scope) + (x_, y_) = sch[dense_stage_acc].op.axis + (z_,) = sch[dense_stage_acc].op.reduce_axis + + # Split loops to generate the inner dimensions specified by knobs tile_xo and tile_yo + b_y, yo, yi = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) + b_x, xo, xi = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[output].reorder(b_y, b_x, yo, xo, yi, xi) + else: + sch[output].reorder(b_x, b_y, xo, yo, xi, yi) + + # Apply the accumulate_multiple_patches knob + if cfg["accumulate_multiple_patches"].val == 0: + axis_for_output = b_x if cfg["exchange_axis"].val else b_y + elif cfg["accumulate_multiple_patches"].val == 1: + axis_for_output = yo if cfg["exchange_axis"].val else xo + else: + axis_for_output = xo if cfg["exchange_axis"].val else yo + + axis_gemm_start = b_y if cfg["exchange_axis"].val else b_x + + # Move the dense_stage_acc stage to the correct axis of the output stage + sch[dense_stage_acc].compute_at(sch[output], axis_for_output) + + # # Split loops to generate the inner dimensions specified by knob tile_zo + xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=env.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=env.DIM) + b_z, zo_o, zi_o = 
cfg["tile_zo"].apply(sch, dense_stage_acc, z_) + + # Apply the exchange_axis knob + if cfg["exchange_axis"].val: + sch[dense_stage_acc].reorder(b_z, xo_o, yo_o, zo_o, xi_o, yi_o, zi_o) + else: + sch[dense_stage_acc].reorder(b_z, yo_o, xo_o, zo_o, yi_o, xi_o, zi_o) + + # Generate knobs to move the copy of data across different loops + axis_to_input_data = [b_x, b_z, xo_o, zo_o] + axis_to_input_weights = [b_y, b_z, yo_o, zo_o] + stages_to_input_data = [output, dense_stage_acc, dense_stage_acc, dense_stage_acc] + cfg.define_knob("axis_for_cdata", [0, 1, 2, 3]) + cfg.define_knob("axis_for_cweight", [0, 1, 2, 3]) + if cfg.is_fallback: + cfg["axis_for_cdata"] = OtherOptionEntity(0) + cfg["axis_for_cweight"] = OtherOptionEntity(0) + + # Compute the move of the bias in the correct loop + sch[bias_op].compute_at(sch[output], axis_for_output) + + # We assert here that the mvin of data does not use more space than the available one in the scratchpad + if cfg["axis_for_cdata"].val == 0: + assert ( + cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] + <= env.INP_SCR_ROWS * env.DIM + ), "Data matrix will not fit in scratchpad!" + elif cfg["axis_for_cdata"].val == 1: + assert ( + cfg["tile_xo"].size[2] * data.shape[1] <= env.INP_SCR_ROWS * env.DIM + ), "Data matrix will not fit in scratchpad!" + if cfg["axis_for_cweight"].val == 0: + assert ( + cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] * weight.shape[0] + <= env.WGT_SCR_ROWS * env.DIM + ), "Weight matrix will not fit in scratchpad!" + elif cfg["axis_for_cweight"].val == 1: + assert ( + cfg["tile_yo"].size[2] * weight.shape[0] <= env.WGT_SCR_ROWS * env.DIM + ), "Weight matrix will not fit in scratchpad!" + + # And here we assert that there is enough place available in the accumulator + if cfg["accumulate_multiple_patches"].val == 0: + assert ( + cfg["tile_xo"].size[1] + * cfg["tile_xo"].size[2] + * cfg["tile_yo"].size[1] + * cfg["tile_yo"].size[2] + <= env.ACC_ROWS * env.DIM + ), "Result matrix will not fit in accumulator!" + elif cfg["accumulate_multiple_patches"].val == 1: + assert ( + cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] + <= env.ACC_ROWS * env.DIM + ), "Result matrix will not fit in accumulator!" 
+ + # Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs + axis_for_cdata = axis_to_input_data[cfg["axis_for_cdata"].val] + axis_for_cweight = axis_to_input_weights[cfg["axis_for_cweight"].val] + sch[cdata].compute_at(sch[stages_to_input_data[cfg["axis_for_cdata"].val]], axis_for_cdata) + sch[cweight].compute_at( + sch[stages_to_input_data[cfg["axis_for_cweight"].val]], axis_for_cweight + ) + + # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN + cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=env.DIM) + cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( + sch[cdata].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + ) + sch[cdata].reorder(cdata_ax_0_1, cdata_ax_1_1, cdata_ax_0_2, cdata_ax_1_2) + + cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=env.DIM) + cweight_ax_1_1, cweight_ax_1_2 = sch[cweight].split( + sch[cweight].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + ) + sch[cweight].reorder(cweight_ax_0_1, cweight_ax_1_1, cweight_ax_0_2, cweight_ax_1_2) + + cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=env.DIM) + cbias_ax_1_1, cbias_ax_1_2 = sch[bias_op].split( + sch[bias_op].op.axis[1], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + ) + sch[bias_op].reorder(cbias_ax_0_1, cbias_ax_1_1, cbias_ax_0_2, cbias_ax_1_2) + + # Mvout preparation + if cfg["exchange_axis"].val: + sch[output].reorder(yo, yi, xo, xi) + else: + sch[output].reorder(xo, xi, yo, yi) + if cfg["accumulate_multiple_patches"].val == 0: + fused_x = sch[output].fuse(xo, xi) + fused_y = sch[output].fuse(yo, yi) + elif cfg["accumulate_multiple_patches"].val == 1: + if cfg["exchange_axis"].val: + fused_x = sch[output].fuse(xo, xi) + fused_y = yi + else: + fused_x = xi + fused_y = sch[output].fuse(yo, yi) + else: + fused_x = xi + fused_y = yi + + fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=env.DIM) + fused_y_1, fused_y_2 = sch[output].split( + fused_y, factor=env.MAX_BLOCK_LEN * env.DIM if cfg["mvout_big_block"].val else env.DIM + ) + sch[output].reorder(fused_x_1, fused_y_1, fused_x_2, fused_y_2) + + # Tag loops with pragmas, in order to insert the move in and move out instructions + sch[cweight].pragma(cweight_ax_0_2, env.B_mvin) + if data.shape[0] == 1 and weight.shape[1] > 1: + sch[cdata].pragma(cdata_ax_0_2, env.A_mvin + "_t") + sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin + "_t") + sch[output].pragma(fused_x_2, env.C_mvout + "_t") + else: + sch[cdata].pragma(cdata_ax_0_2, env.A_mvin) + sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin) + sch[output].pragma(fused_x_2, env.C_mvout) + + # Apply tensorize + I = data.shape[0] if data.shape[0] < env.DIM else cfg["tile_xo"].size[-1] + K = weight.shape[0] if weight.shape[0] < env.DIM else cfg["tile_zo"].size[-1] + J = weight.shape[1] if weight.shape[1] < env.DIM else cfg["tile_yo"].size[-1] + + sch[dense_stage_acc].tensorize( + xi_o if cfg["exchange_axis"].val else yi_o, + env.gemm( + I, + K, + J, + mode=cfg["WS/OS"].val, + accum_patch=tvm.tir.IntImm("uint8", 0) + if cfg["exchange_axis"].val or cfg["tile_zo"].size[1] != 1 + else xo_o.var, + ), + ) + + # Generate configuration dictionary, in order to correctly generate the calls to the configuration instructions + config_dict = {} + config_dict["A_size"] = int(data.shape[1]) + config_dict["B_size"] = int(weight.shape[1]) + config_dict["C_size"] = int(output.shape[1]) + config_dict["A_private_stride"] = env.DIM + 
config_dict["B_private_stride"] = env.DIM + config_dict["execution_stride"] = 1 + config_dict["activation"] = 0 + config_dict["mode"] = cfg["WS/OS"].val + config_dict["max_pixels_per_row"] = 1 + config_dict["scale"] = float(dense_stage.op.attrs["scale"]) + config_dict["padding_value"] = 0 + + # Tag loops with pragmas to delimit the start and end of the Gemmini related code + sch[output].pragma(axis_gemm_start, "gemm_start") + sch[output].pragma(axis_gemm_start, "configs", str(config_dict)) + sch[output].pragma(axis_gemm_start, "gemm_end") + + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py new file mode 100644 index 000000000000..0144563940f9 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIsch, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's CISC instructions +===================== +**Author**: `Federico Peccia `_ +""" + +import math +import sys +import numpy as np +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity + +from tvm.contrib.gemmini.environment import Environment +from tvm.contrib.gemmini.build_module import lower +from tvm.contrib.gemmini.intrin import gemm_cisc + +env = Environment.instance() + + +@autotvm.register_topi_compute("contrib.gemmini.gemm_cisc") +def gemm_cisc( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, + data: tvm.te.tensor.Tensor, + weight: tvm.te.tensor.Tensor, + bias: tvm.te.tensor.Tensor, + scale: float, +) -> tvm.te.tensor.Tensor: + """Computation definition for Gemmini's dense operator using CISC instructions + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + data (tvm.te.tensor.Tensor): Input feature map + weight (tvm.te.tensor.Tensor): Layer weights + bias (tvm.te.tensor.Tensor): Layer biases + scale (float): output scaling factor + + Returns: + tvm.te.tensor.Tensor: dense operator result + """ + + # Derive shapes + ishape = topi.utils.get_const_tuple(data.shape) + wshape = topi.utils.get_const_tuple(weight.shape) + oshape = (data.shape[0], weight.shape[1]) + + # Reduction axes (input channel) + assert ishape[1] == wshape[0] + k_o = te.reduce_axis((0, wshape[0]), name="k_o") + + res = te.compute( + oshape, + lambda x_o, y_o: te.sum( + data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) + + bias[y_o].astype(env.inp_dtype), + axis=[k_o], + ), + name="res", + tag="dense", + attrs={"scale": scale}, + ) + + cfg.add_flop( + (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1]) # element 
+
+    cfg.add_flop(
+        (2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1])  # element multiplications
+        + np.prod(topi.utils.get_const_tuple(oshape))  # bias additions
+    )
+
+    return res
+
+
+@autotvm.register_topi_schedule("contrib.gemmini.gemm_cisc")
+def schedule_gemm_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array
+) -> tvm.te.schedule.Schedule:
+    """Schedule definition for Gemmini's dense operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        outs (tvm.ir.container.Array): Output tensors
+
+    Returns:
+        tvm.te.schedule.Schedule: transformed schedule
+    """
+    assert len(outs) == 1
+    output = outs[0]
+
+    dense_stage = output.op.output(0)
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    sch = te.create_schedule([x.op for x in outs])
+
+    data, weight, bias = dense_stage.op.input_tensors
+
+    # WS/OS
+    # 0: Gemmini will be configured as output stationary
+    # 1: Gemmini will be configured as weight stationary
+    cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY])
+    if cfg.is_fallback:
+        cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY)
+
+    x_, y_ = sch[dense_stage].op.axis
+
+    x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0])
+
+    axis_for_start = x_o
+
+    # Apply tensorization
+    sch[dense_stage].tensorize(
+        x_i,
+        env.gemm_cisc(
+            data.shape, weight.shape, bias.shape, dense_stage.op.attrs["scale"], cfg["WS/OS"].val
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini related code
+    sch[dense_stage].pragma(axis_for_start, "gemm_cisc_start")
+    sch[dense_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
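+
+
+# A minimal sketch of how this compute/schedule pair is typically exercised through
+# AutoTVM (illustrative only: `measure_option` below is a placeholder, and the
+# target string depends on the local Gemmini setup):
+#
+#   task = autotvm.task.create(
+#       "contrib.gemmini.gemm_cisc", args=(data, weight, bias, scale), target="c"
+#   )
+#   tuner = autotvm.tuner.GridSearchTuner(task)
+#   tuner.tune(n_trial=len(task.config_space), measure_option=measure_option)
+#
+# With only the WS/OS knob defined above, the search space has exactly two entries
+# (weight-stationary and output-stationary).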
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py
new file mode 100644
index 000000000000..c67767f783c2
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py
@@ -0,0 +1,227 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+Depthwise conv2d operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.depthwiseconv2d_cisc")
+def depthwise_conv2d_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    orig_data: tvm.te.tensor.Tensor,
+    orig_kernel: tvm.te.tensor.Tensor,
+    bias: tvm.te.tensor.Tensor,
+    strides: tvm.ir.container.Array,
+    padding: tvm.ir.container.Array,
+    ifm_offset: int,
+    activation: int,
+    gemmini_scale: float,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition for Gemmini's depthwise conv2d operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        orig_data (tvm.te.tensor.Tensor): Input feature map
+        orig_kernel (tvm.te.tensor.Tensor): Layer weights
+        bias (tvm.te.tensor.Tensor): Layer biases
+        strides (tvm.ir.container.Array): convolution strides
+        padding (tvm.ir.container.Array): input feature map padding
+        ifm_offset (int): input feature map offset (used for the padding of the input feature map)
+        activation (int): has activation?
+        gemmini_scale (float): output scaling factor
+
+    Returns:
+        tvm.te.tensor.Tensor: depthwise conv2d operator result
+    """
+
+    assert len(orig_data.shape) == 4
+    assert len(orig_kernel.shape) == 3
+    assert len(bias.shape) == 1
+    assert (
+        orig_data.shape[1] == orig_data.shape[2]
+    ), "Gemmini's depthwise conv2d CISC schedule only supports square inputs!"
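+    # The square-input restriction mirrors Gemmini's CISC convolution interface,
+    # which takes a single IN_DIM/OUT_DIM parameter instead of separate height and
+    # width arguments (see the tiled_conv_dw_auto call in gemmini_max_pool2d.py).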
+
+    OC = orig_kernel.shape[0]
+    KH = orig_kernel.shape[1]
+    KW = orig_kernel.shape[2]
+
+    kernel = orig_kernel
+
+    N = orig_data.shape[0]
+    IH = orig_data.shape[1]
+    IW = orig_data.shape[2]
+    IC = orig_data.shape[3]
+
+    HSTR = strides[0]
+    WSTR = strides[1]
+    TOP_PAD = padding[0]
+    LEFT_PAD = padding[1]
+    BOTTOM_PAD = padding[2]
+    RIGHT_PAD = padding[3]
+
+    OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1)
+    OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1)
+
+    if len(set(padding)) == 1 and env.supports_non_zero_padding:
+        # If the padding is the same for all borders, there is no need to use topi.nn.pad,
+        # because Gemmini's CISC instructions support equal padding
+        data = orig_data
+    else:
+        # If not, then pad before calling Gemmini's functions
+        data = topi.nn.pad(
+            orig_data,
+            [0, TOP_PAD, LEFT_PAD, 0],
+            [0, BOTTOM_PAD, RIGHT_PAD, 0],
+            pad_value=ifm_offset,
+            name="pad_data",
+        )
+
+    rkh = te.reduce_axis((0, KH), name="rkh")
+    rkw = te.reduce_axis((0, KW), name="rkw")
+
+    oshape = (N, OH, OW, OC)
+
+    res = te.compute(
+        oshape,
+        lambda b_o, i, j, c_o: te.sum(
+            data[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype)
+            * kernel[c_o, rkh, rkw].astype(env.inp_dtype)
+            + bias[c_o].astype(env.inp_dtype),
+            axis=[rkh, rkw],
+        ),
+        name="res",
+        tag="conv2d",
+        attrs={
+            "activation": activation,
+            "strides": [HSTR, WSTR],
+            "padding": padding,
+            "padding_value": ifm_offset,
+            "scale": gemmini_scale,
+        },
+    )
+
+    cfg.add_flop(
+        np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW  # multiplications needed
+        + np.prod(topi.utils.get_const_tuple(oshape)) * (KH * KW - 1)  # additions needed
+        + np.prod(topi.utils.get_const_tuple(oshape))  # output scaling factor multiplications
+    )
+
+    return res
+
+
+@autotvm.register_topi_schedule("contrib.gemmini.depthwiseconv2d_cisc")
+def schedule_depthwise_conv2d_cisc(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array
+) -> tvm.te.schedule.Schedule:
+    """Schedule definition for Gemmini's depthwise conv2d operator using CISC instructions
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        outs (tvm.ir.container.Array): Output tensors
+
+    Returns:
+        tvm.te.schedule.Schedule: transformed schedule
+    """
+    assert len(outs) == 1
+    output = outs[0]
+    const_ops = []
+    ewise_inputs = []
+    ewise_ops = []
+    conv2d_res = []
+
+    def _traverse(op):
+        if topi.tag.is_broadcast(op.tag):
+            if not op.same_as(output.op):
+                if not op.axis:
+                    const_ops.append(op)
+                else:
+                    ewise_ops.append(op)
+            for tensor in op.input_tensors:
+                if isinstance(tensor.op, tvm.te.PlaceholderOp):
+                    ewise_inputs.append((op, tensor))
+                else:
+                    _traverse(tensor.op)
+        else:
+            if op.tag == "conv2d":
+                conv2d_res.append(op)
+            else:
+                for tensor in op.input_tensors:
+                    _traverse(tensor.op)
+
+    _traverse(output.op)
+    assert len(conv2d_res) == 1
+    conv2d_stage = conv2d_res[0].output(0)
+    sch = te.create_schedule(output.op)
+
+    data, kernel, bias = conv2d_stage.op.input_tensors
+    if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag:
+        temp = data.op.input_tensors[0]
+        pad_data = data
+        data = temp
+    else:
+        pad_data = data
+
+    orig_kernel = kernel
+
+    x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis
+    rkh, rkw = sch[conv2d_stage].op.reduce_axis
+
+    x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0])
+
+    axis_for_start = x_bo_o
+
+    # If topi.nn.pad was added, it's because the padding was not equal in all dimensions.
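+    # For example (illustrative values): with padding == [1, 1, 1, 1] and
+    # env.supports_non_zero_padding set, the input is left unpadded here and the
+    # accelerator applies the padding itself, so [1, 1, 1, 1] is forwarded to the
+    # C code; with padding == [0, 1, 0, 1], topi.nn.pad was applied above and
+    # zeros are forwarded instead.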
+    padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0]
+    padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0
+
+    # Apply tensorization
+    sch[conv2d_stage].tensorize(
+        x_bo_i,
+        env.dw_conv2d_cisc(
+            pad_data.shape,
+            kernel.shape,
+            bias.shape,
+            conv2d_stage.shape,
+            conv2d_stage.op.attrs["strides"],
+            padding_for_C_code,
+            padding_value_for_C_code,
+            conv2d_stage.op.attrs["activation"],
+            conv2d_stage.op.attrs["scale"],
+        ),
+    )
+
+    # Tag loops with pragmas to delimit the start and end of the Gemmini related code
+    sch[conv2d_stage].pragma(axis_for_start, "dw_conv2d_cisc_start")
+    sch[conv2d_stage].pragma(axis_for_start, "gemm_end")
+
+    return sch
diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py
new file mode 100644
index 000000000000..7d922ddd2db4
--- /dev/null
+++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=unused-argument
+"""
+MaxPool2D operator declaration and schedule registration for Gemmini's CISC instructions
+=====================
+**Author**: `Federico Peccia `_
+"""
+
+import numpy as np
+import tvm
+from tvm import te
+from tvm import autotvm
+from tvm import topi
+
+from tvm.contrib.gemmini.environment import Environment
+from tvm.contrib.gemmini.build_module import lower
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity
+from tvm.contrib.gemmini.helpers import get_greater_div
+
+env = Environment.instance()
+
+
+@autotvm.register_topi_compute("contrib.gemmini.max_pool2d")
+def max_pool2d(
+    cfg: tvm.autotvm.task.space.FallbackConfigEntity,
+    data: tvm.te.tensor.Tensor,
+    weights: tvm.te.tensor.Tensor,
+    pool_size: tvm.ir.container.Array,
+    pool_strides: tvm.ir.container.Array,
+    pool_dilation: tvm.ir.container.Array,
+    pool_padding: tvm.ir.container.Array,
+) -> tvm.te.tensor.Tensor:
+    """Computation definition to run a max pooling layer on Gemmini. It uses a
+    trick: a depthwise convolution with fused max pooling is requested, but all
+    the weights are 1, so the depthwise convolution is an identity and the
+    Gemmini accelerator internally applies only the max pooling.
+
+    Args:
+        cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity
+        data (tvm.te.tensor.Tensor): Input feature map
+        weights (tvm.te.tensor.Tensor): Weights...
just all ones, needed by the called function + pool_size (tvm.ir.container.Array): Pooling window size + pool_strides (tvm.ir.container.Array): Pooling window strides + pool_dilation (tvm.ir.container.Array): Pooling window dilation (not used for now) + pool_padding (tvm.ir.container.Array): Pooling window padding + + Returns: + tvm.te.tensor.Tensor: max pool2d operator result + """ + + assert len(data.shape) == 4 + + def irb_builder_func(ins, outs): + irb = tvm.tir.ir_builder.create() + + if env.supports_non_zero_padding: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + else: + irb.emit( + tvm.tir.call_extern( + "", + "tiled_conv_dw_auto", + ins[0].shape[0], # BATCH_SIZE, + ins[0].shape[1], # IN_DIM, + ins[0].shape[3], # IN_CHANNELS, + ins[0].shape[1], # OUT_DIM, + 1, + 0, + 1, + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + 0, + outs[0].access_ptr("w"), + 0, + 1.0, + pool_size[0], + pool_strides[0], + pool_padding[0], + 1, + ) + ) + irb.emit(tvm.tir.call_extern("", "gemmini_fence")) + + return irb.get() + + res = te.extern( + (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" + ) + + # TODO (FP): add correct FLOPS + # cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC) + + return res + + +@autotvm.register_topi_schedule("contrib.gemmini.max_pool2d") +def schedule_max_pool2d( + cfg: tvm.autotvm.task.space.FallbackConfigEntity, outs: tvm.ir.container.Array +) -> tvm.te.schedule.Schedule: + """Schedule definition for Gemmini's max pool2d operator + + Args: + cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity + outs (tvm.ir.container.Array): Output tensors + + Returns: + tvm.te.schedule.Schedule: transformed schedule + """ + assert len(outs) == 1 + output = outs[0] + sch = te.create_schedule(output.op) + return sch diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py new file mode 100644 index 000000000000..6ca41c66d139 --- /dev/null +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -0,0 +1,286 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
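+#
+# Note on registration: each FTVMStrategy hook in this module wraps a TOPI compute
+# so that Relay attributes are unpacked into plain Python arguments before
+# dispatch. Where two implementations are registered for the same op (gemm with
+# plevel=9 and gemm_cisc with plevel=10), the higher plevel wins by default and
+# AutoTVM tuning records can override that choice.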
+# pylint: disable=unused-argument, ungrouped-imports +""" +Namespace for the supported Relay operators on Gemmini +===================== +**Author**: `Federico Peccia `_ +""" + +from __future__ import absolute_import as _abs + +import tvm +from tvm import te +from tvm import autotvm +from tvm import topi + +from tvm.relay.op import op as reg +from tvm.relay.op import strategy as _strategy +from tvm.relay.op.op import OpPattern, OpStrategy + +from .gemmini_dense import gemm, schedule_gemm +from .gemmini_dense_cisc import gemm_cisc, schedule_gemm_cisc +from .gemmini_conv2d_cisc import conv2d_cisc, schedule_conv2d_cisc +from .gemmini_depthwise_conv2d_cisc import depthwise_conv2d_cisc, schedule_depthwise_conv2d_cisc +from .gemmini_add import add, schedule_add +from .gemmini_max_pool2d import max_pool2d, schedule_max_pool2d +from tvm.contrib.gemmini.environment import Environment + +from tvm.topi.utils import const_vector, get_const_int, get_const_float +import numpy as np + +ENV = Environment.instance() + + +def wrap_max_pool2d_topi_compute(topi_compute): + """Wrapper for the max pool2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.max_pool2d", "FTVMStrategy") +def max_pool2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's max_pool2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 2: + strategy = OpStrategy() + strategy.add_implementation( + wrap_max_pool2d_topi_compute(max_pool2d), + _strategy.wrap_topi_schedule(schedule_max_pool2d), + name="contrib.gemmini.max_pool2d", + plevel=10, + ) + return strategy + return None + + +def wrap_add_topi_compute(topi_compute): + """Wrapper for the add compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + ifm1_scale = float(attrs.ifm1_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ifm2_scale = float(attrs.ifm2_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [topi_compute(*inputs, ifm1_scale, ifm2_scale)] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.add", "FTVMStrategy") +def add_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's add operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_add_topi_compute(add), + _strategy.wrap_topi_schedule(schedule_add), + name="contrib.gemmini.add", + plevel=10, + ) + return strategy + return None + + +def wrap_gemm_topi_compute(topi_compute): + """Wrapper for the GEMM compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + ) 
+ ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.gemm", "FTVMStrategy") +def gemm_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's GEMM operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs) == 3: + strategy = OpStrategy() + strategy.add_implementation( + wrap_gemm_topi_compute(gemm), + _strategy.wrap_topi_schedule(schedule_gemm), + name="contrib.gemmini.gemm", + plevel=9, + ) + strategy.add_implementation( + wrap_gemm_topi_compute(gemm_cisc), + _strategy.wrap_topi_schedule(schedule_gemm_cisc), + name="contrib.gemmini.gemm_cisc", + plevel=10, # Higher -> used over the other one, unless AutoTVM says the other is better + ) + return strategy + return None + + +def wrap_conv2d_topi_compute(topi_compute): + """Wrapper for the conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + if attrs.has_activation: + gemmini_scale = float( + attrs.activation_scale_in.data.numpy() / attrs.activation_scale_out.data.numpy() + ) * float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + else: + gemmini_scale = float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()) + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + gemmini_scale, + attrs.pool_size, + attrs.pool_strides, + attrs.pool_dilation, + attrs.pool_padding, + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.conv2d", "FTVMStrategy") +def conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's conv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_conv2d_topi_compute(conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_conv2d_cisc), + name="contrib.gemmini.conv2d_cisc", + plevel=10, + ) + return strategy + return None + + +def wrap_depthwise_conv2d_topi_compute(topi_compute): + """Wrapper for the depthwise conv2d compute + + Args: + topi_compute (function): function to wrap + """ + + def wrapper(attrs, inputs, out_type): + return [ + topi_compute( + *inputs, + attrs.strides, + attrs.padding, + int(attrs.ifm_offset.data.numpy()), + attrs.activation, + float(attrs.bias_scale.data.numpy() / attrs.ofm_scale.data.numpy()), + ) + ] + + return wrapper + + +@tvm.ir.register_op_attr("contrib.gemmini.depthwiseconv2d", "FTVMStrategy") +def depthwise_conv2d_strategy_gemmini(attrs, inputs, out_type, target): + """Strategy implementations for Gemmini's depthwiseconv2d operator + + Args: + attrs (tvm.runtime.object.Object): attributes for the strategy + inputs (tvm.ir.container.Array): inputs + out_type (tvm.ir.tensor_type.TensorType): output type + target (tvm.target.target.Target): target for the strategy + + Returns: + OpStrategy: strategies implementation + """ + if len(inputs[0].shape) == 4: + strategy = 
OpStrategy() + if inputs[0].shape[1] == inputs[0].shape[2]: + strategy.add_implementation( + wrap_depthwise_conv2d_topi_compute(depthwise_conv2d_cisc), + _strategy.wrap_topi_schedule(schedule_depthwise_conv2d_cisc), + name="contrib.gemmini.depthwiseconv2d_cisc", + plevel=10, + ) + return strategy + return None diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 9b0e5748bcc0..c0aa371b4d3d 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -214,6 +214,17 @@ def InjectRollingBuffer(): return _ffi_api.InjectRollingBuffer() # type: ignore +def CorrectGemminisScratchpadAndAccumulatorPointers(): + """Corrects the pointer addresses of buffers inside Gemmini's scratchpad and accumulator + + Returns: + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() + + def StorageRewrite(): """Rewrite storage allocation pattern. diff --git a/src/relay/op/contrib/gemmini/add.cc b/src/relay/op/contrib/gemmini/add.cc new file mode 100644 index 000000000000..b27ad4717d14 --- /dev/null +++ b/src/relay/op/contrib/gemmini/add.cc @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/gemmini/add.cc + * \brief Add operator definition for Gemmini. + * \author Federico Peccia + */ +#include + +#include "../../../qnn/op/op_common.h" +#include "../../../qnn/utils.h" +#include "../../op_common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace gemmini { + +/*! 
\brief Attributes used by the Gemmini Add operators */
+struct GemminiAddAttrs : public tvm::AttrsNode<GemminiAddAttrs> {
+  Expr ifm1_scale;
+  Expr ifm1_offset;
+  Expr ifm2_scale;
+  Expr ifm2_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+  Array<IndexExpr> shape;
+
+  TVM_DECLARE_ATTRS(GemminiAddAttrs, "relay.attrs.GemminiAddAttrs") {
+    TVM_ATTR_FIELD(ifm1_scale).describe("Input feature map 1 quantization scale");
+    TVM_ATTR_FIELD(ifm1_offset).describe("Input feature map 1 quantization offset");
+    TVM_ATTR_FIELD(ifm2_scale).describe("Input feature map 2 quantization scale");
+    TVM_ATTR_FIELD(ifm2_offset).describe("Input feature map 2 quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output feature map quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output feature map quantization offset");
+    TVM_ATTR_FIELD(shape).describe("Output shape");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiAddAttrs);
+
+bool GemminiAddRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  const int ifm1_index = 0;
+  const int ifm2_index = 1;
+  const int result_index = 3;
+  ICHECK_EQ(types.size(), result_index + 1);
+
+  const auto* ifm1 = types[ifm1_index].as<TensorTypeNode>();
+  const auto* ifm2 = types[ifm2_index].as<TensorTypeNode>();
+  ICHECK(ifm1 != nullptr) << "ifm1 cannot be nullptr.";
+  ICHECK(ifm2 != nullptr) << "ifm2 cannot be nullptr.";
+
+  const auto* param = attrs.as<GemminiAddAttrs>();
+  ICHECK(param != nullptr) << "GemminiAddAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  Array<IndexExpr> ofm_shape({ifm1->shape[0], ifm2->shape[1], ifm2->shape[2], ifm2->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiAdd(Expr ifm1, Expr ifm2, Expr ifm1_scale, Expr ifm1_offset, Expr ifm2_scale,
+                    Expr ifm2_offset, Expr ofm_scale, Expr ofm_offset, Array<IndexExpr> shape) {
+  auto attrs = make_object<GemminiAddAttrs>();
+  attrs->ifm1_scale = std::move(ifm1_scale);
+  attrs->ifm1_offset = std::move(ifm1_offset);
+  attrs->ifm2_scale = std::move(ifm2_scale);
+  attrs->ifm2_offset = std::move(ifm2_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+  attrs->shape = std::move(shape);
+
+  static const Op& op = Op::Get("contrib.gemmini.add");
+
+  auto ofm_offset_tensor = Full(attrs->ofm_offset, attrs->shape, DataType::Float(32));
+  auto ifm1_offset_tensor = Multiply(Divide(attrs->ifm1_scale, attrs->ofm_scale),
+                                     Cast(attrs->ifm1_offset, DataType::Float(32)));
+  auto ifm2_offset_tensor = Multiply(Divide(attrs->ifm2_scale, attrs->ofm_scale),
+                                     Cast(attrs->ifm2_offset, DataType::Float(32)));
+  ofm_offset_tensor = Subtract(Subtract(ofm_offset_tensor, ifm1_offset_tensor), ifm2_offset_tensor);
+
+  auto final_offset_tensor = tvm::relay::qnn::RequantizeOrUpcast(
+      ofm_offset_tensor, MakeConstantScalar(DataType::Float(32), 1),
+      MakeConstantScalar(DataType::Float(32), 0), MakeConstantScalar(DataType::Float(32), 1),
+      MakeConstantScalar(DataType::Float(32), 0), attrs->shape, -1);
+
+  auto add_output = Call(op, {ifm1, ifm2, final_offset_tensor}, Attrs(attrs), {});
+  return add_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_add").set_body_typed(MakeGemminiAdd);
+
+RELAY_REGISTER_OP("contrib.gemmini.add")
+    .describe("Gemmini Add operator.")
+    .set_attrs_type<GemminiAddAttrs>()
+    .set_num_inputs(3)
+    .add_argument("ifm1", "Tensor", "The Input 1 Feature Map tensor.")
+    .add_argument("ifm2", "Tensor", "The Input 2 Feature Map tensor.")
+    .add_argument("ofm_offset_tensor", "Tensor", "The output offset tensor.")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiAdd", GemminiAddRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/convolution.cc b/src/relay/op/contrib/gemmini/convolution.cc
new file mode 100644
index 000000000000..1ac0a3ad0df5
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/convolution.cc
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/convolution.cc
+ * \brief 2D convolution operator definition for Gemmini.
+ * \author Federico Peccia
+ */
+#include
+
+#include "../../../qnn/utils.h"
+#include "../../op_common.h"
+
+namespace tvm {
+namespace relay {
+namespace op {
+namespace contrib {
+namespace gemmini {
+
+/*! \brief Attributes used by the Gemmini 2D convolution operator */
+struct GemminiConv2dAttrs : public tvm::AttrsNode<GemminiConv2dAttrs> {
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  double ifm_scale;
+  Expr ifm_offset;
+  double weights_scale;
+  double weights_offset;
+  Expr bias_scale;
+  Expr bias_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+  bool activation;
+  bool has_pool;
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> pool_strides;
+  Array<IndexExpr> pool_dilation;
+  Array<IndexExpr> pool_padding;
+  Expr input_req_offset_out;
+  Expr activation_scale_in;
+  Expr activation_offset_in;
+  Expr activation_scale_out;
+  Expr activation_offset_out;
+  bool has_activation;
+
+  TVM_DECLARE_ATTRS(GemminiConv2dAttrs, "relay.attrs.GemminiConv2dAttrs") {
+    TVM_ATTR_FIELD(strides)
+        .set_default(Array<IndexExpr>({1, 1}))
+        .describe("The 2 dimensional strides as (stride_height, stride_width).");
+    TVM_ATTR_FIELD(padding)
+        .set_default(Array<IndexExpr>({0, 0, 0, 0}))
+        .describe("The 4 dimensional padding.");
+    TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale");
+    TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset");
+    TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale");
+    TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset");
+    TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale");
+    TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset");
+    TVM_ATTR_FIELD(activation)
+        .set_default(false)
+        .describe("If it has a ReLU activation (True) or not (False)");
+    TVM_ATTR_FIELD(has_pool).set_default(false).describe(
+        "If it has a pool layer (True) or not (False)");
+    TVM_ATTR_FIELD(pool_size).describe("Pooling window size");
+    TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides");
+    TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation");
+    TVM_ATTR_FIELD(pool_padding).describe("Pooling padding");
+    TVM_ATTR_FIELD(input_req_offset_out).describe("Requantization output offset");
+    TVM_ATTR_FIELD(activation_scale_in).describe("Activation input scaling factor");
+    TVM_ATTR_FIELD(activation_offset_in).describe("Activation input offset");
+    TVM_ATTR_FIELD(activation_scale_out).describe("Activation output scaling factor");
+    TVM_ATTR_FIELD(activation_offset_out).describe("Activation output offset");
+    TVM_ATTR_FIELD(has_activation).describe("Has activation?");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiConv2dAttrs);
+
+bool GemminiConv2dRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  const int data_index = 0;
+  const int weights_index = 1;
+  const int bias_index = 2;
+  const int result_index = 3;
+
+  const auto* data = types[data_index].as<TensorTypeNode>();
+  const auto* weights = types[weights_index].as<TensorTypeNode>();
+  const auto* bias = types[bias_index].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  if (weights == nullptr) return false;
+  if (bias == nullptr) return false;
+
+  const auto* params = attrs.as<GemminiConv2dAttrs>();
+  ICHECK(params != nullptr) << "GemminiConv2dAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  PrimExpr conv2d_output_h =
+      ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[0]) /
+       params->strides[0]) +
+      1;
+  PrimExpr conv2d_output_w =
+      ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[1]) /
+       params->strides[1]) +
+      1;
+  PrimExpr max_pool2d_h = conv2d_output_h;
+  PrimExpr max_pool2d_w = conv2d_output_w;
+  if (params->has_pool) {
+    max_pool2d_h = ((conv2d_output_h + (params->pool_padding[0] + params->pool_padding[2]) -
+                     params->pool_size[0]) /
+                    params->pool_strides[0]) +
+                   1;
+    max_pool2d_w = ((conv2d_output_w + (params->pool_padding[1] + params->pool_padding[3]) -
+                     params->pool_size[1]) /
+                    params->pool_strides[1]) +
+                   1;
+  }
+  Array<IndexExpr> ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, weights->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiConv2d(Expr data, Expr weights, Expr bias, Array<IndexExpr> strides,
+                       Array<IndexExpr> padding, double ifm_scale, Expr ifm_offset,
+                       double weights_scale, double weights_offset, Expr bias_scale,
+                       Expr bias_offset, Expr ofm_scale, Expr ofm_offset, bool activation,
+                       bool has_pool, Array<IndexExpr> pool_size, Array<IndexExpr> pool_strides,
+                       Array<IndexExpr> pool_dilation, Array<IndexExpr> pool_padding,
+                       Expr input_req_offset_out, bool has_activation, Expr activation_scale_in,
+                       Expr activation_offset_in, Expr activation_scale_out,
+                       Expr activation_offset_out) {
+  auto attrs = make_object<GemminiConv2dAttrs>();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->activation = std::move(activation);
+  attrs->ifm_scale = std::move(ifm_scale);
+  attrs->ifm_offset = std::move(ifm_offset);
+  attrs->weights_scale = std::move(weights_scale);
+  attrs->weights_offset = std::move(weights_offset);
+  attrs->bias_scale = std::move(bias_scale);
+  attrs->bias_offset = std::move(bias_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+  attrs->has_pool = std::move(has_pool);
+  attrs->pool_size = std::move(pool_size);
+  attrs->pool_strides = std::move(pool_strides);
+  attrs->pool_dilation = std::move(pool_dilation);
+  attrs->pool_padding = std::move(pool_padding);
+  attrs->input_req_offset_out = std::move(input_req_offset_out);
+  attrs->activation_scale_in = std::move(activation_scale_in);
+  attrs->activation_offset_in = std::move(activation_offset_in);
+  attrs->activation_scale_out = std::move(activation_scale_out);
+  attrs->activation_offset_out = std::move(activation_offset_out);
+  attrs->has_activation = std::move(has_activation);
+
+  static const Op& op = Op::Get("contrib.gemmini.conv2d");
+
+  // Bias change
+  // Term 3
+  auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {0, 1, 2}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+  auto input_req_bias_term = Multiply(attrs->input_req_offset_out, reduced_t3);
+
+  auto new_bias = Add(Subtract(bias, term3), input_req_bias_term);
+  auto scale_1 = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale_1);
+  new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+
+  if (attrs->has_activation) {
+    auto scale_2 = Divide(attrs->activation_scale_in, attrs->activation_scale_out);
+    auto term_1 = Cast(attrs->activation_offset_in, DataType::Float(32));
+    auto term_2 = Divide(Cast(attrs->activation_offset_out, DataType::Float(32)), scale_2);
+    auto bias_fix = Divide(Subtract(term_2, term_1), scale_1);
+    new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+  }
+
+  auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {});
+  return conv2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_conv2d").set_body_typed(MakeGemminiConv2d);
+
+RELAY_REGISTER_OP("contrib.gemmini.conv2d")
+    .describe("Gemmini 2D convolution operator")
+    .set_attrs_type<GemminiConv2dAttrs>()
+    .set_num_inputs(3)
+    .add_argument("data", "Tensor", "The Input Feature Map tensor.")
+    .add_argument("weights", "Tensor", "The Weights tensor.")
+    .add_argument("bias", "Tensor", "The bias tensor.")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiConv2d", GemminiConv2dRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/depthwise_convolution.cc b/src/relay/op/contrib/gemmini/depthwise_convolution.cc
new file mode 100644
index 000000000000..d9cb264fb514
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/depthwise_convolution.cc
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/depthwise_convolution.cc
+ * \brief 2D depthwise convolution operator definition for Gemmini.
+ * \author Federico Peccia
+ */
+#include
+
+#include "../../../qnn/utils.h"
+#include "../../op_common.h"
+
+namespace tvm {
+namespace relay {
+namespace op {
+namespace contrib {
+namespace gemmini {
+
+/*! \brief Attributes used by the Gemmini 2D depthwise convolution operator */
+struct GemminiDepthwiseConv2dAttrs : public tvm::AttrsNode<GemminiDepthwiseConv2dAttrs> {
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  double ifm_scale;
+  Expr ifm_offset;
+  double weights_scale;
+  double weights_offset;
+  Expr bias_scale;
+  Expr bias_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+  bool activation;
+
+  TVM_DECLARE_ATTRS(GemminiDepthwiseConv2dAttrs, "relay.attrs.GemminiDepthwiseConv2dAttrs") {
+    TVM_ATTR_FIELD(strides)
+        .set_default(Array<IndexExpr>({1, 1}))
+        .describe("The 2 dimensional strides as (stride_height, stride_width).");
+    TVM_ATTR_FIELD(padding)
+        .set_default(Array<IndexExpr>({0, 0, 0, 0}))
+        .describe("The 4 dimensional padding.");
+    TVM_ATTR_FIELD(ifm_scale).set_default(1.0).describe("Input quantization scale");
+    TVM_ATTR_FIELD(ifm_offset).describe("Input quantization offset");
+    TVM_ATTR_FIELD(weights_scale).set_default(1.0).describe("Weights quantization scale");
+    TVM_ATTR_FIELD(weights_offset).set_default(0.0).describe("Weights quantization offset");
+    TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale");
+    TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset");
+    TVM_ATTR_FIELD(activation)
+        .set_default(false)
+        .describe("If it has a ReLU activation (True) or not (False)");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiDepthwiseConv2dAttrs);
+
+bool GemminiDepthwiseConv2dRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                               const TypeReporter& reporter) {
+  const int data_index = 0;
+  const int weights_index = 1;
+  const int bias_index = 2;
+  const int result_index = 3;
+
+  const auto* data = types[data_index].as<TensorTypeNode>();
+  const auto* weights = types[weights_index].as<TensorTypeNode>();
+  const auto* bias = types[bias_index].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  if (weights == nullptr) return false;
+  if (bias == nullptr) return false;
+
+  const auto* params = attrs.as<GemminiDepthwiseConv2dAttrs>();
+  ICHECK(params != nullptr) << "GemminiDepthwiseConv2dAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  Array<IndexExpr> ofm_shape(
+      {data->shape[0],
+       ((data->shape[1] + (params->padding[0] + params->padding[2]) - weights->shape[1]) /
+        params->strides[0]) +
+           1,
+       ((data->shape[2] + (params->padding[1] + params->padding[3]) - weights->shape[2]) /
+        params->strides[1]) +
+           1,
+       weights->shape[0]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiDepthwiseConv2d(Expr data, Expr weights, Expr bias, Array<IndexExpr> strides,
+                                Array<IndexExpr> padding, double ifm_scale, Expr ifm_offset,
+                                double weights_scale, double weights_offset, Expr bias_scale,
+                                Expr bias_offset, Expr ofm_scale, Expr ofm_offset,
+                                bool activation) {
+  auto attrs = make_object<GemminiDepthwiseConv2dAttrs>();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->activation = std::move(activation);
+  attrs->ifm_scale = std::move(ifm_scale);
+  attrs->ifm_offset = std::move(ifm_offset);
+  attrs->weights_scale = std::move(weights_scale);
+  attrs->weights_offset = std::move(weights_offset);
+  attrs->bias_scale = std::move(bias_scale);
+  attrs->bias_offset = std::move(bias_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+
+  static const Op& op = Op::Get("contrib.gemmini.depthwiseconv2d");
+
+  // Bias change
+  // Term 3
+  auto reduced_t3 = Sum(Cast(weights, DataType::Int(32)), {1, 2}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+
+  auto new_bias = Subtract(bias, term3);
+  auto scale = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale);
+  new_bias = Add(new_bias, Cast(bias_fix, DataType::Int(32)));
+
+  auto conv2d_output = Call(op, {data, weights, new_bias}, Attrs(attrs), {});
+  return conv2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_depthwise_conv2d")
+    .set_body_typed(MakeGemminiDepthwiseConv2d);
+
+RELAY_REGISTER_OP("contrib.gemmini.depthwiseconv2d")
+    .describe("Gemmini 2D depthwise convolution operator.")
+    .set_attrs_type<GemminiDepthwiseConv2dAttrs>()
+    .set_num_inputs(3)
+    .add_argument("data", "Tensor", "The Input Feature Map tensor.")
+    .add_argument("weights", "Tensor", "The Weights tensor.")
+    .add_argument("bias", "Tensor", "The bias tensor.")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiDepthwiseConv2d", GemminiDepthwiseConv2dRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/gemm.cc b/src/relay/op/contrib/gemmini/gemm.cc
new file mode 100644
index 000000000000..6002e72aaa41
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/gemm.cc
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/gemm.cc
+ * \brief GEMM operator definition for Gemmini.
+ * \author Federico Peccia
+ */
+#include
+
+#include "../../../qnn/utils.h"
+#include "../../op_common.h"
+
+namespace tvm {
+namespace relay {
+namespace op {
+namespace contrib {
+namespace gemmini {
+
+/*! \brief Attributes used by the Gemmini GEMM operator */
+struct GemminiGEMMAttrs : public tvm::AttrsNode<GemminiGEMMAttrs> {
+  Expr ifm_scale;
+  Expr ifm_offset;
+  Expr bias_scale;
+  Expr bias_offset;
+  Expr ofm_scale;
+  Expr ofm_offset;
+
+  TVM_DECLARE_ATTRS(GemminiGEMMAttrs, "relay.attrs.GemminiGEMMAttrs") {
+    TVM_ATTR_FIELD(ifm_scale).describe("Data quantization scale");
+    TVM_ATTR_FIELD(ifm_offset).describe("Data quantization offset");
+    TVM_ATTR_FIELD(bias_scale).describe("Bias quantization scale");
+    TVM_ATTR_FIELD(bias_offset).describe("Bias quantization offset");
+    TVM_ATTR_FIELD(ofm_scale).describe("Output quantization scale");
+    TVM_ATTR_FIELD(ofm_offset).describe("Output quantization offset");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiGEMMAttrs);
+
+bool GemminiGEMMRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  const int ifm1_index = 0;
+  const int ifm2_index = 1;
+  const int bias_index = 2;
+  const int result_index = 3;
+
+  const auto* ifm1 = types[ifm1_index].as<TensorTypeNode>();
+  const auto* ifm2 = types[ifm2_index].as<TensorTypeNode>();
+  const auto* bias = types[bias_index].as<TensorTypeNode>();
+  if (ifm1 == nullptr) return false;
+  if (ifm2 == nullptr) return false;
+  if (bias == nullptr) return false;
+
+  const auto* param = attrs.as<GemminiGEMMAttrs>();
+  ICHECK(param != nullptr) << "GemminiGEMMAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  Array<IndexExpr> ofm_shape({ifm1->shape[0], ifm2->shape[1]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiGEMM(Expr data, Expr weights, Expr bias, Expr ifm_scale, Expr ifm_offset,
+                     Expr bias_scale, Expr bias_offset, Expr ofm_scale, Expr ofm_offset) {
+  auto attrs = make_object<GemminiGEMMAttrs>();
+  attrs->ifm_scale = std::move(ifm_scale);
+  attrs->ifm_offset = std::move(ifm_offset);
+  attrs->bias_scale = std::move(bias_scale);
+  attrs->bias_offset = std::move(bias_offset);
+  attrs->ofm_scale = std::move(ofm_scale);
+  attrs->ofm_offset = std::move(ofm_offset);
+
+  static const Op& op = Op::Get("contrib.gemmini.gemm");
+
+  auto weights_transposed = MakeTranspose(weights, {1, 0});
+  auto reduced_t3 = Sum(Cast(weights_transposed, DataType::Int(32)), {0}, false, false);
+  auto term3 = Multiply(attrs->ifm_offset, reduced_t3);
+
+  auto scale = Divide(attrs->bias_scale, attrs->ofm_scale);
+  auto bias_fix = Divide(Cast(attrs->ofm_offset, DataType::Float(32)), scale);
+
+  auto new_bias = Add(Subtract(bias, term3), Cast(bias_fix, DataType::Int(32)));
+
+  auto gemm_output = Call(op, {data, weights_transposed, new_bias}, Attrs(attrs), {});
+  return gemm_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_gemm").set_body_typed(MakeGemminiGEMM);
+
+RELAY_REGISTER_OP("contrib.gemmini.gemm")
+    .describe("Gemmini GEMM operator")
+    .set_attrs_type<GemminiGEMMAttrs>()
+    .set_num_inputs(3)
+    .add_argument("ifm1", "Tensor", "The Input Feature Map tensor.")
+    .add_argument("ifm2", "Tensor", "The Weights tensor.")
+    .add_argument("bias", "Tensor", "The bias tensor")
+    .set_support_level(11)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .add_type_rel("GemminiGEMM", GemminiGEMMRel);
+
+}  // namespace gemmini
+}  // namespace contrib
+}  // namespace op
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/contrib/gemmini/max_pool2d.cc b/src/relay/op/contrib/gemmini/max_pool2d.cc
new file mode 100644
index 000000000000..2e435ceea875
--- /dev/null
+++ b/src/relay/op/contrib/gemmini/max_pool2d.cc
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/op/contrib/gemmini/max_pool2d.cc
+ * \brief 2D max pool operator definition for Gemmini.
+ * \author Federico Peccia
+ */
+#include
+
+#include "../../../qnn/utils.h"
+#include "../../op_common.h"
+
+namespace tvm {
+namespace relay {
+namespace op {
+namespace contrib {
+namespace gemmini {
+
+/*! \brief Attributes used by the Gemmini 2D max pooling operator */
+struct GemminiMaxPool2DAttrs : public tvm::AttrsNode<GemminiMaxPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> pool_strides;
+  Array<IndexExpr> pool_dilation;
+  Array<IndexExpr> pool_padding;
+  Array<IndexExpr> shape;
+
+  TVM_DECLARE_ATTRS(GemminiMaxPool2DAttrs, "relay.attrs.GemminiMaxPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size).describe("Pooling window size");
+    TVM_ATTR_FIELD(pool_strides).describe("Pooling window strides");
+    TVM_ATTR_FIELD(pool_dilation).describe("Pooling window dilation");
+    TVM_ATTR_FIELD(pool_padding).describe("Pooling padding");
+    TVM_ATTR_FIELD(shape).describe("Input shape");
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(GemminiMaxPool2DAttrs);
+
+bool GemminiMaxPool2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                         const TypeReporter& reporter) {
+  const int data_index = 0;
+  const int result_index = 2;
+
+  const auto* data = types[data_index].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const auto* params = attrs.as<GemminiMaxPool2DAttrs>();
+  ICHECK(params != nullptr) << "GemminiMaxPool2DAttrs cannot be nullptr.";
+
+  DataType ofm_dtype = DataType::Int(8);
+
+  // Assign ofm type
+  PrimExpr max_pool2d_h = ((data->shape[1] + (params->pool_padding[0] + params->pool_padding[2]) -
+                            params->pool_size[0]) /
+                           params->pool_strides[0]) +
+                          1;
+  PrimExpr max_pool2d_w = ((data->shape[2] + (params->pool_padding[1] + params->pool_padding[3]) -
+                            params->pool_size[1]) /
+                           params->pool_strides[1]) +
+                          1;
+  Array<IndexExpr> ofm_shape({data->shape[0], max_pool2d_h, max_pool2d_w, data->shape[3]});
+  reporter->Assign(types[result_index], TensorType(ofm_shape, ofm_dtype));
+  return true;
+}
+
+Expr MakeGemminiMaxPool2D(Expr data, Array<IndexExpr> pool_size, Array<IndexExpr> pool_strides,
+                          Array<IndexExpr> pool_dilation, Array<IndexExpr> pool_padding,
+                          Array<IndexExpr> shape) {
+  auto attrs = make_object<GemminiMaxPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->pool_strides = std::move(pool_strides);
+  attrs->pool_dilation = std::move(pool_dilation);
+  attrs->pool_padding = std::move(pool_padding);
+  attrs->shape = std::move(shape);
+
+  static const Op& op = Op::Get("contrib.gemmini.max_pool2d");
+
+  // Trick to accelerate the max pooling operation using Gemmini's depthwise
+  // convolution function: an all-ones 1x1 depthwise kernel makes the convolution
+  // an identity, so only the fused max pooling has an effect.
+  auto weights =
+      Full(MakeConstantScalar(DataType::Int(8), 1), {attrs->shape[3], 1, 1}, DataType::Int(8));
+
+  auto max_pool2d_output = Call(op, {data, weights}, Attrs(attrs), {});
+
+  return max_pool2d_output;
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.gemmini_max_pool2d").set_body_typed(MakeGemminiMaxPool2D);
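+
+// Worked example for GemminiMaxPool2DRel (illustrative values): an NHWC input of
+// (1, 112, 112, 32) with pool_size (2, 2), pool_strides (2, 2) and zero padding
+// yields (112 + 0 + 0 - 2) / 2 + 1 = 56 per spatial axis, i.e. (1, 56, 56, 32).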
+RELAY_REGISTER_OP("contrib.gemmini.max_pool2d") + .describe("Gemmini 2D max pooling operator") + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "Tensor", "The Input Feature Map tensor.") + .add_argument("weights", "Tensor", "The Weights dummy tensor.") + .set_support_level(11) + .set_attr("TOpPattern", kOpaque) + .add_type_rel("GemminiMaxPool2D", GemminiMaxPool2DRel); + +} // namespace gemmini +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc index c8c099171c96..d827aac35647 100644 --- a/src/target/metadata_module.cc +++ b/src/target/metadata_module.cc @@ -229,7 +229,8 @@ runtime::Module CreateMetadataModule( // TODO(@manupa-arm) : we should be able to use csource_metadata // if the variables are empty when all the runtime modules implement get_func_names if (symbol_const_vars.empty() && is_targeting_crt && mod->IsDSOExportable() && - (target->kind->name == "c" || target->kind->name == "llvm")) { + (target->kind->name == "c" || target->kind->name == "llvm" || + target->kind->name == "gemmini")) { crt_exportable_modules.push_back(mod); } else { non_crt_exportable_modules.push_back(mod); diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index a47158d37883..84fc9bb9dac9 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -64,6 +64,9 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s decl_stream << "#include \n"; decl_stream << "#include \n"; } + if (target_str.find("gemmini") != std::string::npos) { + decl_stream << "#include \"gemmini_testutils.h\"\n"; + } CodeGenC::Init(output_ssa); } diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index a8d8936c905a..43d50306be45 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -39,7 +39,8 @@ LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { - ICHECK(vdtype.is_handle()); + // TODO (FP): Is this check really necessary? + // ICHECK(vdtype.is_handle()); } else { ICHECK_EQ(value.dtype(), var.dtype()); } diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc new file mode 100644 index 000000000000..d73f6b9b63ca --- /dev/null +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \brief Correct pointer addresses in the scratchpad and accumulator of Gemmini
+ * \file inject_gemmini_pointer_correction.cc
+ * \author Federico Peccia
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "../../runtime/thread_storage_scope.h"
+#include "ir_utils.h"
+
+namespace tvm {
+namespace tir {
+
+struct CorrectGemminisScratchpadAndAccumulatorPointersConfigNode
+    : public tvm::AttrsNode<CorrectGemminisScratchpadAndAccumulatorPointersConfigNode> {
+  int dim;
+
+  TVM_DECLARE_ATTRS(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode,
+                    "tir.transform.CorrectGemminisScratchpadAndAccumulatorPointersConfig") {
+    TVM_ATTR_FIELD(dim).describe("Systolic array DIM").set_default(16);
+  }
+};
+
+class CorrectGemminisScratchpadAndAccumulatorPointersConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(
+      CorrectGemminisScratchpadAndAccumulatorPointersConfig, Attrs,
+      CorrectGemminisScratchpadAndAccumulatorPointersConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(CorrectGemminisScratchpadAndAccumulatorPointersConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.CorrectGemminisScratchpadAndAccumulatorPointers",
+                                CorrectGemminisScratchpadAndAccumulatorPointersConfig);
+
+class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprMutator {
+ public:
+  explicit CorrectGemminisScratchpadAndAccumulatorPointersInjector(int dim) : dim_(dim) {}
+
+  Stmt Inject(Stmt stmt) { return this->VisitStmt(stmt); }
+
+  PrimExpr VisitExpr_(const CallNode* op) final {
+    // Rewrite tvm_access_ptr offsets for buffers placed in Gemmini's local
+    // scratchpad/accumulator: TIR computes offsets over DIM-wide rows, while
+    // Gemmini addresses whole rows, so the row-aligned part of the offset
+    // has to be divided by DIM.
+    auto node = Downcast<Call>(StmtExprMutator::VisitExpr_(op));
+    if (node->op.same_as(builtin::tvm_access_ptr())) {
+      const VarNode* buffer = node->args[1].as<VarNode>();
+
+      if (std::string(buffer->name_hint).find("local") != std::string::npos) {
+        PrimExpr offset = this->VisitExpr(node->args[2]);
+        PrimExpr extent = this->VisitExpr(node->args[3]);
+
+        const auto* ptr_type = buffer->type_annotation.as<PointerTypeNode>();
+        ICHECK(ptr_type) << "The provided variable is not of pointer type";
+        auto scope = ptr_type->storage_scope;
+        auto info = GetMemoryInfo(scope);
+        ICHECK(info.defined()) << "Cannot find memory info of " << scope;
+        DataType dtype = Downcast<PrimType>(ptr_type->element_type)->dtype;
+        int dtype_bits = dtype.bits() * dtype.lanes();
+
+        int div = dim_;
+        const IntImmNode* extent_int = extent.as<IntImmNode>();
+
+        PrimExpr inner_offset = indexmod(offset, extent);
+        PrimExpr outer_offset = offset - inner_offset;
+        PrimExpr outer_offset_corrected = indexdiv(outer_offset, div);
+        PrimExpr offset_corrected = outer_offset_corrected + inner_offset;
+
+        return Call(node->dtype, node->op,
+                    {node->args[0], node->args[1], offset_corrected, extent, node->args[4]});
+      }
+    }
+    return StmtExprMutator::VisitExpr_(op);
+  }
+
+ private:
+  int dim_;
+};
+
+namespace transform {
+
+Pass CorrectGemminisScratchpadAndAccumulatorPointers() {
+  auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
+    auto* n = f.CopyOnWrite();
+    auto cfg = ctx->GetConfig<CorrectGemminisScratchpadAndAccumulatorPointersConfig>(
+        "tir.CorrectGemminisScratchpadAndAccumulatorPointers");
+    if (!cfg.defined()) {
+      cfg = AttrsWithDefaultValues<CorrectGemminisScratchpadAndAccumulatorPointersConfig>();
+    }
+    n->body = CorrectGemminisScratchpadAndAccumulatorPointersInjector(cfg.value()->dim)
+                  .Inject(std::move(n->body));
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.CorrectGemminisScratchpadAndAccumulatorPointers",
+                            {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.CorrectGemminisScratchpadAndAccumulatorPointers")
+    .set_body_typed(CorrectGemminisScratchpadAndAccumulatorPointers);
+
+}  // namespace transform
+
+}  // namespace tir
+}
// namespace tvm From 6108374975ad86ad086c42017004c298c1999614 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:09:03 +0100 Subject: [PATCH 023/286] Merged Makefiles into template, added cmake Gemmini config, moved tutorials to gallery --- apps/microtvm/gemmini/README.md | 2 +- .../template_project/microtvm_api_server.py | 122 +----- .../add/Makefile => Makefile.template} | 8 - .../src/makefiles/conv2d/Makefile | 68 --- .../src/makefiles/dense/Makefile | 68 --- .../src/makefiles/dwconv2d/Makefile | 68 --- .../src/makefiles/maxpool2d/Makefile | 68 --- .../src/makefiles/mobilenet/Makefile | 68 --- cmake/config.cmake | 3 + cmake/modules/contrib/Gemmini.cmake | 14 +- .../micro_gemmini/README.txt | 5 + .../micro_gemmini/micro_gemmini_add.py | 234 +++++++++++ .../micro_gemmini/micro_gemmini_conv2d.py | 215 ++++++++++ .../micro_gemmini/micro_gemmini_dense.py | 214 ++++++++++ .../micro_gemmini/micro_gemmini_dwconv2d.py | 207 +++++++++ .../micro_gemmini/micro_gemmini_maxpool2d.py | 211 ++++++++++ .../micro_gemmini/micro_gemmini_mobilenet.py | 262 ++++++++++++ .../networks/mobilenet-tutorial.ipynb | 311 -------------- .../tutorials/networks/mobilenet_utils.py | 138 ------ .../single_operators/add-tutorial.ipynb | 395 ------------------ .../single_operators/conv2d-tutorial.ipynb | 378 ----------------- .../single_operators/dense-tutorial.ipynb | 378 ----------------- .../single_operators/dwconv2d-tutorial.ipynb | 373 ----------------- .../single_operators/maxpool2d-tutorial.ipynb | 378 ----------------- 24 files changed, 1370 insertions(+), 2818 deletions(-) rename apps/microtvm/gemmini/template_project/src/{makefiles/add/Makefile => Makefile.template} (82%) delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile delete mode 100644 apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/README.txt create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py create mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py delete mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb delete mode 100644 python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md index 11fea3415b70..9b4c45716062 100644 --- 
a/apps/microtvm/gemmini/README.md
+++ b/apps/microtvm/gemmini/README.md
@@ -1,3 +1,3 @@
 This directory contains code to create code for the Gemmini accelerator using microTVM. These tests are then executed on the Spike RISC-V ISA simulator.
-In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the Chipyard repository.
+In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found in the [Chipyard](https://chipyard.readthedocs.io/en/stable/) repository. Instructions for also installing the patch of the Spike simulator that adds the Gemmini functional simulator can be found in the [Gemmini](https://github.com/ucb-bar/gemmini) repository.
diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py
index f4d4f7eb5e89..85971316ec4e 100644
--- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py
+++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py
@@ -109,6 +109,14 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type):
                 shutil.copytree(item, dest)
             else:
                 shutil.copy2(item, dest)
+
+        shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile")
+
+        test_name = project_type.replace("_example", "")
+        new_line = f"tests = {test_name}\n"
+        with open(project_dir / "src" / "Makefile", "r") as original:
+            data = original.read()
+        with open(project_dir / "src" / "Makefile", "w") as modified:
+            modified.write(new_line + data)
+
 
 CRT_COPY_ITEMS = ("include", "src")
 
@@ -122,18 +130,6 @@ def _copy_standalone_crt(self, source_dir, standalone_crt_dir):
             else:
                 shutil.copy2(src_path, dst_path)
 
-    # Example project is the "minimum viable project",
-    # and doesn't need a fancy RPC server
-    EXAMPLE_PROJECT_UNUSED_COMPONENTS = []
-
-    def _remove_unused_components(self, source_dir, project_type):
-        unused_components = []
-        if project_type == "example_project":
-            unused_components = self.EXAMPLE_PROJECT_UNUSED_COMPONENTS
-
-        for component in unused_components:
-            shutil.rmtree(source_dir / "standalone_crt" / component)
-
     def _disassemble_mlf(self, mlf_tar_path, source_dir):
         with tempfile.TemporaryDirectory() as mlf_unpacking_dir_str:
             mlf_unpacking_dir = pathlib.Path(mlf_unpacking_dir_str)
@@ -158,48 +154,12 @@ def _disassemble_mlf(self, mlf_tar_path, source_dir):
             metadata = json.load(f)
         return metadata
 
-    def _template_model_header(self, source_dir, metadata):
-        with open(source_dir / "model.h", "r") as f:
-            model_h_template = Template(f.read())
-
-        assert (
-            metadata["style"] == "full-model"
-        ), "when generating AOT, expect only full-model Model Library Format"
-
-        template_values = {
-            "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][
-                "workspace_size_bytes"
-            ],
-        }
-
-        with open(source_dir / "model.h", "w") as f:
-            f.write(model_h_template.substitute(template_values))
-
-    # Arduino ONLY recognizes .ino, .ccp, .c, .h
-    CPP_FILE_EXTENSION_SYNONYMS = ("cc", "cxx")
-
-    def _change_cpp_file_extensions(self, source_dir):
-        for ext in self.CPP_FILE_EXTENSION_SYNONYMS:
-            for filename in source_dir.rglob(f"*.{ext}"):
-                filename.rename(filename.with_suffix(".cpp"))
-
-        for filename in source_dir.rglob(f"*.inc"):
-            filename.rename(filename.with_suffix(".h"))
-
     def _convert_includes(self, project_dir, source_dir):
         """Changes all #include statements in project_dir to be relevant to their containing file's location.
- Arduino only supports includes relative to a file's location, so this
- function finds each time we #include a file and changes the path to
- be relative to the file location. Does not do this for standard C
- libraries. Also changes angle brackets syntax to double quotes syntax.
-
- See Also
- -----
- https://www.arduino.cc/reference/en/language/structure/further-syntax/include/
         """
 
         for ext in ("c", "h", "cpp"):
             for filename in source_dir.rglob(f"*.{ext}"):
@@ -260,45 +220,6 @@ def _find_modified_include_path(self, project_dir, file_path, include_path):
         # It's probably a standard C/C++ header
         return include_path
 
-    def _copy_standalone_crt_makefiles(self, api_server_dir, source_dir):
-        print(source_dir)
-        shutil.copy2(
-            api_server_dir / "src/example_project/Makefile",
-            source_dir,
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/Makefile.in",
-            source_dir,
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/Makefrag",
-            source_dir,
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/build.sh",
-            source_dir,
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/configure.ac",
-            source_dir,
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/include/gemmini_nn.h",
-            source_dir / "include/gemmini_nn.h",
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/include/gemmini_testutils.h",
-            source_dir / "include/gemmini_testutils.h",
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/include/gemmini.h",
-            source_dir / "include/gemmini.h",
-        )
-        shutil.copy2(
-            api_server_dir / "src/example_project/rocc-software/src/xcustom.h",
-            source_dir / "rocc-software/src/xcustom.h",
-        )
-
     def _copy_debug_data_files(self, project_dir):
         if os.path.isdir(str(project_dir / ".." / "include")):
             copy_tree(str(project_dir / ".."
/ "include"), str(project_dir / "src" / "model")) @@ -317,7 +238,6 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # Copy standalone_crt into src folder self._copy_standalone_crt(source_dir, standalone_crt_dir) - self._remove_unused_components(source_dir, options["project_type"]) # Populate crt-config.h crt_config_dir = project_dir / "src" / "standalone_crt" / "crt_config" @@ -327,47 +247,27 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec ) # Unpack the MLF and copy the relevant files - # extract_path = os.path.splitext(model_library_format_path)[0] - # with tarfile.TarFile(model_library_format_path) as tf: - # os.makedirs(project_dir / MODEL_LIBRARY_FORMAT_RELPATH) - # tf.extractall(path=project_dir / MODEL_LIBRARY_FORMAT_RELPATH) metadata = self._disassemble_mlf(model_library_format_path, source_dir) shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) self._copy_debug_data_files(project_dir) - # For AOT, template model.h with metadata to minimize space usage - # if options["project_type"] == "example_project": - # self._template_model_header(source_dir, metadata) - - # Copy makefiles to treat standalone crt code as RIOT modules - # self._copy_standalone_crt_makefiles(API_SERVER_DIR, source_dir) - - self._change_cpp_file_extensions(source_dir) # Recursively change includes self._convert_includes(project_dir, source_dir) def build(self, options): subprocess.call( - "source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",), + "cd src && ./build.sh", shell=True, - executable="/bin/bash", ) - # os.system("source %s && cd src && ./build.sh" % (os.environ["CHIPYARD_HOME"] + "/env.sh",)) def flash(self, options): test_name = options["project_type"].split("_")[0] subprocess.call( - "source %s && cd src/build && spike --extension=gemmini %s" - % (os.environ["CHIPYARD_HOME"] + "/env.sh", test_name + "-baremetal"), + "cd src/build && spike --extension=gemmini %s" + % (test_name + "-baremetal",), shell=True, - executable="/bin/bash", ) - # os.system("source %s && cd src/build && spike --extension=gemmini %s" % (os.environ["CHIPYARD_HOME"] + "/env.sh",test_name + "-baremetal",)) - # if logging.root.level == logging.DEBUG: - # os.system("cd src/build && spike --extension=gemmini ") - # else: - # os.system("cd src && make flash -s > /dev/null") def open_transport(self, options): pass diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile.template similarity index 82% rename from apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile rename to apps/microtvm/gemmini/template_project/src/Makefile.template index 2c997cea1a80..9368836a8802 100644 --- a/apps/microtvm/gemmini/template_project/src/makefiles/add/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile.template @@ -1,8 +1,5 @@ include $(abs_top_srcdir)/Makefrag -tests = \ - add \ - tests_baremetal = $(tests:=-baremetal) ifeq ($(findstring spike,$(RUNNER)),spike) @@ -53,11 +50,6 @@ vpath %.c $(src_dir) %-baremetal: %.c $(GEMMINI_HEADERS) $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard 
$(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) run-baremetal: $(runs_baremetal) diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile deleted file mode 100644 index f80da67c3f98..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/conv2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - conv2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile deleted file mode 100644 index 0b1932ceef91..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/dense/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - dense \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common 
-GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile deleted file mode 100644 index fa89e5be162d..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - dwconv2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard 
$(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile deleted file mode 100644 index 1218e9e67a96..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - maxpool2d \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal))) -endif - -RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests -BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common -GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h -STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt - -CFLAGS := $(CFLAGS) \ - -DPREALLOCATE=1 \ - -DMULTITHREAD=1 \ - -mcmodel=medany \ - -std=gnu99 \ - -O2 \ - -ffast-math \ - -fno-common \ - -fno-builtin-printf \ - -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ - -lgcc \ - -I${RISCV_TESTS} \ - -I${RISCV_TESTS}/env \ - -I$(abs_top_srcdir) \ - -I$(abs_top_srcdir)/include \ - -I$(BENCH_COMMON) \ - -DID_STRING=$(ID_STRING) \ - -DPRINT_TILE=0 \ - -CFLAGS_BAREMETAL := \ - $(CFLAGS) \ - -nostdlib \ - -nostartfiles \ - -static \ - -T $(BENCH_COMMON)/test.ld \ - -DBAREMETAL=1 \ - -all: $(tests_baremetal) - -vpath %.c $(src_dir) - -%-baremetal: %.c $(GEMMINI_HEADERS) - $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -# $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \ - $(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \ - $(LIBS) - -run-baremetal: $(runs_baremetal) - -%-baremetal.run: %-baremetal - $(RUNNER)$(abs_top_srcdir)/build/$^ - -junk += $(tests_baremetal) - diff --git a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile b/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile deleted file mode 100644 index b6d977550097..000000000000 --- a/apps/microtvm/gemmini/template_project/src/makefiles/mobilenet/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -include $(abs_top_srcdir)/Makefrag - -tests = \ - mobilenet \ - -tests_baremetal = $(tests:=-baremetal) - -ifeq ($(findstring spike,$(RUNNER)),spike) -# Currently don't support conv or conv-with-pool on spike -runs_baremetal = $(addsuffix .run,$(filter-out conv-baremetal conv_with_pool-baremetal,$(tests_baremetal))) -else -# Don't run very long benchmarks for RTL sim -runs_baremetal = $(addsuffix .run,$(filter-out 
tiled_matmul_cpu-baremetal tiled_matmul_option-baremetal,$(tests_baremetal)))
-endif
-
-RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tests
-BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common
-GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h
-STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt
-
-CFLAGS := $(CFLAGS) \
-	-DPREALLOCATE=1 \
-	-DMULTITHREAD=1 \
-	-mcmodel=medany \
-	-std=gnu99 \
-	-O2 \
-	-ffast-math \
-	-fno-common \
-	-fno-builtin-printf \
-	-march=rv64gc -Wa,-march=rv64gcxhwacha \
-	-lm \
-	-lgcc \
-	-I${RISCV_TESTS} \
-	-I${RISCV_TESTS}/env \
-	-I$(abs_top_srcdir) \
-	-I$(abs_top_srcdir)/include \
-	-I$(BENCH_COMMON) \
-	-DID_STRING=$(ID_STRING) \
-	-DPRINT_TILE=0 \
-
-CFLAGS_BAREMETAL := \
-	$(CFLAGS) \
-	-nostdlib \
-	-nostartfiles \
-	-static \
-	-T $(BENCH_COMMON)/test.ld \
-	-DBAREMETAL=1 \
-
-all: $(tests_baremetal)
-
-vpath %.c $(src_dir)
-
-%-baremetal: %.c $(GEMMINI_HEADERS)
-	$(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \
-		$(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS)
-# 	$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/aot_executor_module/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/common/*.c) \
-		$(wildcard $(abs_top_srcdir)/standalone_crt/src/runtime/crt/memory/*.c) \
-		$(LIBS)
-
-run-baremetal: $(runs_baremetal)
-
-%-baremetal.run: %-baremetal
-	$(RUNNER)$(abs_top_srcdir)/build/$^
-
-junk += $(tests_baremetal)
-
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 679f5c459e87..5a93f9db652b 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -285,6 +285,9 @@ set(USE_ANTLR OFF)
 # Whether use Relay debug mode
 set(USE_RELAY_DEBUG OFF)
 
+# Whether to build the microTVM Gemmini integration
+set(USE_GEMMINI OFF)
+
 # Whether to build fast VTA simulator driver
 set(USE_VTA_FSIM OFF)
 
diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake
index 4b73d183ddc1..0d224c74ea75 100644
--- a/cmake/modules/contrib/Gemmini.cmake
+++ b/cmake/modules/contrib/Gemmini.cmake
@@ -1,4 +1,4 @@
-if(USE_MICRO)
+if(USE_GEMMINI)
   message(STATUS "Add Gemmini for microTVM")
 
   function(microtvm_add_gemmini)
@@ -10,7 +10,7 @@
 
     # Dense example project generation
     "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example"
-    "apps/microtvm/gemmini/template_project/src/makefiles/dense Makefile -> gemmini/src/dense_example"
+    "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dense_example"
     "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example"
     "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example"
     "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example"
@@ -20,7 +20,7 @@
 
    # CONV2D example project generation
     "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example"
-    "apps/microtvm/gemmini/template_project/src/makefiles/conv2d Makefile -> gemmini/src/conv2d_example"
+    "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/conv2d_example"
     "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example"
     "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example"
"3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" @@ -30,7 +30,7 @@ if(USE_MICRO) # DW CONV2D example project generation "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src/makefiles/dwconv2d Makefile -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" @@ -40,7 +40,7 @@ if(USE_MICRO) # ADD example project generation "apps/microtvm/gemmini/template_project/src add.c -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src/makefiles/add Makefile -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" @@ -50,7 +50,7 @@ if(USE_MICRO) # Max pooling 2d example project generation "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src/makefiles/maxpool2d Makefile -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" @@ -60,7 +60,7 @@ if(USE_MICRO) # Mobilenet example project generation "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src/makefiles/mobilenet Makefile -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt new file mode 100644 index 000000000000..6826cc7ab810 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt @@ -0,0 +1,5 @@ +.. _tutorial-micro-gemmini: + +Generate code for the Gemmini accelerator using microTVM +------------------ +These how-tos demonstrate how to deploy models for the Gemmini accelerator using microTVM. diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py new file mode 100644 index 000000000000..b3fe3c5bb3a0 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single add layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized add layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+Note: This is an **experimental** layer!
+"""
+
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source $CHIPYARD_HOME/env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.

+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we will later compare against the actual output obtained after running on the Gemmini accelerator.

+# Now we define the parameters of the layer we want to test. In this case:
+input_height = 16
+input_width = 16
+input_channels = 16
+activation = 0
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
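+# Concretely, the "specific flags" are the converter settings used below:
+# integer-only operations (TFLITE_BUILTINS_INT8), uint8 inputs and int8
+# outputs, a representative dataset for calibration, and per-tensor (not
+# per-channel) quantization via _experimental_disable_per_channel.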
+class Model(tf.Module): + def __init__(self, name=None): + super().__init__(name) + + @tf.function( + input_signature=[ + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + tf.TensorSpec( + shape=[1, input_height, input_width, input_channels], + dtype=tf.float32, + ), + ] + ) + def add(self, x, y): + if activation == 0: + return x + y + else: + return layers.Activation("relu")(x + y) + +model = Model() + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + +def representative_data_gen(): + dataset = [ + ( + np.array( + np.random.randint( + -127, 128, size=(1, input_height, input_width, input_channels) + ), + dtype=np.float32, + ), + np.array( + np.random.randint( + 0, 128, size=(1, input_height, input_width, input_channels) + ), + dtype=np.float32, + ), + ) + for s in range(100) + ] + for input_value in dataset: + yield [input_value[0], input_value[1]] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +with open("add.tflite", "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. + +os.system("rm -rf model.tar dev/ include/ generated-project/") + +tflite_file = "./add.tflite" +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) + +interpreter.set_tensor(input_details[0]["index"], input_matrix_1) +interpreter.set_tensor(input_details[1]["index"], input_matrix_2) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. +gemmini.create_header_file("inputs", "data", "input_1", input_matrix_2, "./include") +gemmini.create_header_file("inputs", "data", "input_2", input_matrix_1, "./include") +gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. 
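+
+# Before handing the model to TVM, we can optionally double-check that the
+# converter really produced an integer-only model. This is a sketch of a
+# sanity check and not part of the Gemmini API; it only uses the TFLite
+# interpreter created above.
+for detail in tensor_details:
+    if np.dtype(detail["dtype"]) not in (np.dtype(np.uint8), np.dtype(np.int8), np.dtype(np.int32)):
+        print("Warning: non-integer tensor left in the model:", detail["name"], detail["dtype"])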
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={"serving_default_x": (1, input_height, input_width, input_channels), "serving_default_y": (1, input_height, input_width, input_channels)},
+    dtype_dict={"serving_default_x": input_dtype, "serving_default_y": input_dtype},
+)
+mod = relay.transform.InferType()(mod)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare-metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# ----------------------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard tools, and then test the generated baremetal code on the Spike simulator.

+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "add_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
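+# (A deviation of at most one least-significant bit between the Gemmini output
+# and the TFLite baseline is typical requantization rounding rather than a real
+# failure. As a hypothetical host-side check, if both arrays were available:
+# np.abs(gemmini_out.astype(np.int32) - expected_output.astype(np.int32)).max() <= 1,
+# where gemmini_out is a placeholder name for the output read back from Spike.)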
+#generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py
new file mode 100644
index 000000000000..18bca38eafa0
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py
@@ -0,0 +1,215 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single 2D convolutional layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized 2D convolution layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+"""
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source $CHIPYARD_HOME/env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.

+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we will later compare against the actual output obtained after running on the Gemmini accelerator.

+# Now we define the parameters of the layer we want to test. In this case:
+input_height = 16
+input_width = 16
+input_channels = 16
+output_channels = 16
+kernel_size = 3
+stride = 1
+padding = 'valid'
+activation = None
+bias = True
+
+# We can add a max pooling layer after the convolution. This can be merged by the integration and executed together with the convolution on the Gemmini accelerator.
+pool_size = 1
+pool_stride = 1
+pool_padding = 'valid'
+use_pool = False
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
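+
+# As a quick sanity check of the parameters above: with 'valid' padding and no
+# pooling, each spatial dimension of the convolution output is
+# (size - kernel_size) // stride + 1, so we expect a 1 x 14 x 14 x 16 result.
+# This is plain arithmetic, not part of the Gemmini API.
+out_h = (input_height - kernel_size) // stride + 1
+out_w = (input_width - kernel_size) // stride + 1
+print("Expected convolution output shape:", (1, out_h, out_w, output_channels))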
+
+layer_sequence = [
+    layers.Conv2D(
+        output_channels,
+        kernel_size=kernel_size,
+        padding=padding,
+        activation=activation,
+        use_bias=True,
+        bias_initializer="ones",
+        input_shape=(input_height, input_width, input_channels),
+        strides=stride,
+    )
+]
+if use_pool:
+    layer_sequence.append(
+        layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)
+    )
+
+model = keras.Sequential(layer_sequence)
+
+# Convert the model using the TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+def representative_data_gen():
+    dataset = [
+        np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)
+        for s in range(10)
+    ]
+    for input_value in dataset:
+        # The model has only one input, so each data point has one element.
+        yield [input_value]
+
+
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8
+converter.inference_output_type = tf.int8
+converter.representative_dataset = representative_data_gen
+converter._experimental_disable_per_channel = True
+
+tflite_model = converter.convert()
+
+# Save the model.
+with open("conv.tflite", "wb") as f:
+    f.write(tflite_model)
+
+# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator.

+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./conv.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path="./conv.tflite")
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)
+interpreter.set_tensor(input_details[0]["index"], input_matrix)
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input_matrix, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.

+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
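+
+# The exact input tensor name expected by the importer can be read from the
+# TFLite interpreter instead of being hard-coded:
+print("TFLite input tensors:", [detail["name"] for detail in input_details])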
+mod, params = relay.frontend.from_tflite(
+    tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}
+)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare-metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# ----------------------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard tools, and then test the generated baremetal code on the Spike simulator.

+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.

+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "conv2d_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if there are errors, these can be related to rounding errors.
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py
new file mode 100644
index 000000000000..35349a5c157f
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py
@@ -0,0 +1,214 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single dense layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized dense (fully connected) layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+
+"""
+
+import tensorflow as tf
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#   source $CHIPYARD_HOME/env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.

+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we will later compare against the actual output obtained after running on the Gemmini accelerator.

+# Now we define the parameters of the layer we want to test. In this case:
+input_height = 32
+input_width = 32
+output_width = 32
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input.
+class Model(tf.Module):
+    def __init__(self, name=None):
+        super().__init__(name)
+        self.w = tf.Variable(tf.random.normal([input_width, output_width]), name="w")
+        self.b = tf.Variable(tf.random.normal([output_width]), name="b")
+
+    @tf.function(
+        input_signature=[
+            tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),
+        ]
+    )
+    def matmul(self, x):
+        return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b
+
+model = Model()
+
+# Convert the model using the TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+
+def representative_data_gen():
+    dataset = [
+        (
+            np.array(
+                np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32
+            ),
+            np.array(
+                np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32
+            ),
+        )
+        for s in range(100)
+    ]
+    for input_value in dataset:
+        # The model has only one input, so each data point has one element.
+        yield [input_value[0]]
+
+
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8
+converter.inference_output_type = tf.int8
+converter.representative_dataset = representative_data_gen
+converter._experimental_disable_per_channel = True
+
+tflite_model = converter.convert()
+
+# Save the model.
+with open("matmul.tflite", "wb") as f:
+    f.write(tflite_model)
+
+# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./matmul.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+tensor_details = interpreter.get_tensor_details()
+
+input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)
+interpreter.set_tensor(input_details[0]["index"], input1)
+
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input1, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.

+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={
+        "serving_default_x:0": (input_height, input_width),
+    },
+    dtype_dict={
+        "serving_default_x:0": input_dtype,
+    },
+)
+mod["main"]
+
+# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare-metal C code, without calls to memory allocator APIs.
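+# (USMP is TVM's Unified Static Memory Planner: it plans the placement of
+# every tensor at compile time, which is what removes the need for a runtime
+# memory allocator in the generated code.)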
+
+# Now that we have created the model, we import it and run it, storing the output so we can later compare it with the output obtained from the Gemmini accelerator.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./matmul.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+tensor_details = interpreter.get_tensor_details()
+
+input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)
+interpreter.set_tensor(input_details[0]["index"], input1)
+
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input1, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={
+        "serving_default_x:0": (input_height, input_width),
+    },
+    dtype_dict={
+        "serving_default_x:0": input_dtype,
+    },
+)
+mod["main"]
+
+# In order to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard tools, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "dense_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if the comparison reports errors, they may be caused by rounding errors.
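+
+# If you also want to compare outputs on the host side, a comparison that
+# tolerates off-by-one rounding differences could look like the following
+# sketch. Here `actual` stands for a hypothetical array read back from a
+# Gemmini run; this script does not produce one, so the helper is only
+# illustrative.
+def outputs_match(expected, actual, tol=1):
+    # Cast to int32 first so the uint8/int8 subtraction cannot wrap around.
+    return np.all(np.abs(expected.astype(np.int32) - actual.astype(np.int32)) <= tol)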
+""" +Running TVM on the Gemmini accelerator - A single 2d depthwise convolutional layer example +====================================================================================== +**Author**: +`Federico Peccia `_ + +This tutorials shows how a quantized 2D depthwise convolution layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension. +""" + +import itertools +from pyrsistent import v +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np +import os +import argparse +import random +import tvm.contrib.gemmini as gemmini +from tvm import relay +import tvm + +################################## +# Pre-requisites +# -------------------------------- +# +# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: +# +# .. code-block:: bash +# +# source /env.sh +# +# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the tvm entironment. + +################################## +# Baseline generation +# -------------------------------- +# +# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. + +# Then we define the parameters of the layer we want to test. In this case: +input_height = 112 +input_width = 112 +input_channels = 32 +kernel_size = 3 +stride = 1 +padding = 'same' +activation = None +bias = True + +# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. +model = keras.Sequential( + [ + layers.DepthwiseConv2D( + kernel_size=kernel_size, + padding=padding, + activation=activation, + use_bias=True, + bias_initializer="ones", + input_shape=(input_height, input_width, input_channels), + strides=stride, + ) + ] +) + +# Convert the concrete functions using TFLiteConverter +converter = tf.lite.TFLiteConverter.from_keras_model(model) + +def representative_data_gen(): + dataset = [ + np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32) + for s in range(10) + ] + for input_value in dataset: + # Model has only one input so each data point has one element.s + yield [input_value] + + +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.uint8 +converter.inference_output_type = tf.int8 +converter.representative_dataset = representative_data_gen +converter._experimental_disable_per_channel = True + +tflite_model = converter.convert() + +# Save the model. +with open("dwconv.tflite", "wb") as f: + f.write(tflite_model) + +# Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. 
+os.system("rm -rf model.tar dev/ include/ generated-project/") + +tflite_file = "./dwconv.tflite" +tflite_model_buf = open(tflite_file, "rb").read() +input_tensor = "layer1_input" +input_dtype = "uint8" + +os.system("mkdir -p include") + +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +# Load the TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path="./dwconv.tflite") +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() +tensor_details = interpreter.get_tensor_details() + +input = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8) +interpreter.set_tensor(input_details[0]["index"], input) + +interpreter.invoke() +expected_output = interpreter.get_tensor(output_details[0]["index"]) + +# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. +gemmini.create_header_file("inputs", "data", "input", input, "./include") +gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") + +################################## +# Compiling the model with TVM +# -------------------------------- +# +# In this section, we will compile the model using TVM and the Gemmini integration. + +# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. +gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096) + +# The TFLite model generated in the previous steps is now imported into TVM. +mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} +) +mod["main"] + +# In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. +mod = gemmini.preprocess_pass(mod) +mod["main"] + +# Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. +# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. 
+
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py
new file mode 100644
index 000000000000..03798ae62851
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A single 2d max pooling layer example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized 2D max pooling layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+"""
+
+import tensorflow as tf
+from tensorflow.keras import layers
+import numpy as np
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#     source <your-chipyard-home>/env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# First, we define the parameters of the layer we want to test:
+input_height = 16
+input_width = 16
+input_channels = 16
+pool_size = 2
+pool_stride = 1
+pool_padding = 'valid'
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific converter flags.
+class Model(tf.Module):
+    def __init__(self, name=None):
+        super().__init__(name)
+
+    @tf.function(
+        input_signature=[
+            tf.TensorSpec(
+                shape=[1, input_height, input_width, input_channels],
+                dtype=tf.float32,
+            )
+        ]
+    )
+    def maxpool(self, x):
+        return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)
+
+model = Model()
+
+# Convert the model using TFLiteConverter
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+
+def representative_data_gen():
+    dataset = [
+        np.array(
+            np.random.randint(
+                -127, 128, size=(1, input_height, input_width, input_channels)
+            ),
+            dtype=np.float32,
+        )
+        for s in range(100)
+    ]
+    for input_value in dataset:
+        # Model has only one input, so each data point has one element.
+        yield [input_value]
+
+
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.uint8
+converter.inference_output_type = tf.int8
+converter.representative_dataset = representative_data_gen
+converter._experimental_disable_per_channel = True
+
+tflite_model = converter.convert()
+
+# Save the model.
+with open("maxpool.tflite", "wb") as f:
+    f.write(tflite_model)
+
+# Now that we have created the model, we import it and run it, storing the output so we can later compare it with the output obtained from the Gemmini accelerator.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+
+tflite_file = "./maxpool.tflite"
+tflite_model_buf = open(tflite_file, "rb").read()
+input_tensor = "layer1_input"
+input_dtype = "uint8"
+
+os.system("mkdir -p include")
+
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+# Load the TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)
+interpreter.allocate_tensors()
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+tensor_details = interpreter.get_tensor_details()
+
+input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)
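+
+# Recall how affine quantization maps the uint8 values we feed to the model
+# to real numbers: real = scale * (q - zero_point). A toy example with made-up
+# values (purely illustrative, not read from this model):
+toy_scale, toy_zero_point = 0.5, 128
+toy_q = 130
+print(toy_scale * (toy_q - toy_zero_point))  # -> 1.0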
+
+interpreter.set_tensor(input_details[0]["index"], input_matrix_1)
+
+interpreter.invoke()
+expected_output = interpreter.get_tensor(output_details[0]["index"])
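+
+# For a 'valid'-padded pool, the output spatial size is
+# floor((input - pool_size) / stride) + 1, i.e. (16 - 2) / 1 + 1 = 15 here.
+# A quick sanity check (an optional sketch):
+pool_out = (input_height - pool_size) // pool_stride + 1
+assert expected_output.shape == (1, pool_out, pool_out, input_channels)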
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input_matrix_1, "./include")
+gemmini.create_header_file("outputs", "data", "output", expected_output, "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+mod, params = relay.frontend.from_tflite(
+    tflite_model,
+    shape_dict={"serving_default_x": (1, input_height, input_width, input_channels)},
+    dtype_dict={"serving_default_x": input_dtype},
+)
+mod["main"]
+
+# In order to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard tools, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
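+
+# The archive uses the Model Library Format. As an optional sketch (assuming
+# the standard MLF layout with a metadata.json describing the module; check
+# the member listing printed above if the lookup fails), we can peek at it:
+import json
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    member = next(m for m in tar_f.getmembers() if m.name.endswith("metadata.json"))
+    metadata = json.load(tar_f.extractfile(member))
+print(sorted(metadata.keys()))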
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "maxpool2d_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+# Note: if the comparison reports errors, they may be caused by rounding errors.
+generated_project.flash()
\ No newline at end of file
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py
new file mode 100644
index 000000000000..5d3a5009b67e
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py
@@ -0,0 +1,262 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Running TVM on the Gemmini accelerator - A complete MobileNet example
+======================================================================================
+**Author**:
+`Federico Peccia `_
+
+This tutorial shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.
+"""
+
+import numpy as np
+import tensorflow as tf
+import os
+import tvm.contrib.gemmini as gemmini
+from tvm import relay
+import tvm
+from tvm.contrib.download import download_testdata
+
+##################################
+# Pre-requisites
+# --------------------------------
+#
+# After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial:
+#
+# .. code-block:: bash
+#
+#     source <your-chipyard-home>/env.sh
+#
+# WARNING: if you have installed TVM in a virtual environment, FIRST activate the Chipyard environment, and THEN activate the TVM environment.
+
+##################################
+# Helper functions
+# --------------------------------
+#
+# These functions will help us generate the MobileNet model.

+def get_real_image(im_height, im_width):
+    from PIL import Image
+
+    repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/"
+    img_name = "elephant-299.jpg"
+    image_url = os.path.join(repo_base, img_name)
+    img_path = download_testdata(image_url, img_name, module="data")
+    image = Image.open(img_path).resize((im_height, im_width))
+    x = np.array(image).astype("uint8")
+    data = np.reshape(x, (1, im_height, im_width, 3))
+    return data
+
+def run_tflite_model(tflite_model_buf, input_data):
+    """Generic function to execute TFLite"""
+    try:
+        from tensorflow import lite as interpreter_wrapper
+    except ImportError:
+        from tensorflow.contrib import lite as interpreter_wrapper
+
+    input_data = input_data if isinstance(input_data, list) else [input_data]
+
+    interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    # set input
+    assert len(input_data) == len(input_details)
+    for i in range(len(input_details)):
+        interpreter.set_tensor(input_details[i]["index"], input_data[i])
+
+    # Run
+    interpreter.invoke()
+
+    # get output
+    tflite_output = list()
+    for i in range(len(output_details)):
+        tflite_output.append(interpreter.get_tensor(output_details[i]["index"]))
+
+    return tflite_output
+
+def download_model():
+    model_url = (
+        "https://storage.googleapis.com/download.tensorflow.org/models/"
+        "tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
+    )
+
+    # Download the model tar file and extract it to get mobilenet_v2_1.0_224.tflite
+    model_path = download_testdata(
+        model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"]
+    )
+    model_dir = os.path.dirname(model_path)
+
+    return model_dir, model_path
+
+
+def extract(path):
+    import tarfile
+
+    if path.endswith("tgz") or path.endswith("gz"):
+        dir_path = os.path.dirname(path)
+        tar = tarfile.open(path)
+        tar.extractall(path=dir_path)
+        tar.close()
+    else:
+        raise RuntimeError("Could not decompress the file: " + path)
+
+
+def create_tflite_model(model_dir: str):
+    def representative_data_gen():
+        dataset = [
+            np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32)
+            for s in range(100)
+        ]
+        for input_value in dataset:
+            # Model has only one input, so each data point has one element.
+            yield [input_value]
+
+    pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0]
+    converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
+        f"{model_dir}/{pb_file}",
+        input_arrays=["input"],
+        input_shapes={"input": [1, 224, 224, 3]},
+        output_arrays=["MobilenetV2/Predictions/Reshape"],
+    )
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    converter.inference_input_type = tf.uint8
+    converter.inference_output_type = tf.uint8
+    converter.representative_dataset = representative_data_gen
+    converter._experimental_disable_per_channel = True
+
+    tflite_model = converter.convert()
+    tflite_model_name = pb_file.replace(".pb", ".tflite")
+    with open(f"{model_dir}/{tflite_model_name}", "wb") as f:
+        f.write(tflite_model)
+
+    return f"{model_dir}/{tflite_model_name}"
+
+
+def generate_mobilenet_tflite_model():
+    model_dir, model_path = download_model()
+    extract(model_path)
+    return create_tflite_model(model_dir)
+
+##################################
+# Baseline generation
+# --------------------------------
+#
+# In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator.
+
+# We clean and prepare the workspace.
+os.system("rm -rf model.tar dev/ include/ generated-project/")
+os.system("mkdir -p include")
+
+# We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific converter flags.
+tflite_model_path = generate_mobilenet_tflite_model()
+
+input_image = get_real_image(224, 224)
+
+tflite_model_buf = open(tflite_model_path, "rb").read()
+
+# Now that we have created the model, we import it and run it, storing the output so we can later compare it with the output obtained from the Gemmini accelerator.
+try:
+    import tflite
+
+    tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+except AttributeError:
+    import tflite.Model
+
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+tflite_res = run_tflite_model(tflite_model_buf, input_image)
+tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]
+print("Expected argmax = %i" % (tflite_pred[0],))
+print("Expected top-5 labels = %s" % (tflite_pred,))
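+
+# The line above extracts the top-5 class indices: argsort sorts ascending,
+# [-5:] keeps the five largest scores, and [::-1] flips them into descending
+# order. A tiny standalone example of the same idiom:
+toy_scores = np.array([0.1, 0.7, 0.05, 0.9, 0.15])
+print(toy_scores.argsort()[-3:][::-1])  # -> [3 1 4], the indices of the top-3 scores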
+
+# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator and compare the expected output with the actual predicted one.
+gemmini.create_header_file("inputs", "data", "input", input_image, "./include")
+gemmini.create_header_file("outputs", "data", "output", tflite_pred.astype(np.uint32), "./include")
+
+##################################
+# Compiling the model with TVM
+# --------------------------------
+#
+# In this section, we will compile the model using TVM and the Gemmini integration.
+
+# The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator on which we want to execute our operation. We use the default parameters here.
+gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)
+
+# The TFLite model generated in the previous steps is now imported into TVM.
+dtype_dict = {"input": input_image.dtype.name}
+shape_dict = {"input": input_image.shape}
+
+mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)
+mod = relay.transform.InferType()(mod)
+mod["main"]
+
+# In order to build a model for the Gemmini accelerator, we need to replace all supported layers with the Gemmini-specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass.
+mod = gemmini.preprocess_pass(mod)
+mod["main"]
+
+# Now, we build the Relay graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get complete bare metal C code, without calls to memory allocator APIs.
+# The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator.
+RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": False})
+TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"})
+EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1})
+
+with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]):
+    module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)
+
+##################################
+# Exporting and testing the model using microTVM
+# --------------------------------
+#
+# In this section, we will export the model using one of the provided example microTVM projects, compile it using the Chipyard tools, and then test the generated baremetal code on the Spike simulator.
+
+# The built model is exported to the Model Library Format. This will be used in the next steps to generate the baremetal project.
+import pathlib
+
+os.system("mkdir dev")
+model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
+tvm.micro.export_model_library_format(module, model_library_format_tar_path)
+
+import tarfile
+
+with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
+    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
+
+# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+project_options = {
+    "project_type": "mobilenet_example"
+}
+
+generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+generated_project = tvm.micro.generate_project(
+    template_project_path, module, generated_project_dir, project_options
+)
+
+# We build the project. This will generate an executable we can run on the Spike simulator.
+generated_project.build()
+
+# Finally, we execute the compiled baremetal project on the Spike simulator.
+generated_project.flash()
\ No newline at end of file
diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
deleted file mode 100644
index 2c2527830858..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet-tutorial.ipynb
+++ /dev/null
@@ -1,311 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# MobileNet tutorial\n",
-    "\n",
-    "This tutorials shows how a quantized MobileNet network can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm\n", - "from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We clean and prepare the workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "os.system(\"mkdir -p include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tflite_model_dir = generate_mobilenet_tflite_model()\n", - "\n", - "input_image = get_real_image(224, 224)\n", - "\n", - "tflite_model_file = os.path.join(tflite_model_dir)\n", - "tflite_model_buf = open(tflite_model_file, \"rb\").read()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "tflite_res = run_tflite_model(tflite_model_buf, input_image)\n", - "tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1]\n", - "print(\"Expected argmax = %i\" % (tflite_pred[0],))\n", - "print(\"Expected max labels = %s\" % (tflite_pred,))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_image, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", tflite_pred.astype(np.uint32), \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dtype_dict = {\"input\": input_image.dtype.name}\n", - "shape_dict = {\"input\": input_image.shape}\n", - "\n", - "mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict)\n", - "mod = relay.transform.InferType()(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"mobilenet_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 ('tvm': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py b/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py deleted file mode 100644 index 51e75fdd7022..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/networks/mobilenet_utils.py +++ /dev/null @@ -1,138 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-""" -Utils to help generate the MobileNet TFLite model -===================== -**Author**: `Federico Peccia `_ -""" - -import os -from tvm.contrib.download import download_testdata -import numpy as np -import tensorflow as tf - - -def get_real_image(im_height, im_width): - from PIL import Image - - repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" - img_name = "elephant-299.jpg" - image_url = os.path.join(repo_base, img_name) - img_path = download_testdata(image_url, img_name, module="data") - image = Image.open(img_path).resize((im_height, im_width)) - x = np.array(image).astype("uint8") - data = np.reshape(x, (1, im_height, im_width, 3)) - return data - - -def run_tflite_model(tflite_model_buf, input_data): - """Generic function to execute TFLite""" - try: - from tensorflow import lite as interpreter_wrapper - except ImportError: - from tensorflow.contrib import lite as interpreter_wrapper - - input_data = input_data if isinstance(input_data, list) else [input_data] - - interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) - interpreter.allocate_tensors() - - input_details = interpreter.get_input_details() - output_details = interpreter.get_output_details() - - # set input - assert len(input_data) == len(input_details) - for i in range(len(input_details)): - interpreter.set_tensor(input_details[i]["index"], input_data[i]) - - # Run - interpreter.invoke() - - # get output - tflite_output = list() - for i in range(len(output_details)): - tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) - - return tflite_output - - -def download_model(): - model_url = ( - "https://storage.googleapis.com/download.tensorflow.org/models/" - "tflite_11_05_08/mobilenet_v2_1.0_224.tgz" - ) - - # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite - model_path = download_testdata( - model_url, "mobilenet_v2_1.0_224.tgz", module=["tf", "official", "mobilenet_v2"] - ) - model_dir = os.path.dirname(model_path) - - return model_dir, model_path - - -def extract(path): - import tarfile - - if path.endswith("tgz") or path.endswith("gz"): - dir_path = os.path.dirname(path) - tar = tarfile.open(path) - tar.extractall(path=dir_path) - tar.close() - else: - raise RuntimeError("Could not decompress the file: " + path) - - -def create_tflite_model(model_dir: str): - # tflite_model_name = [f for f in os.listdir(model_dir) if f.endswith(".tflite")][0] - # return f"{model_dir}/{tflite_model_name}" - def representative_data_gen(): - dataset = [ - np.array(np.random.randint(0, 255, size=(1, 224, 224, 3)), dtype=np.float32) - for s in range(100) - ] - for input_value in dataset: - # Model has only one input so each data point has one element.s - yield [input_value] - - pb_file = [f for f in os.listdir(model_dir) if f.endswith(".pb")][0] - converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( - f"{model_dir}/{pb_file}", - input_arrays=["input"], - input_shapes={"input": [1, 224, 224, 3]}, - output_arrays=["MobilenetV2/Predictions/Reshape"], - ) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - # converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS] - converter.inference_input_type = tf.uint8 - converter.inference_output_type = tf.uint8 - converter.representative_dataset = representative_data_gen - converter._experimental_disable_per_channel = True - - tflite_model = converter.convert() - tflite_model_name = 
pb_file.replace(".pb", ".tflite") - with open(f"{model_dir}/{tflite_model_name}", "wb") as f: - f.write(tflite_model) - - return f"{model_dir}/{tflite_model_name}" - - -def generate_mobilenet_tflite_model(): - model_dir, model_path = download_model() - extract(model_path) - return create_tflite_model(model_dir) diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb deleted file mode 100644 index 3bb2fa5788e9..000000000000 --- a/python/tvm/contrib/gemmini/tutorials/single_operators/add-tutorial.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add layer tutorial\n", - "\n", - "This tutorials shows how a quantized add layer can be compiled to be executed on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension.\n", - "\n", - "Note: This is an **experimental** layer!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "activation = 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " ),\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " ),\n", - " ]\n", - " )\n", - " def add(self, x, y):\n", - " if activation == 0:\n", - " return x + y\n", - " else:\n", - " return layers.Activation(\"relu\")(x + y)\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " (\n", - " np.array(\n", - " np.random.randint(\n", - " -127, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " ),\n", - " np.array(\n", - " np.random.randint(\n", - " 0, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " ),\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " yield [input_value[0], input_value[1]]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"add.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./add.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", - "interpreter.set_tensor(input_details[1][\"index\"], input_matrix_2)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input_1\", input_matrix_2, \"./include\")\n", - "gemmini.create_header_file(\"inputs\", \"data\", \"input_2\", input_matrix_1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels), \"serving_default_y\": (1, input_height, input_width, input_channels)},\n", - " dtype_dict={\"serving_default_x\": input_dtype, \"serving_default_y\": input_dtype},\n", - ")\n", - "mod = relay.transform.InferType()(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
-    "project_options = {\n",
-    "    \"project_type\": \"add_example\"\n",
-    "} \n",
-    "\n",
-    "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
-    "generated_project = tvm.micro.generate_project(\n",
-    "    template_project_path, module, generated_project_dir, project_options\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We build the project. This will generate an executable we can run on the Spike simulator."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.build()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
-    "\n",
-    "Note: if there are errors, these can be related to rounding errors."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.flash()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.8.10 ('tvm': venv)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb
deleted file mode 100644
index c7512586b809..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/single_operators/conv2d-tutorial.ipynb
+++ /dev/null
@@ -1,378 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 2D convolution layer tutorial\n",
-    "\n",
-    "This tutorial shows how a quantized 2D convolution layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow as tf\n",
-    "from tensorflow import keras\n",
-    "from tensorflow.keras import layers\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import tvm.contrib.gemmini as gemmini\n",
-    "from tvm import relay\n",
-    "import tvm"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"CHIPYARD_HOME\"] = \"\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Then we define the parameters of the layer we want to test.
In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "output_channels = 16\n", - "kernel_size = 3\n", - "stride = 1\n", - "padding = 'valid'\n", - "activation = None\n", - "bias = True\n", - "\n", - "# We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator.\n", - "pool_size = 1\n", - "pool_stride = 1\n", - "pool_padding = 'valid'\n", - "use_pool = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "layer_sequence = [\n", - " layers.Conv2D(\n", - " output_channels,\n", - " kernel_size=kernel_size,\n", - " padding=padding,\n", - " activation=activation,\n", - " use_bias=True,\n", - " bias_initializer=\"ones\",\n", - " input_shape=(input_height, input_width, input_channels),\n", - " strides=stride,\n", - " )\n", - "]\n", - "if use_pool:\n", - " layer_sequence.append(\n", - " layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)\n", - " )\n", - "\n", - "model = keras.Sequential(layer_sequence)\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32)\n", - " for s in range(10)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.s\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"conv.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
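Before running anything, the expected output shape can be derived from the parameters above: with 'valid' padding a convolution produces `(in - kernel) // stride + 1` pixels per spatial dimension, and an optional pooling stage shrinks that again. A quick sanity check in plain arithmetic (not part of the tutorial flow; it assumes the square inputs used here):

```python
# For the defaults above: (16 - 3) // 1 + 1 = 14, so the output is (1, 14, 14, 16).
out_size = (input_height - kernel_size) // stride + 1
if use_pool:
    out_size = (out_size - pool_size) // pool_stride + 1
print("expected output shape:", (1, out_size, out_size, output_channels))
```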
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./conv.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=\"./conv.tflite\")\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix)\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"conv2d_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." 
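One way to make that comparison concrete: if the values produced on Gemmini are read back into a NumPy array (called `actual_output` in this sketch, which is hypothetical, since the tutorial instead checks the result inside the baremetal binary), an off-by-one tolerance on the quantized values is usually enough to absorb requantization rounding:

```python
import numpy as np

# Compare against the TFLite reference computed earlier; allow +/- 1 LSB.
diff = np.abs(actual_output.astype(np.int32) - expected_output.astype(np.int32))
print(f"{(diff > 1).sum()} of {diff.size} elements differ by more than 1 LSB")
```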
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.flash()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.8.10 ('tvm': venv)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb
deleted file mode 100644
index d1959f66b72a..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/single_operators/dense-tutorial.ipynb
+++ /dev/null
@@ -1,378 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Dense layer tutorial\n",
-    "\n",
-    "This tutorial shows how a quantized dense (fully connected) layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow as tf\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import tvm.contrib.gemmini as gemmini\n",
-    "from tvm import relay\n",
-    "import tvm"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"CHIPYARD_HOME\"] = \"\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Then we define the parameters of the layer we want to test. In this case:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "input_height = 32\n",
-    "input_width = 32\n",
-    "output_width = 32"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
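The "specific flags" are the converter settings used in the next cell. They are previewed here with explanatory annotations (the comments are editorial; the values are exactly those set below):

```python
converter.optimizations = [tf.lite.Optimize.DEFAULT]            # enable post-training quantization
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]  # int8 kernels only
converter.inference_input_type = tf.uint8                       # uint8 model inputs
converter.inference_output_type = tf.int8                       # int8 model outputs
converter.representative_dataset = representative_data_gen      # calibration data for the scales
converter._experimental_disable_per_channel = True              # per-tensor weight scales
```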
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - " self.w = tf.Variable(tf.random.normal([input_width, output_width]), name=\"w\")\n", - " self.b = tf.Variable(tf.random.normal([output_width]), name=\"b\")\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(shape=[input_height, input_width], dtype=tf.float32),\n", - " ]\n", - " )\n", - " def matmul(self, x):\n", - " return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " (\n", - " np.array(\n", - " np.random.randint(-127, 128, size=(input_height, input_width)), dtype=np.float32\n", - " ),\n", - " np.array(\n", - " np.random.randint(-127, 128, size=(input_width, output_width)), dtype=np.float32\n", - " ),\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.\n", - " yield [input_value[0]]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"matmul.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
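To understand what gets compared, this is roughly the integer arithmetic a fully quantized dense layer performs. It is a simplified per-tensor sketch; `x_zp`, `out_scale` and `out_zp` stand in for the scales and zero points stored in the TFLite model and are not read out here:

```python
import numpy as np

def quantized_dense_reference(x_q, w_q, bias_q, x_zp, out_scale, out_zp):
    # int32 accumulation of zero-point-corrected int8 operands ...
    acc = (x_q.astype(np.int32) - x_zp) @ w_q.astype(np.int32) + bias_q
    # ... then requantization back to int8; the rounding in this step is
    # where reference and accelerator results can drift by one.
    return np.clip(np.round(acc * out_scale) + out_zp, -128, 127).astype(np.int8)
```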
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./matmul.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input1 = np.random.randint(0, 255, (input_height, input_width), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input1)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\n", - " \"serving_default_x:0\": (input_height, input_width),\n", - " },\n", - " dtype_dict={\n", - " \"serving_default_x:0\": input_dtype,\n", - " },\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"dense_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." 
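Those rounding differences have a concrete cause: requantization in TFLite's reference kernels resolves exact .5 ties away from zero, while NumPy's default round-to-nearest-even resolves them differently. A two-line illustration:

```python
import numpy as np

vals = np.array([2.5, 3.5, -2.5])
print(np.round(vals))                                # [ 2.  4. -2.]  round half to even
print(np.floor(np.abs(vals) + 0.5) * np.sign(vals))  # [ 3.  4. -3.]  round half away from zero
```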
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.flash()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.8.10 ('tvm': venv)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
deleted file mode 100644
index b5753a300401..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/single_operators/dwconv2d-tutorial.ipynb
+++ /dev/null
@@ -1,373 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 2D depthwise convolution layer tutorial\n",
-    "\n",
-    "This tutorial shows how a quantized 2D depthwise convolution layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import itertools\n",
-    "from pyrsistent import v\n",
-    "import tensorflow as tf\n",
-    "from tensorflow import keras\n",
-    "from tensorflow.keras import layers\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import argparse\n",
-    "import random\n",
-    "import tvm.contrib.gemmini as gemmini\n",
-    "from tvm import relay\n",
-    "import tvm"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"CHIPYARD_HOME\"] = \"\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Then we define the parameters of the layer we want to test. In this case:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "input_height = 112\n",
-    "input_width = 112\n",
-    "input_channels = 32\n",
-    "kernel_size = 3\n",
-    "stride = 1\n",
-    "padding = 'same'\n",
-    "activation = None\n",
-    "bias = True"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input."
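For readers new to the operator: a depthwise convolution filters each input channel independently with its own kernel instead of mixing channels like a regular convolution. A float NumPy reference for the stride-1 'same' configuration above (illustrative only; the real model additionally quantizes inputs, weights and outputs):

```python
import numpy as np

def depthwise_conv2d_same(x, w, b):
    """x: (H, W, C) input, w: (k, k, C) per-channel kernels, b: (C,) bias."""
    k = w.shape[0]
    pad = k // 2
    xp = np.pad(x, ((pad, pad), (pad, pad), (0, 0)))
    out = np.zeros(x.shape, dtype=np.float32)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            # Each output channel only sees its own k x k window.
            out[i, j] = (xp[i:i + k, j:j + k] * w).sum(axis=(0, 1)) + b
    return out
```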
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = keras.Sequential(\n", - " [\n", - " layers.DepthwiseConv2D(\n", - " kernel_size=kernel_size,\n", - " padding=padding,\n", - " activation=activation,\n", - " use_bias=True,\n", - " bias_initializer=\"ones\",\n", - " input_shape=(input_height, input_width, input_channels),\n", - " strides=stride,\n", - " )\n", - " ]\n", - ")\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32)\n", - " for s in range(10)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.s\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"dwconv.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./dwconv.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=\"./dwconv.tflite\")\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input = np.random.randint(0, 2, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "interpreter.set_tensor(input_details[0][\"index\"], input)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." 
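The helper used below embeds NumPy arrays into C headers so the baremetal test can compile them in. The real implementation is `gemmini.create_header_file`; the following is only a sketch of the idea, with a hypothetical `write_c_header` helper and the element type fixed to `int8_t` for brevity:

```python
import pathlib
import numpy as np

def write_c_header(name, array, out_dir):
    # Flatten the array and emit it as a C array initializer.
    values = ", ".join(str(int(v)) for v in np.asarray(array).ravel())
    text = (
        "#include <stdint.h>\n"
        f"const int8_t {name}[{array.size}] = {{{values}}};\n"
    )
    pathlib.Path(out_dir, f"{name}.h").write_text(text)
```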
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype}\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pathlib\n",
-    "\n",
-    "os.system(\"mkdir dev\")\n",
-    "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n",
-    "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n",
-    "\n",
-    "import tarfile\n",
-    "\n",
-    "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n",
-    "    print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n",
-    "project_options = {\n",
-    "    \"project_type\": \"dwconv2d_example\"\n",
-    "} \n",
-    "\n",
-    "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n",
-    "generated_project = tvm.micro.generate_project(\n",
-    "    template_project_path, module, generated_project_dir, project_options\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We build the project. This will generate an executable we can run on the Spike simulator."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.build()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finally, we execute the compiled baremetal project on the Spike simulator.\n",
-    "\n",
-    "Note: if there are errors, these can be related to rounding errors."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "generated_project.flash()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.8.10 ('tvm': venv)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "5d7de8d84d32cbbe537c50b34cb949251a03cf44fca18853707459ebd33e07d4"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb b/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb
deleted file mode 100644
index bdee93760f96..000000000000
--- a/python/tvm/contrib/gemmini/tutorials/single_operators/maxpool2d-tutorial.ipynb
+++ /dev/null
@@ -1,378 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 2D max pooling layer tutorial\n",
-    "\n",
-    "This tutorial shows how a quantized 2D max pooling layer can be compiled for execution on the Gemmini accelerator. The generated baremetal C code is then tested on the Spike RISC-V ISA simulator. Before starting this tutorial, you should have downloaded the Chipyard repository and installed the Spike simulator with the Gemmini extension."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "from tensorflow.keras import layers\n", - "import numpy as np\n", - "import os\n", - "import tvm.contrib.gemmini as gemmini\n", - "from tvm import relay\n", - "import tvm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to export the environment variable CHIPYARD_HOME, in order to be able to run the Spike simulator correctly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"CHIPYARD_HOME\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we define the parameters of the layer we want to test. In this case:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_height = 16\n", - "input_width = 16\n", - "input_channels = 16\n", - "pool_size = 2\n", - "pool_stride = 1\n", - "pool_padding = 'valid'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Model(tf.Module):\n", - " def __init__(self, name=None):\n", - " super().__init__(name)\n", - "\n", - " @tf.function(\n", - " input_signature=[\n", - " tf.TensorSpec(\n", - " shape=[1, input_height, input_width, input_channels],\n", - " dtype=tf.float32,\n", - " )\n", - " ]\n", - " )\n", - " def maxpool(self, x):\n", - " return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x)\n", - "\n", - "model = Model()\n", - "\n", - "# Convert the concrete functions using TFLiteConverter\n", - "converter = tf.lite.TFLiteConverter.from_keras_model(model)\n", - "\n", - "\n", - "def representative_data_gen():\n", - " dataset = [\n", - " np.array(\n", - " np.random.randint(\n", - " -127, 128, size=(1, input_height, input_width, input_channels)\n", - " ),\n", - " dtype=np.float32,\n", - " )\n", - " for s in range(100)\n", - " ]\n", - " for input_value in dataset:\n", - " # Model has only one input so each data point has one element.\n", - " yield [input_value]\n", - "\n", - "\n", - "converter.optimizations = [tf.lite.Optimize.DEFAULT]\n", - "converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]\n", - "converter.inference_input_type = tf.uint8\n", - "converter.inference_output_type = tf.int8\n", - "converter.representative_dataset = representative_data_gen\n", - "converter._experimental_disable_per_channel = True\n", - "\n", - "tflite_model = converter.convert()\n", - "\n", - "# Save the model.\n", - "with open(\"maxpool.tflite\", \"wb\") as f:\n", - " f.write(tflite_model)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator." 
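Max pooling needs no arithmetic beyond comparisons, which is why quantization leaves its values untouched. A NumPy reference for the 'valid' configuration above (the tutorial itself relies on the TFLite interpreter for the expected values):

```python
import numpy as np

def maxpool2d_valid(x, pool, stride):
    """x: (H, W, C); output spatial size is (in - pool) // stride + 1."""
    oh = (x.shape[0] - pool) // stride + 1
    ow = (x.shape[1] - pool) // stride + 1
    out = np.empty((oh, ow, x.shape[2]), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            window = x[i * stride : i * stride + pool, j * stride : j * stride + pool]
            out[i, j] = window.max(axis=(0, 1))
    return out
```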
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.system(\"rm -rf model.tar dev/ include/ generated-project/\")\n", - "\n", - "tflite_file = \"./maxpool.tflite\"\n", - "tflite_model_buf = open(tflite_file, \"rb\").read()\n", - "input_tensor = \"layer1_input\"\n", - "input_dtype = \"uint8\"\n", - "\n", - "os.system(\"mkdir -p include\")\n", - "\n", - "try:\n", - " import tflite\n", - "\n", - " tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "except AttributeError:\n", - " import tflite.Model\n", - "\n", - " tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)\n", - "\n", - "# Load the TFLite model and allocate tensors.\n", - "interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True)\n", - "interpreter.allocate_tensors()\n", - "input_details = interpreter.get_input_details()\n", - "output_details = interpreter.get_output_details()\n", - "tensor_details = interpreter.get_tensor_details()\n", - "\n", - "input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8)\n", - "\n", - "interpreter.set_tensor(input_details[0][\"index\"], input_matrix_1)\n", - "\n", - "interpreter.invoke()\n", - "expected_output = interpreter.get_tensor(output_details[0][\"index\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.create_header_file(\"inputs\", \"data\", \"input\", input_matrix_1, \"./include\")\n", - "gemmini.create_header_file(\"outputs\", \"data\", \"output\", expected_output, \"./include\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The TFLite model generated in the previous steps is now imported into TVM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod, params = relay.frontend.from_tflite(\n", - " tflite_model,\n", - " shape_dict={\"serving_default_x\": (1, input_height, input_width, input_channels)},\n", - " dtype_dict={\"serving_default_x\": input_dtype},\n", - ")\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the __gemmini.preprocess__ pass. Notice the changes in the \"main\" function after running the preprocess pass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "mod = gemmini.preprocess_pass(mod)\n", - "mod[\"main\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we build the Relay Graph. 
Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs.\n", - "\n", - "The __gemmini.build_config__ function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RUNTIME = tvm.relay.backend.Runtime(\"crt\", {\"system-lib\": False})\n", - "TARGET = tvm.target.target.Target({\"kind\": \"c\", \"device\": \"gemmini\"})\n", - "EXECUTOR = tvm.relay.backend.Executor(\"aot\", options={\"interface-api\": \"c\", \"unpacked-api\": 1})\n", - "\n", - "with gemmini.build_config(usmp_alg=\"hill_climb\",opt_level=3, disabled_pass=[\"AlterOpLayout\"]):\n", - " module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pathlib\n", - "\n", - "os.system(\"mkdir dev\")\n", - "model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), \"dev/model.tar\")\n", - "tvm.micro.export_model_library_format(module, model_library_format_tar_path)\n", - "\n", - "import tarfile\n", - "\n", - "with tarfile.open(model_library_format_tar_path, \"r:*\") as tar_f:\n", - " print(\"\\n\".join(f\" - {m.name}\" for m in tar_f.getmembers()))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects(\"gemmini\"))\n", - "project_options = {\n", - " \"project_type\": \"maxpool2d_example\"\n", - "} \n", - "\n", - "generated_project_dir = pathlib.Path(pathlib.Path.cwd(), \"generated-project\")\n", - "generated_project = tvm.micro.generate_project(\n", - " template_project_path, module, generated_project_dir, project_options\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We build the project. This will generate an executable we can run on the Spike simulator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.build()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we execute the compiled baremetal project on the Spike simulator.\n", - "\n", - "Note: if there are errors, these can be related to rounding errors." 
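Note that `flash()` does not program a device here: the template project's flash step simply launches the Spike simulator on the compiled binary (the project API server runs `spike --extension=gemmini <test>-baremetal` from `src/build`). Doing the same by hand looks like the sketch below; the paths assume the `generated-project` directory created above, and `spike` must be on the PATH, i.e. Chipyard's `env.sh` was sourced.

```python
import subprocess

# Equivalent to generated_project.flash() for the maxpool2d example: the
# project's Makefile names the binary <test>-baremetal inside src/build.
subprocess.run(
    "spike --extension=gemmini maxpool2d-baremetal",
    cwd="generated-project/src/build",
    shell=True,
    check=True,
)
```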
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_project.flash()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.8.10 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From c92ac821eed0c10b6f9f6f5af6b7c724408deb17 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:22:36 +0100 Subject: [PATCH 024/286] Small CMAKE fix and lint fixes --- CMakeLists.txt | 4 +- .../template_project/microtvm_api_server.py | 14 +++---- .../micro_gemmini/micro_gemmini_add.py | 37 +++++++++++-------- .../micro_gemmini/micro_gemmini_conv2d.py | 28 ++++++++------ .../micro_gemmini/micro_gemmini_dense.py | 11 ++---- .../micro_gemmini/micro_gemmini_dwconv2d.py | 22 ++++++----- .../micro_gemmini/micro_gemmini_maxpool2d.py | 21 +++++------ .../micro_gemmini/micro_gemmini_mobilenet.py | 14 ++++--- 8 files changed, 83 insertions(+), 68 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 47499ff90356..9cfa48fc045d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -575,7 +575,9 @@ if(USE_MICRO) # Unix Makefiles generator, need to add these explicit target-level dependency) add_dependencies(tvm_runtime zephyr) add_dependencies(tvm_runtime arduino) - add_dependencies(tvm_runtime gemmini) + if(USE_GEMMINI) + add_dependencies(tvm_runtime gemmini) + endif() if(MSVC) target_link_libraries(tvm PRIVATE host_standalone_crt ) target_link_libraries(tvm_runtime PRIVATE host_standalone_crt) diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index 85971316ec4e..df2f27d315ea 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -109,14 +109,15 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type): shutil.copytree(item, dest) else: shutil.copy2(item, dest) - + shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile") - test_name = project_type.replace("_example","") + test_name = project_type.replace("_example", "") new_line = f"tests = {test_name}\n" - with open(project_dir / "src" / "Makefile", 'r') as original: data = original.read() - with open(project_dir / "src" / "Makefile", 'w') as modified: modified.write(new_line + data) - + with open(project_dir / "src" / "Makefile", "r") as original: + data = original.read() + with open(project_dir / "src" / "Makefile", "w") as modified: + modified.write(new_line + data) CRT_COPY_ITEMS = ("include", "src") @@ -264,8 +265,7 @@ def build(self, options): def flash(self, options): test_name = options["project_type"].split("_")[0] subprocess.call( - "cd src/build && spike --extension=gemmini %s" - % (test_name + "-baremetal",), + "cd src/build && spike --extension=gemmini %s" % (test_name + "-baremetal",), shell=True, ) diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py index 
b3fe3c5bb3a0..b8521c4b6ae2 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py @@ -38,7 +38,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -80,24 +80,22 @@ def add(self, x, y): else: return layers.Activation("relu")(x + y) + model = Model() # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ ( np.array( - np.random.randint( - -127, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ), np.array( - np.random.randint( - 0, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(0, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ), ) @@ -147,8 +145,12 @@ def representative_data_gen(): output_details = interpreter.get_output_details() tensor_details = interpreter.get_tensor_details() -input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) -input_matrix_2 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) +input_matrix_2 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix_1) interpreter.set_tensor(input_details[1]["index"], input_matrix_2) @@ -168,13 +170,18 @@ def representative_data_gen(): # In this section, we will compile the model using TVM and the Gemmini integration. # The Gemmini environment class needs to be initialized with the parameters of the Gemmini accelerator where we want to execute our operation. We use here the default parameters. -gemmini.Environment.init_overwrite(dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True) +gemmini.Environment.init_overwrite( + dim=16, acc_rows=1024, bank_rows=4096, use_experimental_qnn_add=True +) # The TFLite model generated in the previous steps is now imported into TVM. 
mod, params = relay.frontend.from_tflite( tflite_model, - shape_dict={"serving_default_x": (1, input_height, input_width, input_channels), "serving_default_y": (1, input_height, input_width, input_channels)}, + shape_dict={ + "serving_default_x": (1, input_height, input_width, input_channels), + "serving_default_y": (1, input_height, input_width, input_channels), + }, dtype_dict={"serving_default_x": input_dtype, "serving_default_y": input_dtype}, ) mod = relay.transform.InferType()(mod) @@ -192,7 +199,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -217,9 +224,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "add_example" -} +project_options = {"project_type": "add_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -231,4 +236,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -#generated_project.flash() \ No newline at end of file +# generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py index 18bca38eafa0..b58881162dcc 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py @@ -38,7 +38,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -58,14 +58,14 @@ output_channels = 16 kernel_size = 3 stride = 1 -padding = 'valid' +padding = "valid" activation = None bias = True # We can add a max pooling layer after the convolution. This can be merged by the integration and can be executed together with the convolution on the Gemmini accelerator. pool_size = 1 pool_stride = 1 -pool_padding = 'valid' +pool_padding = "valid" use_pool = False # We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
@@ -92,9 +92,13 @@ # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ - np.array(np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), dtype=np.float32) + np.array( + np.random.randint(0, 10, size=(100, input_height, input_width, input_channels)), + dtype=np.float32, + ) for s in range(10) ] for input_value in dataset: @@ -140,7 +144,9 @@ def representative_data_gen(): interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() -input_matrix = np.random.randint(0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix = np.random.randint( + 0, 127, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix) interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) @@ -160,7 +166,9 @@ def representative_data_gen(): # The TFLite model generated in the previous steps is now imported into TVM. mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, ) mod["main"] @@ -174,7 +182,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -198,9 +206,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "conv2d_example" -} +project_options = {"project_type": "conv2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -212,4 +218,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py index 35349a5c157f..c9a7caffc71b 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py @@ -36,7 +36,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. 
code-block:: bash # # source /env.sh @@ -69,6 +69,7 @@ def __init__(self, name=None): def matmul(self, x): return tf.linalg.matmul(x, self.w, transpose_b=False) + self.b + model = Model() # Convert the concrete functions using TFLiteConverter @@ -172,7 +173,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -195,9 +196,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "dense_example" -} +project_options = {"project_type": "dense_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -210,5 +209,3 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. generated_project.flash() - - diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py index 44d3e57ea2d9..14c39898278e 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py @@ -41,7 +41,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -60,7 +60,7 @@ input_channels = 32 kernel_size = 3 stride = 1 -padding = 'same' +padding = "same" activation = None bias = True @@ -82,9 +82,13 @@ # Convert the concrete functions using TFLiteConverter converter = tf.lite.TFLiteConverter.from_keras_model(model) + def representative_data_gen(): dataset = [ - np.array(np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), dtype=np.float32) + np.array( + np.random.randint(0, 127, size=(10, input_height, input_width, input_channels)), + dtype=np.float32, + ) for s in range(10) ] for input_value in dataset: @@ -152,7 +156,9 @@ def representative_data_gen(): # The TFLite model generated in the previous steps is now imported into TVM. 
mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={input_tensor: (input_height, input_width, input_channels)}, dtype_dict={input_tensor: input_dtype} + tflite_model, + shape_dict={input_tensor: (input_height, input_width, input_channels)}, + dtype_dict={input_tensor: input_dtype}, ) mod["main"] @@ -166,7 +172,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -189,9 +195,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "dwconv2d_example" -} +project_options = {"project_type": "dwconv2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -204,4 +208,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py index 03798ae62851..6dbb11695ac2 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py @@ -36,7 +36,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. code-block:: bash # # source /env.sh @@ -55,7 +55,7 @@ input_channels = 16 pool_size = 2 pool_stride = 1 -pool_padding = 'valid' +pool_padding = "valid" # We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. 
class Model(tf.Module): @@ -73,6 +73,7 @@ def __init__(self, name=None): def maxpool(self, x): return layers.MaxPool2D(pool_size=pool_size, strides=pool_stride, padding=pool_padding)(x) + model = Model() # Convert the concrete functions using TFLiteConverter @@ -82,9 +83,7 @@ def maxpool(self, x): def representative_data_gen(): dataset = [ np.array( - np.random.randint( - -127, 128, size=(1, input_height, input_width, input_channels) - ), + np.random.randint(-127, 128, size=(1, input_height, input_width, input_channels)), dtype=np.float32, ) for s in range(100) @@ -133,7 +132,9 @@ def representative_data_gen(): output_details = interpreter.get_output_details() tensor_details = interpreter.get_tensor_details() -input_matrix_1 = np.random.randint(0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8) +input_matrix_1 = np.random.randint( + 0, 255, (1, input_height, input_width, input_channels), dtype=np.uint8 +) interpreter.set_tensor(input_details[0]["index"], input_matrix_1) @@ -171,7 +172,7 @@ def representative_data_gen(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -194,9 +195,7 @@ def representative_data_gen(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "maxpool2d_example" -} +project_options = {"project_type": "maxpool2d_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -208,4 +207,4 @@ def representative_data_gen(): # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -generated_project.flash() \ No newline at end of file +generated_project.flash() diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py index 5d3a5009b67e..fdb43096c87d 100644 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py +++ b/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py @@ -37,7 +37,7 @@ # -------------------------------- # # After the installation of the Chipyard development tools, you should have an env.sh file in your Chipyard home directory. This file needs to be sourced before running this tutorial: -# +# # .. 
code-block:: bash # # source /env.sh @@ -50,6 +50,7 @@ # # This functions will help us generate the MobileNet model + def get_real_image(im_height, im_width): from PIL import Image @@ -62,6 +63,7 @@ def get_real_image(im_height, im_width): data = np.reshape(x, (1, im_height, im_width, 3)) return data + def run_tflite_model(tflite_model_buf, input_data): """Generic function to execute TFLite""" try: @@ -92,6 +94,7 @@ def run_tflite_model(tflite_model_buf, input_data): return tflite_output + def download_model(): model_url = ( "https://storage.googleapis.com/download.tensorflow.org/models/" @@ -159,6 +162,7 @@ def generate_mobilenet_tflite_model(): extract(model_path) return create_tflite_model(model_dir) + ################################## # Baseline generation # -------------------------------- @@ -223,7 +227,7 @@ def generate_mobilenet_tflite_model(): TARGET = tvm.target.target.Target({"kind": "c", "device": "gemmini"}) EXECUTOR = tvm.relay.backend.Executor("aot", options={"interface-api": "c", "unpacked-api": 1}) -with gemmini.build_config(usmp_alg="hill_climb",opt_level=3, disabled_pass=["AlterOpLayout"]): +with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) ################################## @@ -246,9 +250,7 @@ def generate_mobilenet_tflite_model(): # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = { - "project_type": "mobilenet_example" -} +project_options = {"project_type": "mobilenet_example"} generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") generated_project = tvm.micro.generate_project( @@ -259,4 +261,4 @@ def generate_mobilenet_tflite_model(): generated_project.build() # Finally, we execute the compiled baremetal project on the Spike simulator. -generated_project.flash() \ No newline at end of file +generated_project.flash() From ec4f5856cd93761da4a4917e926dbc182d02bf38 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Dec 2022 02:36:16 -0800 Subject: [PATCH 025/286] [ci] Fix docs deploy (#13570) The PR #13300 had a bad merge with main and undid a fix from #13442; this adds it back in.
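The restored fix narrows the docs artifact download to the single tarball the deploy step needs. Illustratively (using the same variables as the Jenkinsfiles below), the step becomes:

    # Download only docs.tgz instead of the whole ${s3_prefix}/docs prefix.
    ./${jenkins_scripts_root}/s3.py --action download \
        --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz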
--- .asf.yaml | 1 + .../generated/docker_jenkinsfile.groovy | 238 +++++++++--------- ci/jenkins/generated/gpu_jenkinsfile.groovy | 9 +- .../templates/docker_jenkinsfile.groovy.j2 | 42 ++-- .../templates/gpu_jenkinsfile.groovy.j2 | 5 +- 5 files changed, 153 insertions(+), 142 deletions(-) diff --git a/.asf.yaml b/.asf.yaml index 34d191e376bb..047a573e05c3 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -60,6 +60,7 @@ github: - arm/pr-head - cortexm/pr-head - cpu/pr-head + - docker/pr-head - gpu/pr-head - hexagon/pr-head - i386/pr-head diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy index 28e81efb7bf0..246b40c6e9c0 100644 --- a/ci/jenkins/generated/docker_jenkinsfile.groovy +++ b/ci/jenkins/generated/docker_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:41.987490 +// Generated at 2022-12-06T21:25:49.429894 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' @@ -619,121 +619,6 @@ def update_docker(ecr_image, hub_image) { ) } -stage('Docker Image Build') { - parallel( - 'ci_arm': { - node('ARM') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_arm = build_image('ci_arm') - built_ci_arm = build_image('ci_arm'); - } - } - }, - 'ci_cortexm': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_cortexm = build_image('ci_cortexm') - built_ci_cortexm = build_image('ci_cortexm'); - } - } - }, - 'ci_cpu': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_cpu = build_image('ci_cpu') - built_ci_cpu = build_image('ci_cpu'); - } - } - }, - 'ci_gpu': { - node('GPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_gpu = build_image('ci_gpu') - built_ci_gpu = build_image('ci_gpu'); - } - } - }, - 'ci_hexagon': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_hexagon = build_image('ci_hexagon') - built_ci_hexagon = build_image('ci_hexagon'); - } - } - }, - 'ci_i386': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_i386 = build_image('ci_i386') - built_ci_i386 = build_image('ci_i386'); - } - } - }, - 'ci_lint': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_lint = build_image('ci_lint') - built_ci_lint = build_image('ci_lint'); - } - } - }, - 'ci_minimal': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // 
ci_minimal = build_image('ci_minimal') - built_ci_minimal = build_image('ci_minimal'); - } - } - }, - 'ci_riscv': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_riscv = build_image('ci_riscv') - built_ci_riscv = build_image('ci_riscv'); - } - } - }, - 'ci_wasm': { - node('CPU') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // ci_wasm = build_image('ci_wasm') - built_ci_wasm = build_image('ci_wasm'); - } - } - }, - ) -} - def deploy() { stage('Deploy') { if (env.BRANCH_NAME == 'main') { @@ -957,4 +842,123 @@ def deploy() { } } -deploy() + + +if (rebuild_docker_images) { + stage('Docker Image Build') { + parallel( + 'ci_arm': { + node('ARM') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_arm = build_image('ci_arm') + built_ci_arm = build_image('ci_arm'); + } + } + }, + 'ci_cortexm': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_cortexm = build_image('ci_cortexm') + built_ci_cortexm = build_image('ci_cortexm'); + } + } + }, + 'ci_cpu': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_cpu = build_image('ci_cpu') + built_ci_cpu = build_image('ci_cpu'); + } + } + }, + 'ci_gpu': { + node('GPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_gpu = build_image('ci_gpu') + built_ci_gpu = build_image('ci_gpu'); + } + } + }, + 'ci_hexagon': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_hexagon = build_image('ci_hexagon') + built_ci_hexagon = build_image('ci_hexagon'); + } + } + }, + 'ci_i386': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_i386 = build_image('ci_i386') + built_ci_i386 = build_image('ci_i386'); + } + } + }, + 'ci_lint': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_lint = build_image('ci_lint') + built_ci_lint = build_image('ci_lint'); + } + } + }, + 'ci_minimal': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_minimal = build_image('ci_minimal') + built_ci_minimal = build_image('ci_minimal'); + } + } + }, + 'ci_riscv': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_riscv = build_image('ci_riscv') + built_ci_riscv = build_image('ci_riscv'); 
+ } + } + }, + 'ci_wasm': { + node('CPU') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // ci_wasm = build_image('ci_wasm') + built_ci_wasm = build_image('ci_wasm'); + } + } + }, + ) + } + + deploy() +} diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy index c226255e0e6e..ef357f0d7c0b 100644 --- a/ci/jenkins/generated/gpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.195581 +// Generated at 2022-12-06T20:30:23.035868 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' @@ -1274,10 +1274,9 @@ def deploy() { timeout(time: max_time, unit: 'MINUTES') { init_git() sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs", - label: 'Download artifacts from S3', - ) - + script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz", + label: 'Download docs folder from S3', + ) deploy_docs() } } diff --git a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 index 016a1c7bc8e9..db3e6159b82a 100644 --- a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 @@ -138,24 +138,6 @@ def update_docker(ecr_image, hub_image) { ) } -stage('Docker Image Build') { - parallel( - {% for image in images %} - '{{ image.name }}': { - node('{{ image.platform }}') { - timeout(time: max_time, unit: 'MINUTES') { - init_git() - // We're purposefully not setting the built image here since they - // are not yet being uploaded to tlcpack - // {{ image.name }} = build_image('{{ image.name }}') - built_{{ image.name }} = build_image('{{ image.name }}'); - } - } - }, - {% endfor %} - ) -} - def deploy() { stage('Deploy') { if (env.BRANCH_NAME == 'main') { @@ -236,4 +218,26 @@ def deploy() { } } -deploy() + + +if (rebuild_docker_images) { + stage('Docker Image Build') { + parallel( + {% for image in images %} + '{{ image.name }}': { + node('{{ image.platform }}') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + // We're purposefully not setting the built image here since they + // are not yet being uploaded to tlcpack + // {{ image.name }} = build_image('{{ image.name }}') + built_{{ image.name }} = build_image('{{ image.name }}'); + } + } + }, + {% endfor %} + ) + } + + deploy() +} diff --git a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 index 4a11a1bc427a..2a9e7236d26d 100644 --- a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 @@ -195,7 +195,10 @@ def deploy() { ws="tvm/deploy-docs", ) %} init_git() - {{ m.download_artifacts(tag='docs') }} + sh( + script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/docs --items docs.tgz", + label: 'Download docs folder from S3', + ) deploy_docs() {% endcall %} ) From aaa80e133dc125b612e597b3762d9019e8f014b0 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Dec 2022 02:38:00 -0800 Subject: [PATCH 
026/286] [ci] Make tvm-bot aware of platform-specific jobs (#13571) See #13337 for more context; this fixes `@tvm-bot rerun` to work with the new jobs. --- ci/scripts/github/github_tvmbot.py | 28 ++++++++++++++++++++++------ ci/scripts/jenkins/git_utils.py | 2 +- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/ci/scripts/github/github_tvmbot.py b/ci/scripts/github/github_tvmbot.py index 8dc897367e57..d8dfcdb5b312 100755 --- a/ci/scripts/github/github_tvmbot.py +++ b/ci/scripts/github/github_tvmbot.py @@ -530,12 +530,26 @@ def merge_if_passed_checks(self) -> Optional[Dict[str, Any]]: return None def rerun_jenkins_ci(self) -> None: - url = JENKINS_URL + f"job/tvm/job/PR-{self.number}/buildWithParameters" - logging.info(f"Rerunning ci with URL={url}") - if self.dry_run: - logging.info("Dry run, not sending POST") - else: - post(url, auth=("tvm-bot", TVM_BOT_JENKINS_TOKEN)) + job_names = [ + "tvm-arm", + "tvm-cortexm", + "tvm-cpu", + "tvm-docker", + "tvm-gpu", + "tvm-hexagon", + "tvm-i386", + "tvm-lint", + "tvm-minimal", + "tvm-riscv", + "tvm-wasm", + ] + for name in job_names: + url = JENKINS_URL + f"job/{name}/job/PR-{self.number}/buildWithParameters" + logging.info(f"Rerunning ci with URL={url}") + if self.dry_run: + logging.info("Dry run, not sending POST") + else: + post(url, auth=("tvm-bot", TVM_BOT_JENKINS_TOKEN)) def rerun_github_actions(self) -> None: workflow_ids = [] @@ -684,11 +698,13 @@ def run(pr: PR): try: pr.rerun_jenkins_ci() except Exception as e: + logging.exception(e) errors.append(e) try: pr.rerun_github_actions() except Exception as e: + logging.exception(e) errors.append(e) if len(errors) > 0: diff --git a/ci/scripts/jenkins/git_utils.py b/ci/scripts/jenkins/git_utils.py index 1295ff8e3c2c..cf6660051424 100644 --- a/ci/scripts/jenkins/git_utils.py +++ b/ci/scripts/jenkins/git_utils.py @@ -35,7 +35,7 @@ def compress_query(query: str) -> str: def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None): - logging.info(f"Requesting POST to", url, "with", body) + logging.info(f"Requesting POST to {url} with {body}") headers = {} req = request.Request(url, headers=headers, method="POST") if auth is not None: From 96fbbcaa037030c5fa4b148f373df0b0bec4d596 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Dec 2022 03:04:13 -0800 Subject: [PATCH 027/286] [ci] Add a workflow to update a nightly branch (#13564) This adds a `nightly` branch that is updated once per day with a specific TVM commit to enable third parties to synchronize on which TVM commit to test. A batch of commits is added once per day, and tests should be run on the `HEAD` of that batch. --- .github/workflows/update_nightly_branch.yml | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/update_nightly_branch.yml diff --git a/.github/workflows/update_nightly_branch.yml b/.github/workflows/update_nightly_branch.yml new file mode 100644 index 000000000000..2242577bdb58 --- /dev/null +++ b/.github/workflows/update_nightly_branch.yml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Update a branch for nightly test results +name: Update nightly branch + +on: + schedule: + # 9 PM PST + - cron: "0 5 * * *" + workflow_dispatch: + +concurrency: + group: update-nightly-branch + cancel-in-progress: true + +jobs: + update-nightly-branch: + if: github.repository == 'driazati/tvm' + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Update nightly branch + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -eux + git checkout -B nightly + git log -5 + git push origin --force nightly From a94e2430267f99e9c2603e7398da2995b27e62f3 Mon Sep 17 00:00:00 2001 From: fPecc Date: Wed, 7 Dec 2022 12:11:00 +0100 Subject: [PATCH 028/286] Added macro generation in MLF export (#12789) The generated MLF header files for each module contain the struct definitions used to pass the inputs and outputs to the generated function. To call the generated tvmgen_default_run function, we need to allocate space (statically or dynamically) for the input and output tensors. This commit generates macros that define the size of each input and output in bytes, which allows us to reference these new macros when statically or dynamically allocating the buffers that store the inputs and outputs of the tvmgen_default_run function. Co-authored-by: Federico Peccia Co-authored-by: Christopher Sidebottom
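For illustration, a minimal sketch of how the generated macros can be used from application code. The module and tensor names here ("default", "input", "output") are hypothetical; the macro and struct naming follows the interface_c changes below.

    #include <stdint.h>
    #include "tvmgen_default.h"

    /* Buffers statically sized with the generated macros
       (hypothetical tensor names). */
    static int8_t input_data[TVMGEN_DEFAULT_INPUT_SIZE];
    static int8_t output_data[TVMGEN_DEFAULT_OUTPUT_SIZE];

    int run_inference(void) {
      struct tvmgen_default_inputs inputs = {.input = input_data};
      struct tvmgen_default_outputs outputs = {.output = output_data};
      return tvmgen_default_run(&inputs, &outputs);
    }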
The main_func describes how much memory is required @@ -277,6 +301,26 @@ def _create_empty_entry(target_device_type): main_func_metadata.io_sizes[target] ) + # Now, we also add the information about the size of each input and output of the main + # function (in bytes) + input_dict = {} + for input_param in main_func_metadata.relay_primfuncs[target].params: + input_dict[input_param.name_hint] = _create_type_metadata(input_param.checked_type) + target_main_entries[int(target.get_target_device_type())]["inputs"] = input_dict + + output_dict = {} + # For output, we dont have the name of the output, so we enumerate them + if isinstance(main_func_metadata.relay_primfuncs[target].ret_type, tvm.ir.type.TupleType): + output_list = _convert_tuple_to_outputs( + main_func_metadata.relay_primfuncs[target].ret_type + ) + for i, output_type in enumerate(output_list): + output_dict[f"output{i}"] = _create_type_metadata(output_type) + else: + output_type = main_func_metadata.relay_primfuncs[target].ret_type + output_dict["output"] = _create_type_metadata(output_type) + target_main_entries[int(target.get_target_device_type())]["outputs"] = output_dict + ret = { "operator_functions": func_entries, "main": list(target_main_entries.values()), @@ -298,7 +342,7 @@ def _convert_tuple_to_outputs(ret_type, offset=0): if isinstance(ret_type.fields[output_index], TupleType): outputs.extend(_convert_tuple_to_outputs(ret_type.fields[output_index], next_output)) else: - outputs.append(f"output{next_output}") + outputs.append(ret_type.fields[output_index]) return outputs @@ -427,6 +471,20 @@ def _export_graph_model_library_format( "workspace_size_bytes" ] ) + inputs_sizes = metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][ + "inputs" + ] + # Here, we merge the output sizes with the actual output names + output_sizes = {} + for i, key in enumerate( + metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][ + "outputs" + ].keys() + ): + output_sizes[outputs[i]] = metadata["modules"][mod.libmod_name]["memory"][ + "functions" + ]["main"][0]["outputs"][key] + generate_c_interface_header( mod.libmod_name, inputs, @@ -436,6 +494,8 @@ def _export_graph_model_library_format( devices, workspace_size, include_path, + inputs_sizes, + output_sizes, ) is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule) @@ -459,7 +519,7 @@ class NonStaticShapeError(Exception): def _shape_to_size(shape, dtype): bits_per_item = int( - re.match(r"((float)|(int))(?P[0-9]+)", dtype).group("width_bits") + re.match(r"((float)|(int)|(uint))(?P[0-9]+)", dtype).group("width_bits") ) assert bits_per_item is not None, f"don't know how to compute size of type {dtype}" total_bits = bits_per_item diff --git a/src/target/source/interface_c.cc b/src/target/source/interface_c.cc index ed7058f1f198..fe495b212ad8 100644 --- a/src/target/source/interface_c.cc +++ b/src/target/source/interface_c.cc @@ -47,20 +47,42 @@ class InterfaceCNode : public runtime::ModuleNode { InterfaceCNode(std::string module_name, Array inputs, Array outputs, Array pools, Map io_pool_allocations, Array devices, - int workspace_size) + int workspace_size, Map input_sizes, + Map output_sizes) : module_name_(module_name), inputs_(inputs), outputs_(outputs), devices_(devices), pools_(FilterExternalPools(pools)), io_pool_allocations_(io_pool_allocations), - workspace_size_(workspace_size) {} + workspace_size_(workspace_size), + input_sizes_(input_sizes), + output_sizes_(output_sizes) {} const char* type_key() const final { return "h"; } std::string 
GetSource(const std::string& format) final { std::stringstream code; EmitUpperHeaderGuard(code); + + // Emit macros for input sizes + for (auto const& it : input_sizes_) { + std::string input_name = SanitizeName(it.first); + std::string input_macro_name = input_name + "_size"; + int input_size = it.second->value; + EmitIntegerValueMacro(code, "Input tensor " + input_name + " size (in bytes)", + input_macro_name, input_size); + } + + // Emit macros for output sizes + for (auto const& it : output_sizes_) { + std::string output_name = SanitizeName(it.first); + std::string output_macro_name = output_name + "_size"; + int output_size = it.second->value; + EmitIntegerValueMacro(code, "Output tensor " + output_name + " size (in bytes)", + output_macro_name, output_size); + } + EmitBrief(code, "Input tensor pointers"); EmitStruct(code, "inputs", inputs_); EmitBrief(code, "Output tensor pointers"); @@ -278,14 +300,18 @@ class InterfaceCNode : public runtime::ModuleNode { Array pools_; Map io_pool_allocations_; int workspace_size_; + Map input_sizes_; + Map output_sizes_; }; runtime::Module InterfaceCCreate(std::string module_name, Array inputs, Array outputs, Array pools, Map io_pool_allocations, - Array devices, int workspace_size) { + Array devices, int workspace_size, + Map input_sizes, + Map output_sizes) { auto n = make_object(module_name, inputs, outputs, pools, io_pool_allocations, - devices, workspace_size); + devices, workspace_size, input_sizes, output_sizes); return runtime::Module(n); } diff --git a/tests/cpp/target/source/interface_c_test.cc b/tests/cpp/target/source/interface_c_test.cc index d575bfeaf0c7..d9d9d80bbe31 100644 --- a/tests/cpp/target/source/interface_c_test.cc +++ b/tests/cpp/target/source/interface_c_test.cc @@ -33,7 +33,8 @@ namespace codegen { runtime::Module InterfaceCCreate(std::string module_name, Array inputs, Array outputs, Array pools, Map io_pool_allocations, - Array devices, int workspace_size); + Array devices, int workspace_size, + Map input_sizes, Map output_sizes); namespace { @@ -53,8 +54,13 @@ TEST(InterfaceAPI, ContainsHeaderGuards) { << "#endif\n\n" << "#endif // TVMGEN_ULTIMATE_CAT_SPOTTER_H_\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(upper_header_guard.str())); @@ -74,8 +80,13 @@ TEST(InterfaceAPI, ContainsRunFunction) { << " struct tvmgen_ultimate_cat_spotter_outputs* outputs\n" << ");\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(run_function.str())); } @@ -95,8 +106,13 @@ TEST(InterfaceAPI, ContainsRunFunctionWithDevices) { << " struct tvmgen_ultimate_cat_spotter_devices* devices\n" << ");\n"; - runtime::Module test_module = - 
InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {"device"}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {"device"}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(run_function.str())); @@ -117,11 +133,17 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePools) { << " struct tvmgen_ultimate_cat_spotter_workspace_pools* workspace_pools\n" << ");\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + PoolInfo pool_info = WorkspacePoolInfo("my_memory_pool", {}); tir::usmp::AllocatedPoolInfo allocated_pool_info = tir::usmp::AllocatedPoolInfo(pool_info, 100000); - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info}, {}, {}, 0); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, {}, {}, + 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(run_function.str())); @@ -142,6 +164,11 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspaceAndConstantPools) { << " struct tvmgen_ultimate_cat_spotter_workspace_pools* workspace_pools\n" << ");\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + PoolInfo pool_info = WorkspacePoolInfo("my_memory_pool", {}); PoolInfo const_info = ConstantPoolInfo( "my_constant_pool", {}, @@ -151,9 +178,9 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspaceAndConstantPools) { tir::usmp::AllocatedPoolInfo(pool_info, 100000); tir::usmp::AllocatedPoolInfo allocated_const_info = tir::usmp::AllocatedPoolInfo(const_info, 100000); - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info, allocated_const_info}, {}, {}, 0); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, + {allocated_pool_info, allocated_const_info}, {}, + {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(run_function.str())); ASSERT_THAT( @@ -186,11 +213,17 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePoolsAndDevices) { << " struct tvmgen_ultimate_cat_spotter_devices* devices\n" << ");\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + PoolInfo pool_info = WorkspacePoolInfo("my_memory_pool", {}); tir::usmp::AllocatedPoolInfo allocated_pool_info = tir::usmp::AllocatedPoolInfo(pool_info, 100000); - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info}, {}, {"device"}, 0); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, {}, + {"device"}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(run_function.str())); @@ -226,14 +259,20 @@ TEST(InterfaceAPI, 
ContainsRunFunctionWithWorkspaceIO) { << " struct tvmgen_ultimate_cat_spotter_workspace_pools* workspace_pools\n" << ");\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + PoolInfo pool_info = WorkspacePoolInfo("my_memory_pool", {}); tir::usmp::AllocatedPoolInfo allocated_pool_info = tir::usmp::AllocatedPoolInfo(pool_info, 100000); tir::usmp::PoolAllocation pool_allocation_input{pool_info, 1000}; tir::usmp::PoolAllocation pool_allocation_output{pool_info, 2000}; - runtime::Module test_module = InterfaceCCreate( - "ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, - {{"input", pool_allocation_input}, {"output", pool_allocation_output}}, {}, 0); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, + {{"input", pool_allocation_input}, {"output", pool_allocation_output}}, {}, + 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); std::cout << header_source << "\n"; ASSERT_THAT(header_source, HasSubstr(run_function_with_map_functions.str())); @@ -241,6 +280,13 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspaceIO) { TEST(InterfaceAPI, ContainsInputStructSingle) { std::stringstream input_struct; + std::stringstream input_size_macro; + + input_size_macro + << "/*!\n" + << " * \\brief Input tensor input size (in bytes) for TVM module \"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_INPUT_SIZE 537\n"; input_struct << "/*!\n" << " * \\brief Input tensor pointers for TVM module \"ultimate_cat_spotter\" \n" @@ -249,51 +295,120 @@ TEST(InterfaceAPI, ContainsInputStructSingle) { << " void* input;\n" << "};\n\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 537)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(input_struct.str())); + + ASSERT_THAT(header_source, HasSubstr(input_size_macro.str())); } TEST(InterfaceAPI, ContainsInputStructMany) { std::stringstream input_struct; + std::stringstream input1_size_macro; + std::stringstream input2_size_macro; + + input1_size_macro + << "/*!\n" + << " * \\brief Input tensor input1 size (in bytes) for TVM module \"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_INPUT1_SIZE 765\n"; + + input2_size_macro + << "/*!\n" + << " * \\brief Input tensor input2 size (in bytes) for TVM module \"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_INPUT2_SIZE 127\n"; input_struct << "struct tvmgen_ultimate_cat_spotter_inputs {\n" << " void* input1;\n" << " void* input2;\n" << "};\n\n"; + Map input_sizes; + input_sizes.Set("input1", IntImm(DataType::Int(32), 765)); + input_sizes.Set("input2", IntImm(DataType::Int(32), 127)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input1", "input2"}, {"output"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input1", "input2"}, {"output"}, {}, {}, {}, 0, + input_sizes, output_sizes); 
std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(input_struct.str())); + ASSERT_THAT(header_source, HasSubstr(input1_size_macro.str())); + ASSERT_THAT(header_source, HasSubstr(input2_size_macro.str())); } TEST(InterfaceAPI, ContainsInputStructSanitised) { std::stringstream input_struct; + std::stringstream input1_size_macro; + std::stringstream input2_size_macro; + + input1_size_macro << "/*!\n" + << " * \\brief Input tensor input_1 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_INPUT_1_SIZE 765\n"; + + input2_size_macro << "/*!\n" + << " * \\brief Input tensor input_2 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_INPUT_2_SIZE 127\n"; input_struct << "struct tvmgen_ultimate_cat_spotter_inputs {\n" << " void* input_1;\n" << " void* input_2;\n" << "};\n\n"; + Map input_sizes; + input_sizes.Set("input+1", IntImm(DataType::Int(32), 765)); + input_sizes.Set("input+2", IntImm(DataType::Int(32), 127)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input+1", "input+2"}, {"output"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input+1", "input+2"}, {"output"}, {}, {}, {}, 0, + input_sizes, output_sizes); std::string header_source = test_module->GetSource(); + std::cout << header_source << std::endl; + ASSERT_THAT(header_source, HasSubstr(input_struct.str())); + ASSERT_THAT(header_source, HasSubstr(input1_size_macro.str())); + ASSERT_THAT(header_source, HasSubstr(input2_size_macro.str())); } TEST(InterfaceAPI, ContainsInputStructClash) { + Map input_sizes; + input_sizes.Set("input+", IntImm(DataType::Int(32), 0)); + input_sizes.Set("input-", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input+", "input-"}, {"output"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input+", "input-"}, {"output"}, {}, {}, {}, 0, + input_sizes, output_sizes); ASSERT_THROW(test_module->GetSource(), InternalError); } TEST(InterfaceAPI, ContainsOutputStructSingle) { std::stringstream output_struct; + std::stringstream output_size_macro; + + output_size_macro << "/*!\n" + << " * \\brief Output tensor output size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_OUTPUT_SIZE 543\n"; output_struct << "/*!\n" << " * \\brief Output tensor pointers for TVM module \"ultimate_cat_spotter\" \n" @@ -302,46 +417,104 @@ TEST(InterfaceAPI, ContainsOutputStructSingle) { << " void* output;\n" << "};\n\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 543)); + + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(output_struct.str())); + ASSERT_THAT(header_source, HasSubstr(output_size_macro.str())); } TEST(InterfaceAPI, ContainsOutputStructMany) { std::stringstream output_struct; + std::stringstream output1_size_macro; + 
std::stringstream output2_size_macro; + + output1_size_macro << "/*!\n" + << " * \\brief Output tensor output1 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_OUTPUT1_SIZE 345\n"; + + output2_size_macro << "/*!\n" + << " * \\brief Output tensor output2 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_OUTPUT2_SIZE 984\n"; output_struct << "struct tvmgen_ultimate_cat_spotter_outputs {\n" << " void* output1;\n" << " void* output2;\n" << "};\n\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output1", IntImm(DataType::Int(32), 345)); + output_sizes.Set("output2", IntImm(DataType::Int(32), 984)); + runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output1", "output2"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output1", "output2"}, {}, {}, {}, 0, + input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(output_struct.str())); + ASSERT_THAT(header_source, HasSubstr(output1_size_macro.str())); + ASSERT_THAT(header_source, HasSubstr(output2_size_macro.str())); } TEST(InterfaceAPI, ContainsOutputStructSanitised) { std::stringstream output_struct; + std::stringstream output1_size_macro; + std::stringstream output2_size_macro; + + output1_size_macro << "/*!\n" + << " * \\brief Output tensor output_1 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_OUTPUT_1_SIZE 345\n"; + + output2_size_macro << "/*!\n" + << " * \\brief Output tensor output_2 size (in bytes) for TVM module " + "\"ultimate_cat_spotter\" \n" + << " */\n" + << "#define TVMGEN_ULTIMATE_CAT_SPOTTER_OUTPUT_2_SIZE 984\n"; output_struct << "struct tvmgen_ultimate_cat_spotter_outputs {\n" << " void* output_1;\n" << " void* output_2;\n" << "};\n\n"; + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output+1", IntImm(DataType::Int(32), 345)); + output_sizes.Set("output-2", IntImm(DataType::Int(32), 984)); + runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output+1", "output-2"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output+1", "output-2"}, {}, {}, {}, 0, + input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(output_struct.str())); + ASSERT_THAT(header_source, HasSubstr(output1_size_macro.str())); + ASSERT_THAT(header_source, HasSubstr(output2_size_macro.str())); } TEST(InterfaceAPI, ContainsOutputStructClash) { + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output+", IntImm(DataType::Int(32), 0)); + output_sizes.Set("output-", IntImm(DataType::Int(32), 0)); runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output+", "output-"}, {}, {}, {}, 0); + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output+", "output-"}, {}, {}, {}, 0, + input_sizes, output_sizes); ASSERT_THROW(test_module->GetSource(), InternalError); } @@ -354,8 +527,12 @@ TEST(InterfaceAPI, NoDeviceAPIStructIfNoDevices) { << "struct tvmgen_ultimate_cat_spotter_devices {\n" << "};\n\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", 
{"input"}, {"output"}, {}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, Not(HasSubstr(device_struct.str()))); @@ -371,8 +548,12 @@ TEST(InterfaceAPI, ContainsDeviceStructSingle) { << " void* device;\n" << "};\n\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {"device"}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {"device"}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(device_struct.str())); @@ -386,8 +567,13 @@ TEST(InterfaceAPI, ContainsDeviceStructMany) { << " void* device2;\n" << "};\n\n"; - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, - {}, {"device1", "device2"}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, + {"device1", "device2"}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(device_struct.str())); @@ -401,22 +587,36 @@ TEST(InterfaceAPI, ContainsDeviceStructSanitised) { << " void* device_2;\n" << "};\n\n"; - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, - {}, {"device+1", "device+2"}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, + {"device+1", "device+2"}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(device_struct.str())); } TEST(InterfaceAPI, ContainsDeviceStructClash) { - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, - {}, {"device+", "device-"}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, + {"device+", "device-"}, 0, input_sizes, output_sizes); ASSERT_THROW(test_module->GetSource(), InternalError); } TEST(InterfaceAPI, ContainsWorkspaceSize) { - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, {}, {}, 765432); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {}, + {}, {}, 765432, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); 
ASSERT_THAT(header_source, @@ -441,8 +641,13 @@ TEST(InterfaceAPI, ContainsWorkspacePoolStructSingle) { << " void* my_memory_pool;\n" << "};\n\n"; - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, {}, {}, + 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(workspace_struct.str())); @@ -474,9 +679,13 @@ TEST(InterfaceAPI, ContainsWorkspacePoolStructMany) { << " void* my_memory_pool_2;\n" << "};\n\n"; - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info1, allocated_pool_info2}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, + {allocated_pool_info1, allocated_pool_info2}, {}, + {}, 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(workspace_struct.str())); @@ -511,8 +720,13 @@ TEST(InterfaceAPI, ContainsWorkspacePoolStructSanitized) { << " void* my_memory_pool_1;\n" << "};\n\n"; - runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = + InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, {allocated_pool_info}, {}, {}, + 0, input_sizes, output_sizes); std::string header_source = test_module->GetSource(); ASSERT_THAT(header_source, HasSubstr(workspace_struct.str())); @@ -533,9 +747,13 @@ TEST(InterfaceAPI, ContainsWorkspacePoolStructClash) { tir::usmp::AllocatedPoolInfo allocated_pool_info2 = tir::usmp::AllocatedPoolInfo(pool_info2, 200000); - runtime::Module test_module = - InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, - {allocated_pool_info1, allocated_pool_info2}, {}, {}, 0); + Map input_sizes; + input_sizes.Set("input", IntImm(DataType::Int(32), 0)); + Map output_sizes; + output_sizes.Set("output", IntImm(DataType::Int(32), 0)); + runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"}, + {allocated_pool_info1, allocated_pool_info2}, {}, + {}, 0, input_sizes, output_sizes); ASSERT_THROW(test_module->GetSource(), InternalError); } diff --git a/tests/micro/zephyr/utils.py b/tests/micro/zephyr/utils.py index 05b209094420..42419b637fa4 100644 --- a/tests/micro/zephyr/utils.py +++ b/tests/micro/zephyr/utils.py @@ -218,7 +218,16 @@ def generate_project( model_files_path, arcname=os.path.relpath(model_files_path, tar_temp_dir) ) header_path = generate_c_interface_header( - lowered.libmod_name, ["input_1"], ["Identity"], [], {}, [], 0, model_files_path + lowered.libmod_name, + ["input_1"], + ["Identity"], + [], + {}, + [], + 0, + model_files_path, + {}, + {}, ) tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) diff --git 
a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index 9b957e617a13..7ccaf72b1baf 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -208,7 +208,12 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ { "constants_size_bytes": json_constants_size_bytes, "device": 1, + "inputs": { + "a": {"dtype": "uint8", "size": 2}, + "b": {"dtype": "float32", "size": 8}, + }, "io_size_bytes": 18, + "outputs": {"output": {"dtype": "float32", "size": 8}}, "workspace_size_bytes": 0, } ] @@ -295,7 +300,12 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ { "constants_size_bytes": 8, "device": 1, + "inputs": { + "a": {"dtype": "uint8", "size": 2}, + "b": {"dtype": "float32", "size": 8}, + }, "io_size_bytes": 18, + "outputs": {"output": {"dtype": "float32", "size": 8}}, "workspace_size_bytes": 0, } ] @@ -373,7 +383,13 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 { "constants_size_bytes": 0, "device": 1, + "inputs": { + "p0": {"dtype": "int16", "size": 802816}, + "p1": {"dtype": "int16", "size": 2304}, + "p2": {"dtype": "int32", "size": 512}, + }, "io_size_bytes": 1207040, + "outputs": {"output": {"dtype": "uint8", "size": 401408}}, "workspace_size_bytes": 2466816, } ] @@ -454,24 +470,26 @@ def test_export_byoc_c_module(): with tf.extractfile("./metadata.json") as f: metadata = json.load(f) main_md = metadata["modules"][factory.libmod_name]["memory"]["functions"]["main"] - if platform.architecture()[0] == "64bit": - assert main_md == [ - { - "constants_size_bytes": 0, - "device": 1, - "io_size_bytes": 4800, - "workspace_size_bytes": 1200, - } - ] - else: - assert main_md == [ - { - "constants_size_bytes": 0, - "device": 1, - "io_size_bytes": 4800, - "workspace_size_bytes": 1200, - } - ] + assert main_md == [ + { + "constants_size_bytes": 0, + "device": 1, + "inputs": { + "w0": {"dtype": "float32", "size": 400}, + "w1": {"dtype": "float32", "size": 400}, + "w2": {"dtype": "float32", "size": 400}, + "w3": {"dtype": "float32", "size": 400}, + "w4": {"dtype": "float32", "size": 400}, + "w5": {"dtype": "float32", "size": 400}, + "w6": {"dtype": "float32", "size": 400}, + "w7": {"dtype": "float32", "size": 400}, + "x": {"dtype": "float32", "size": 400}, + }, + "io_size_bytes": 4800, + "outputs": {"output": {"dtype": "float32", "size": 1200}}, + "workspace_size_bytes": 1200, + } + ] @tvm.testing.requires_micro @@ -523,7 +541,12 @@ def test_multiple_relay_modules_graph(): { "constants_size_bytes": 0, "device": 1, + "inputs": { + "data": {"dtype": "int8", "size": 12288}, + "weight": {"dtype": "int8", "size": 600}, + }, "io_size_bytes": 143960, + "outputs": {"output": {"dtype": "int32", "size": 131072}}, "workspace_size_bytes": 158088, } ] From 5f90760f55175f8925464c9a23996166f8874d27 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Wed, 7 Dec 2022 11:49:19 +0000 Subject: [PATCH 029/286] [CodegenC] Explicit forward function declarations (#13522) armclang 6.19 emits an error for implicit function declarations. This commit adds support for generating forward function declarations in the C file generated for __tvm_main__. All the non-pure extern functions called from __tvm_main__ will be declared explicitly in this file. 
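Illustratively, the emitted forward declarations look like the following sketch; the operator name and signature here are hypothetical, since the real declarations mirror whichever operator functions the compiled model actually calls:

    /* Hypothetical example of an emitted forward declaration. Without it, a
       call such as tvmgen_default_fused_nn_dense(...) inside __tvm_main__
       would be an implicit function declaration, which armclang 6.19
       rejects. */
    int32_t tvmgen_default_fused_nn_dense(void* input, void* weight, void* output);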
--- python/tvm/relay/testing/tflite.py | 10 + .../backend/contrib/cmsisnn/tir_to_runtime.cc | 7 +- .../example_target_hooks/tir_to_runtime.cc | 3 +- .../backend/contrib/uma/tir_to_runtime.cc | 7 +- src/target/source/codegen_c.cc | 5 +- src/target/source/codegen_c.h | 13 +- src/target/source/codegen_c_host.cc | 57 +++- src/target/source/codegen_c_host.h | 11 +- src/target/source/codegen_cuda.cc | 2 +- src/target/source/codegen_cuda.h | 2 +- src/target/source/codegen_opencl.cc | 2 +- src/target/source/codegen_opencl.h | 2 +- src/target/source/codegen_source_base.h | 2 + src/target/source/codegen_vhls.cc | 2 +- src/target/source/codegen_vhls.h | 2 +- tests/python/relay/aot/corstone300.mk | 2 +- .../aot/test_crt_forward_declarations.py | 275 ++++++++++++++++++ 17 files changed, 372 insertions(+), 32 deletions(-) create mode 100644 tests/python/relay/aot/test_crt_forward_declarations.py diff --git a/python/tvm/relay/testing/tflite.py b/python/tvm/relay/testing/tflite.py index df40130cebaf..c45b76c77369 100644 --- a/python/tvm/relay/testing/tflite.py +++ b/python/tvm/relay/testing/tflite.py @@ -61,6 +61,16 @@ def conv2d_single_function(ifm_tensor): return conv2d_single_function + def load_from_file(self, model_file, shapes): + """Load tflite model from a tflite file""" + for i, shape in enumerate(shapes): + input_name = "input_" + str(i) + self.shape_dict.update({input_name: shape}) + self.dtype_dict.update({input_name: self.dtype}) + + with open(model_file, "rb") as f: + self.serial_model = f.read() + def create_tflite_model(self, tfl_function, shapes, ranges=None): """Creates TFLite serial graph""" tensor_specs = [] diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc index 420e8618a4f9..1d53373ba833 100644 --- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc +++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc @@ -35,10 +35,10 @@ namespace cmsisnn { class CodeGenCMSISNN : public codegen::CodeGenCHost { public: - void Init(bool output_ssa, bool emit_asserts, std::string target_str) { + void Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_decl, std::string target_str) { std::unordered_set devices; devices.insert("cmsis-nn"); - CodeGenCHost::Init(output_ssa, emit_asserts, target_str, devices); + CodeGenCHost::Init(output_ssa, emit_asserts, emit_fwd_func_decl, target_str, devices); } /*! 
@@ -491,9 +491,10 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost { runtime::Module TIRToRuntime(IRModule mod, Target target) { bool output_ssa = false; bool emit_asserts = false; + bool emit_fwd_func_decl = false; CodeGenCMSISNN codegen; Array function_names; - codegen.Init(output_ssa, emit_asserts, target->str()); + codegen.Init(output_ssa, emit_asserts, emit_fwd_func_decl, target->str()); std::vector> funcs; for (auto kv : mod->functions) { diff --git a/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc b/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc index 9ad434b88c60..0db8d06c3143 100644 --- a/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc +++ b/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc @@ -47,10 +47,11 @@ class CodeGenExampleTargetHook : public codegen::CodeGenCHost { runtime::Module TIRToRuntime(IRModule mod, Target target) { bool output_ssa = false; bool emit_asserts = false; + bool emit_fwd_func_decl = false; CodeGenExampleTargetHook codegen; Array function_names; std::unordered_set devices; - codegen.Init(output_ssa, emit_asserts, target->str(), devices); + codegen.Init(output_ssa, emit_asserts, emit_fwd_func_decl, target->str(), devices); for (auto kv : mod->functions) { auto prim_func = Downcast(kv.second); auto global_symbol = prim_func->GetAttr(tvm::attr::kGlobalSymbol); diff --git a/src/relay/backend/contrib/uma/tir_to_runtime.cc b/src/relay/backend/contrib/uma/tir_to_runtime.cc index 4b5cd4332476..3b58fda54b52 100644 --- a/src/relay/backend/contrib/uma/tir_to_runtime.cc +++ b/src/relay/backend/contrib/uma/tir_to_runtime.cc @@ -37,7 +37,7 @@ class UMACodegen : public codegen::CodeGenCHost { public: explicit UMACodegen(String target_str) : target_str_(target_str) {} - void Init(bool output_ssa, bool emit_asserts) { + void Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_decl) { auto includes_pf = tvm::runtime::Registry::Get("relay.ext.uma.codegen_c_includes_" + target_str_); if (includes_pf) { @@ -46,7 +46,7 @@ class UMACodegen : public codegen::CodeGenCHost { } std::unordered_set devices; devices.insert(target_str_); - CodeGenCHost::Init(output_ssa, emit_asserts, target_str_, devices); + CodeGenCHost::Init(output_ssa, emit_asserts, emit_fwd_func_decl, target_str_, devices); } /*! 
@@ -63,9 +63,10 @@ class UMACodegen : public codegen::CodeGenCHost {
 runtime::Module TIRToRuntime(IRModule mod, Target target) {
   bool output_ssa = false;
   bool emit_asserts = false;
+  bool emit_fwd_func_decl = false;
   UMACodegen codegen(target->kind->name);
   Array function_names;
-  codegen.Init(output_ssa, emit_asserts);
+  codegen.Init(output_ssa, emit_asserts, emit_fwd_func_decl);
   for (auto kv : mod->functions) {
     auto prim_func = Downcast(kv.second);
     auto global_symbol = prim_func->GetAttr(tvm::attr::kGlobalSymbol);
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 66c92181c126..6bf14424bf38 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -85,7 +85,7 @@ void CodeGenC::AddFunction(const PrimFunc& f) {
       << "CodeGenC: Expect PrimFunc to have the global_symbol attribute";
   bool no_alias = f->HasNonzeroAttr(tir::attr::kNoAlias);
-  this->PrintFuncPrefix();
+  this->PrintFuncPrefix(stream);
   this->PrintExtraAttrs(f);
   this->stream << " " << static_cast(global_symbol.value()) << "(";
@@ -127,7 +127,7 @@ void CodeGenC::AddFunction(const PrimFunc& f) {
   this->stream << "}\n\n";
 }
-void CodeGenC::PrintFuncPrefix() { stream << "void"; }
+void CodeGenC::PrintFuncPrefix(std::ostream& os) { os << "void"; }
 void CodeGenC::PrintExtraAttrs(const PrimFunc& f) {}
@@ -540,6 +540,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
     ICHECK_GE(op->args.size(), 1U);
     auto func = Downcast(op->args[0]);
     this->PrintCallExtern(GetType(GetRef(op)), func->value, op->args, true, os);
+    this->GenerateForwardFunctionDeclarations(func->value, op->args);
   } else if (op_attr_global_symbol_.count(call_op)) {
     // call extern if the op itself has a global symbol.
     this->PrintCallExtern(GetType(GetRef(op)), op_attr_global_symbol_[call_op],
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index 0af24dfdc066..be715ad3a049 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -75,7 +75,7 @@ class CodeGenC : public ExprFunctor,
   * \brief Finalize the compilation and return the code.
   * \return The code.
   */
-  std::string Finish();
+  virtual std::string Finish();
  /*!
   * \brief Print the Stmt n to CodeGenC->stream
   * \param n The statement to be printed.
@@ -99,10 +99,11 @@ class CodeGenC : public ExprFunctor,
  // The following parts are overloadable print operations.
  /*!
   * \brief Print the function header before the argument list
+  * \param os The output stream
   *
   * Example: stream << "void";
   */
-  virtual void PrintFuncPrefix();  // NOLINT(*)
+  virtual void PrintFuncPrefix(std::ostream& os);  // NOLINT(*)
  /*!
   * \brief Print extra function attributes
   *
@@ -230,6 +231,14 @@ class CodeGenC : public ExprFunctor,
   */
  virtual bool IsScopePartOfType() const { return true; }
+  /*!
+   * \brief Generate forward function declarations.
+   * \param global_symbol The symbol of the target function.
+   * \param args The arguments to the function.
+   */
+  virtual void GenerateForwardFunctionDeclarations(String global_symbol,
+                                                   const Array& args) {}
  /*!
   * \brief Print external function call.
   * \param ret_type The return type.
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 84fc9bb9dac9..78eb08202dfe 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -44,9 +44,10 @@ namespace codegen { CodeGenCHost::CodeGenCHost() { module_name_ = name_supply_->FreshName("__tvm_module_ctx"); } -void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_str, - const std::unordered_set& devices) { +void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_decl, + std::string target_str, const std::unordered_set& devices) { emit_asserts_ = emit_asserts; + emit_fwd_func_decl_ = emit_fwd_func_decl; declared_globals_.clear(); decl_stream << "// tvm target: " << target_str << "\n"; decl_stream << "#define TVM_EXPORTS\n"; @@ -76,17 +77,18 @@ void CodeGenCHost::InitGlobalContext() { void CodeGenCHost::DefineModuleName() { decl_stream << "void* " << module_name_ << " = NULL;\n"; } -void CodeGenCHost::AddFunction(const PrimFunc& f) { +void CodeGenCHost::AddFunction(const PrimFunc& f, bool emit_fwd_func_decl) { auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); ICHECK(global_symbol.defined()) << "CodeGenCHost: Expect PrimFunc to have the global_symbol attribute"; function_names_.push_back(global_symbol.value()); + emit_fwd_func_decl_ = emit_fwd_func_decl; CodeGenC::AddFunction(f); if (f->HasNonzeroAttr(tir::attr::kIsEntryFunc)) { function_names_.push_back(runtime::symbol::tvm_module_main); stream << "// CodegenC: NOTE: Auto-generated entry function\n"; - PrintFuncPrefix(); + PrintFuncPrefix(stream); stream << " " << tvm::runtime::symbol::tvm_module_main << "(void* args, int* arg_type_ids, int num_args, void* out_ret_value, " << "int* out_ret_tcode, void* resource_handle) {\n"; @@ -96,11 +98,33 @@ void CodeGenCHost::AddFunction(const PrimFunc& f) { } } -void CodeGenCHost::PrintFuncPrefix() { // NOLINT(*) - stream << "#ifdef __cplusplus\n" - << "extern \"C\"\n" - << "#endif\n" - << "TVM_DLL int32_t"; +void CodeGenCHost::GenerateForwardFunctionDeclarations(String global_symbol, + const Array& args) { + if (!emit_fwd_func_decl_) { + return; + } + for (auto& func_already_defined : GetFunctionNames()) { + if (global_symbol == func_already_defined) { + return; + } + } + this->PrintFuncPrefix(fwd_decl_stream); + fwd_decl_stream << " " << global_symbol << "("; + for (size_t i = 1; i < args.size(); ++i) { + CodeGenSourceBase::PrintType(GetType(args[i]), fwd_decl_stream); + fwd_decl_stream << " ", this->PrintExpr(args[i], fwd_decl_stream); + if (i < args.size() - 1) { + fwd_decl_stream << ", "; + } + } + fwd_decl_stream << ");\n"; +} + +void CodeGenCHost::PrintFuncPrefix(std::ostream& os) { // NOLINT(*) + os << "#ifdef __cplusplus\n" + << "extern \"C\"\n" + << "#endif\n" + << "TVM_DLL int32_t"; } void CodeGenCHost::PrintFinalReturn() { // NOLINT(*) @@ -108,6 +132,15 @@ void CodeGenCHost::PrintFinalReturn() { // NOLINT(*) stream << "return 0;\n"; } +std::string CodeGenCHost::Finish() { // NOLINT(*) + std::string ret = decl_stream.str(); + if (emit_fwd_func_decl_) { + ret += fwd_decl_stream.str(); + } + ret += stream.str(); + return ret; +} + void CodeGenCHost::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { @@ -394,6 +427,7 @@ runtime::Module BuildCHost(IRModule mod, Target target) { using tvm::runtime::Registry; bool output_ssa = false; bool emit_asserts = false; + bool emit_fwd_func_decl = true; std::unordered_set devices; if (mod->GetAttr>("device_contexts") != 
nullptr) { @@ -405,7 +439,7 @@ runtime::Module BuildCHost(IRModule mod, Target target) { } CodeGenCHost cg; - cg.Init(output_ssa, emit_asserts, target->str(), devices); + cg.Init(output_ssa, emit_asserts, emit_fwd_func_decl, target->str(), devices); cg.SetConstantsByteAlignment(target->GetAttr("constants-byte-alignment").value_or(16)); PrimFunc aot_executor_fn; @@ -441,7 +475,8 @@ runtime::Module BuildCHost(IRModule mod, Target target) { // Add __tvm_main__ if (aot_executor_fn.defined()) { - cg.AddFunction(aot_executor_fn); + emit_fwd_func_decl = true; + cg.AddFunction(aot_executor_fn, emit_fwd_func_decl); } // NOTE: it's possible that kRuntime attr is not attached when the mod was built with tvm.build(). diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index 84c27b91bac3..6bae574627d5 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -40,11 +40,12 @@ namespace codegen { class CodeGenCHost : public CodeGenC { public: CodeGenCHost(); - void Init(bool output_ssa, bool emit_asserts, std::string target_str, + void Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_decl, std::string target_str, const std::unordered_set& devices); void InitGlobalContext(); - void AddFunction(const PrimFunc& f); + void AddFunction(const PrimFunc& f, bool emit_fwd_func_decl = false); + std::string Finish() final; /*! * \brief Add functions from the (unordered) range to the current module in a deterministic * order. This helps with debugging. @@ -55,7 +56,7 @@ class CodeGenCHost : public CodeGenC { void DefineModuleName(); void PrintType(DataType t, std::ostream& os) final; // NOLINT(*) - void PrintFuncPrefix() final; // NOLINT(*) + void PrintFuncPrefix(std::ostream& os) final; // NOLINT(*) void PrintFinalReturn() final; // NOLINT(*) // overload visitor functions @@ -68,6 +69,8 @@ class CodeGenCHost : public CodeGenC { void VisitStmt_(const AssertStmtNode* op) final; // NOLINT(*) + virtual void GenerateForwardFunctionDeclarations(String global_symbol, + const Array& args); // NOLINT(*) Array GetFunctionNames() { return function_names_; } private: @@ -87,6 +90,8 @@ class CodeGenCHost : public CodeGenC { Array function_names_; /*! \brief whether to emit asserts in the resulting C code */ bool emit_asserts_; + /*! 
\brief whether to emit forward function declarations in the resulting C code */
+  bool emit_fwd_func_decl_;
   FunctionInfo GetFunctionInfo(const CallNode* op, bool has_resource_handle);
   std::string GetPackedName(const CallNode* op);
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
index 3ae74cc16da4..436e85247ffe 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -48,7 +48,7 @@ void CodeGenCUDA::Init(bool output_ssa) {
   ICHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state);
 }
-void CodeGenCUDA::PrintFuncPrefix() { stream << "extern \"C\" __global__ void"; }
+void CodeGenCUDA::PrintFuncPrefix(std::ostream& os) { os << "extern \"C\" __global__ void"; }
 class ThreadIdxExtractor : public tir::StmtVisitor {
  private:
diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h
index 673753c470ae..0fef15c7a7f3 100644
--- a/src/target/source/codegen_cuda.h
+++ b/src/target/source/codegen_cuda.h
@@ -45,7 +45,7 @@ class CodeGenCUDA final : public CodeGenC {
     return (enable_fp16_ || enable_bf16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_);
   }
   // override behavior
-  void PrintFuncPrefix() final;
+  void PrintFuncPrefix(std::ostream& os) final;
   void PrintExtraAttrs(const PrimFunc& f) final;
   void VisitStmt_(const ForNode* op) final;
   void PrintStorageSync(const CallNode* op) final;
diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc
index cd898043eeb5..6e5a9db4d37c 100644
--- a/src/target/source/codegen_opencl.cc
+++ b/src/target/source/codegen_opencl.cc
@@ -87,7 +87,7 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) {
   }
 }
-void CodeGenOpenCL::PrintFuncPrefix() { stream << "__kernel void"; }
+void CodeGenOpenCL::PrintFuncPrefix(std::ostream& os) { os << "__kernel void"; }
 void CodeGenOpenCL::PreFunctionBody(const PrimFunc& f) {
   for (Var arg : f->params) {
diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h
index af6de1531017..bf3046f0d8df 100644
--- a/src/target/source/codegen_opencl.h
+++ b/src/target/source/codegen_opencl.h
@@ -41,7 +41,7 @@ class CodeGenOpenCL final : public CodeGenC {
   // override print thread tag.
   void InitFuncState(const PrimFunc& f) final;
-  void PrintFuncPrefix() final;  // NOLINT(*)
+  void PrintFuncPrefix(std::ostream& os) final;  // NOLINT(*)
   void PreFunctionBody(const PrimFunc& f) final;  // NOLINT(*)
   void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index 2fd0abcd68a6..8191ad43aa99 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -120,6 +120,8 @@ class CodeGenSourceBase {
  std::ostringstream decl_stream;
  /*! \brief the stream to be printed */
  std::ostringstream stream;
+  /*! \brief the forward declaration stream */
+  std::ostringstream fwd_decl_stream;
  /*! \brief name of each variable */
  std::unordered_map var_idmap_;
  /*!
\brief NameSupply for allocation */ diff --git a/src/target/source/codegen_vhls.cc b/src/target/source/codegen_vhls.cc index 4091b64f4524..3ae3fb773d7f 100644 --- a/src/target/source/codegen_vhls.cc +++ b/src/target/source/codegen_vhls.cc @@ -80,7 +80,7 @@ void CodeGenVivadoHLS::PrintType(DataType t, std::ostream& os) { } } -void CodeGenVivadoHLS::PrintFuncPrefix() { stream << "extern \"C\" void"; } +void CodeGenVivadoHLS::PrintFuncPrefix(std::ostream& os) { os << "extern \"C\" void"; } void CodeGenVivadoHLS::PreFunctionBody(const PrimFunc& f) { for (size_t i = 0; i < f->params.size(); ++i) { diff --git a/src/target/source/codegen_vhls.h b/src/target/source/codegen_vhls.h index b9bec516bae9..32ddce1b3a30 100644 --- a/src/target/source/codegen_vhls.h +++ b/src/target/source/codegen_vhls.h @@ -40,7 +40,7 @@ class CodeGenVivadoHLS final : public CodeGenC { void Init(bool output_ssa); void PrintType(DataType t, std::ostream& os); - void PrintFuncPrefix() final; + void PrintFuncPrefix(std::ostream& os) final; void PreFunctionBody(const PrimFunc& f) final; void VisitExpr_(const MinNode* op, std::ostream& os) final; void VisitExpr_(const MaxNode* op, std::ostream& os) final; diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk index 1361dbbc1946..cb1db5ea9995 100644 --- a/tests/python/relay/aot/corstone300.mk +++ b/tests/python/relay/aot/corstone300.mk @@ -42,7 +42,7 @@ ETHOSU_PATH=/opt/arm/ethosu DRIVER_PATH=${ETHOSU_PATH}/core_driver CMSIS_PATH=${ETHOSU_PATH}/cmsis PLATFORM_PATH=${ETHOSU_PATH}/core_platform/targets/corstone-300 -PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99 +PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99 CMAKE = /opt/arm/cmake/bin/cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar diff --git a/tests/python/relay/aot/test_crt_forward_declarations.py b/tests/python/relay/aot/test_crt_forward_declarations.py new file mode 100644 index 000000000000..7454f85ed153 --- /dev/null +++ b/tests/python/relay/aot/test_crt_forward_declarations.py @@ -0,0 +1,275 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+"""Tests forward function declaration codegen by CodeGenCHost."""
+
+from collections import OrderedDict
+import pytest
+import numpy as np
+
+import tvm.testing
+from tvm import relay
+from tvm.contrib.download import download_testdata
+from tvm.relay.op.contrib import cmsisnn
+from tvm.testing.aot import AOTTestModel, compile_models, generate_ref_data
+from tvm.micro.testing.aot_test_utils import (
+    AOT_CORSTONE300_RUNNER,
+    AOT_USMP_CORSTONE300_RUNNER,
+    parametrize_aot_options,
+    AOTTestRunner,
+)
+
+
+def get_range_for_dtype_str(dtype):
+    """
+    Produces the min, max for a given data type.
+
+    Parameters
+    ----------
+    dtype : str
+        a type string (e.g., int8)
+
+    Returns
+    -------
+    type_info.min : int
+        the minimum of the range
+    type_info.max : int
+        the maximum of the range
+    """
+
+    try:
+        type_info = np.iinfo(dtype)
+    except ValueError:
+        type_info = np.finfo(dtype)
+    return type_info.min, type_info.max
+
+
+def _change_ndarray_layout(arr, src_layout, dst_layout):
+    """Makes a copy of an ndarray, reshaping it to a new data layout.
+
+    Parameters
+    ----------
+    arr : numpy.ndarray
+        The ndarray to be reformatted.
+
+    src_layout : str
+        The current layout of the Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    dst_layout : str
+        The desired layout of the new Relay constant. Must be alphabetic (e.g. NHWC
+        or OIHW, but not NCHW2c).
+
+    Returns
+    -------
+    dst_array : numpy.ndarray
+        A copy of the ndarray with the new layout.
+    """
+    assert src_layout.isalpha() and dst_layout.isalpha()
+    axis_order = [src_layout.index(c) for c in dst_layout]
+    return np.transpose(arr, axis_order)
+
+
+@tvm.testing.requires_package("tflite")
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("test_runner", [AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER])
+def test_external_calls(test_runner):
+    """Download a small network and partition for CMSIS-NN to test forward declarations for
+    external calls outside of __tvm_main__."""
+    # download the model
+    base_url = (
+        "https://github.com/ARM-software/ML-zoo/raw/"
+        "48a22ee22325d15d2371a6df24eb7d67e21dcc97"
+        "/models/keyword_spotting/cnn_small/tflite_int8"
+    )
+    file_to_download = "cnn_s_quantized.tflite"
+    file_saved = "cnn_s_quantized_15Dec2021.tflite"
+    model_file = download_testdata("{}/{}".format(base_url, file_to_download), file_saved)
+
+    # convert the tflite network into a Relay module
+    # pylint: disable=import-outside-toplevel
+    from tvm.relay.testing.tflite import TFLiteModel
+
+    input_shape = (1, 490)
+    dtype = "int8"
+    tfl_model = TFLiteModel(dtype)
+    tfl_model.load_from_file(model_file, [input_shape])
+    relay_mod, relay_params = tfl_model.convert_to_relay()
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(relay_mod, relay_params)
+
+    # obtain the executor factory post relay compilation.
+    input_map, output_map, output_tolerance = tfl_model.generate_reference_data()
+    interface_api = "c"
+    use_unpacked_api = True
+    compiled_models = compile_models(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=input_map,
+            outputs=output_map,
+            params=None,
+            output_tolerance=output_tolerance,
+        ),
+        interface_api,
+        use_unpacked_api,
+        pass_config=test_runner.pass_config,
+    )
+
+    # Validate the frequency of function appearances in the host C file after forward declarations.
+ lib_mod = compiled_models[0].executor_factory.lib.imported_modules[0] + main_source = lib_mod.get_source() + assert ( + main_source.count("TVMBackendAllocWorkspace") == 3 + or main_source.count("TVMBackendAllocWorkspace") == 0 + ) + assert main_source.count("tvmgen_default_fused_reshape") == 2 + assert main_source.count("tvmgen_default_cmsis_nn_main") == 12 + cmsisnn_source = lib_mod.imported_modules[0].get_source() + assert cmsisnn_source.count("arm_convolve_wrapper") == 1 + assert cmsisnn_source.count("arm_fully_connected") == 3 + assert cmsisnn_source.count("arm_softmax") == 1 + + +@parametrize_aot_options +def test_internal_calls(interface_api, use_unpacked_api, test_runner): + """Test for all internal function calls. No forward declarations are expected here.""" + dtype = "float32" + groups = 32 + weight_shape = 1 + ishape = (1, 32, 14, 14) + wshape = (32, weight_shape, 3, 3) + pass_config = {"tir.usmp.enable": True} + test_runner = AOTTestRunner( + makefile=test_runner.makefile, + prologue=test_runner.prologue, + epilogue=test_runner.epilogue, + includes=test_runner.includes, + parameters=test_runner.parameters, + pass_config=pass_config, + ) + + data0 = relay.var("data", shape=ishape, dtype=dtype) + weight0 = relay.var("weight", shape=wshape, dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=groups) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = tvm.relay.transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, wshape).astype(dtype) + + inputs = OrderedDict([("data", i_data), ("weight", w1_data)]) + + output_list = generate_ref_data(mod, inputs) + compiled_models = compile_models( + models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + pass_config=test_runner.pass_config, + ) + + lib_mod = compiled_models[0].executor_factory.lib.imported_modules[0] + main_source = lib_mod.get_source() + assert main_source.count("tvmgen_default_fused_nn_contrib_depthwise_conv2d_NCHWc") == 2 + assert main_source.count("tvmgen_default_fused_layout_transform") == 6 + + +@tvm.testing.requires_corstone300 +def test_tensorized_calls(): + """Test a subgraph with a mix of internal and tensorized calls.""" + data_shape, kernel_size, num_filter, groups, strides, padding, dilation = ( + (1, 32, 32, 16), + (3, 3), + 16, + 1, + 1, + (0, 2, 2, 0), + 1, + ) + in_dtype = "int8" + data_layout = "NHWC" + kernel_layout = "HWOI" + ref_kernel_layout = "HWIO" + out_layout = "NHWC" + schedule_name = "conv2d_nhwc_dsp.arm_cpu" + + ref_input_data = np.random.randint(low=-128, high=127, size=data_shape, dtype=in_dtype) + ref_input_var = relay.var("input", relay.TensorType(data_shape, in_dtype)) # NHWC layout + kernel_shape = (*kernel_size, data_shape[-1] // groups, num_filter) # HWIO layout + ref_kernel_data = np.random.randint(low=-10, high=10, size=kernel_shape, dtype=in_dtype) + + ref_relay_op = relay.op.nn.conv2d( + ref_input_var, + relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", ref_kernel_layout)), + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout=ref_kernel_layout, + out_dtype="int32", + out_layout="NHWC", + ) + ref_module = tvm.IRModule.from_expr(relay.Function([ref_input_var], ref_relay_op)) + ref_outputs = generate_ref_data(ref_module, {"input": 
ref_input_data}) + + # Reshape output dictionary to match out_layout + assert len(ref_outputs) == 1 + output_tensor_name, output_tensor = next(iter(ref_outputs.items())) + ref_outputs[output_tensor_name] = _change_ndarray_layout(output_tensor, "NHWC", out_layout) + + test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", data_layout) + test_input_var = relay.var("input", relay.TensorType(test_input_data.shape, in_dtype)) + test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", kernel_layout) + + test_relay_op = relay.op.nn.conv2d( + test_input_var, + relay.const(test_kernel_data), + kernel_size=kernel_size, + strides=strides, + padding=padding, + groups=groups, + dilation=(dilation, dilation), + data_layout=data_layout, + kernel_layout=kernel_layout, + out_dtype="int32", + out_layout=out_layout, + ) + test_function = relay.Function([test_input_var], test_relay_op) + test_model = AOTTestModel( + module=tvm.IRModule.from_expr(test_function), + inputs={"input": test_input_data}, + outputs=ref_outputs, + ) + compiled_models = compile_models( + test_model, + interface_api="c", + use_unpacked_api=True, + pass_config=AOT_CORSTONE300_RUNNER.pass_config, + target=f"c -keys=arm_cpu -mcpu=cortex-m7", + schedule_name=schedule_name, + ) + + lib_mod = compiled_models[0].executor_factory.lib.imported_modules[0] + main_source = lib_mod.get_source() + assert main_source.count("tvmgen_default_fused_nn_conv2d") == 2 + assert main_source.count("gemm_") == 13 + + +if __name__ == "__main__": + tvm.testing.main() From ffdf6afd7cd614d412fcb86b06cfaf008be0983c Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 7 Dec 2022 17:18:08 +0000 Subject: [PATCH 030/286] [ETHOSN] Remove inference test (#13576) This test was causing cpptest to fail without reporting the test as having failed. Looking back, this test doesn't really make much sense as we are passing a file descriptor stating that inference is running when it isn't. Therefore, removing the test. 
--- tests/cpp/runtime/contrib/ethosn/inference_test.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/cpp/runtime/contrib/ethosn/inference_test.cc b/tests/cpp/runtime/contrib/ethosn/inference_test.cc index 95b27070e19a..45a6bd0997e9 100644 --- a/tests/cpp/runtime/contrib/ethosn/inference_test.cc +++ b/tests/cpp/runtime/contrib/ethosn/inference_test.cc @@ -43,18 +43,6 @@ TEST(WaitForInference, InferenceScheduled) { ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete."); } -TEST(WaitForInference, InferenceRunning) { - const int inference_result = 1 /* Running */; - const int timeout = 0; - - dl::Inference inference = dl::Inference(inference_result); - InferenceWaitStatus result = WaitForInference(&inference, timeout); - - ASSERT_EQ(result.GetErrorCode(), InferenceWaitErrorCode::kTimeout); - std::cout << result.GetErrorDescription() << std::endl; - ICHECK_EQ(result.GetErrorDescription(), "Timed out while waiting for the inference to complete."); -} - TEST(WaitForInference, InferenceError) { const int inference_result = 3 /* Error */; const int timeout = 0; From 4afe8fea2e330ba01ff2e9e743cd6ea98a8fb270 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Wed, 7 Dec 2022 19:34:21 +0000 Subject: [PATCH 031/286] [COMMUNITY] Gavin Uberti -> Committer (#13575) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7be6cd62b599..9fc6423d1e76 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -78,6 +78,7 @@ We do encourage everyone to work anything they are interested in. - [Siva Rama Krishna Reddy](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime +- [Gavin Uberti](https://github.com/guberti): @guberti - microtvm, arm - [Luis Vega](https://github.com/vegaluisjose): @vegaluisjose - vta, chisel - [Leyuan Wang](https://github.com/Laurawly) (PMC): @Laurawly: - topi - [Yao Wang](https://github.com/kevinthesun): @kevinthesun (PMC): - topi, vision From 7383365aaceb8c17f98a18aa6e6b5f6ce6ab0b45 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Dec 2022 22:16:45 -0700 Subject: [PATCH 032/286] [docs] Make building the cpu-only docs build explicit (#13315) The docs are usually built with a GPU, so this PR simplifies some logic that was conflating several options to automatically choose the docker image to run. Now the CPU image is only used if the `--cpu` flag is passed, which makes `ci.py docs` work like CI by default for the main docs. 
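Usage sketch (commands are illustrative): `python3 tests/scripts/ci.py docs` now picks the `ci_gpu` image by default, matching what CI does for the main docs, while `python3 tests/scripts/ci.py docs --cpu` explicitly opts into the `ci_cpu` image for machines without a GPU. Per the updated help text, `--cpu` cannot be combined with `--full` or `--tutorial-pattern`, since both of those require the GPU image.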
--- .github/workflows/update_nightly_branch.yml | 2 +- tests/scripts/ci.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/update_nightly_branch.yml b/.github/workflows/update_nightly_branch.yml index 2242577bdb58..301294689337 100644 --- a/.github/workflows/update_nightly_branch.yml +++ b/.github/workflows/update_nightly_branch.yml @@ -30,7 +30,7 @@ concurrency: jobs: update-nightly-branch: - if: github.repository == 'driazati/tvm' + if: github.repository == 'apache/tvm' runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index cfb91b37ce56..6799f68d43b7 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -232,6 +232,7 @@ def docker( def docs( tutorial_pattern: Optional[str] = None, + cpu: bool = False, full: bool = False, interactive: bool = False, skip_build: bool = False, @@ -242,8 +243,9 @@ def docs( the Python docs without any tutorials. arguments: - full -- Build all language docs, not just Python (this will use the 'ci_gpu' Docker image) - tutorial-pattern -- Regex for which tutorials to execute when building docs (this will use the 'ci_gpu' Docker image) + full -- Build all language docs, not just Python (cannot be used with --cpu) + cpu -- Use the 'ci_cpu' Docker image (useful for building docs on a machine without a GPU) + tutorial-pattern -- Regex for which tutorials to execute when building docs (cannot be used with --cpu) skip_build -- skip build and setup scripts interactive -- start a shell after running build / test scripts docker-image -- manually specify the docker image to use @@ -252,7 +254,7 @@ def docs( extra_setup = [] image = "ci_gpu" if docker_image is None else docker_image - if not full and tutorial_pattern is None: + if cpu: # TODO: Change this to tlcpack/docs once that is uploaded image = "ci_cpu" if docker_image is None else docker_image build_dir = get_build_dir("cpu") @@ -285,7 +287,7 @@ def docs( ] extra_setup = [ - "python3 -m pip install --user " + " ".join(requirements), + "python3 -m pip install " + " ".join(requirements), ] else: check_gpu() @@ -311,6 +313,13 @@ def docs( "TVM_LIBRARY_PATH": str(REPO_ROOT / build_dir), } docker(name=gen_name("docs"), image=image, scripts=scripts, env=env, interactive=interactive) + print_color( + col.GREEN, + "Done building the docs. 
You can view them by running " + "'python3 tests/scripts/ci.py serve-docs' and visiting:" + " http://localhost:8000 in your browser.", + bold=True, + ) def serve_docs(directory: str = "_docs") -> None: From f6c69da56af07ebe57619606943667fd5f893405 Mon Sep 17 00:00:00 2001 From: Matveenko Valery <50880524+valmat07@users.noreply.github.com> Date: Thu, 8 Dec 2022 09:15:58 +0100 Subject: [PATCH 033/286] [OpenCL] Introduction of weights on buffers (#13563) * introduced support for weights on buffers * update winograd schedule for support buffers on weights * Updated the logic of using buffers * Update texture annotation pass * now weights on buffers support only in fp32 case * update opencl tests for weights on buffers * fix lint * fix lint * fix lint * apply comments * fix lint * fix lint --- python/tvm/topi/adreno/conv2d_nchw.py | 5 +- python/tvm/topi/adreno/conv2d_nhwc.py | 5 +- .../tvm/topi/adreno/conv2d_winograd_common.py | 2 + .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 5 +- .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 5 +- .../transforms/annotate_texture_storage.cc | 78 +++++++++++++++---- .../test_conv2d_nchw_texture.py | 20 ++--- 7 files changed, 86 insertions(+), 34 deletions(-) diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index b1f229ebe5dc..bd128ed7bf75 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -305,8 +305,9 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: if not autotvm.GLOBAL_SCOPE.in_tuning: bind_data_copy(s[kernel]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) + if kernel.shape[2] == 1 and kernel.shape[3] == 1: + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) s[conv].set_scope("local") if latest_blocked == latest and output != latest: diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index 644978743b4d..e391495b5384 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -303,8 +303,9 @@ def schedule_conv2d_NHWC(cfg, s, output): if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: if not autotvm.GLOBAL_SCOPE.in_tuning: bind_data_copy(s[kernel]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) + if kernel.shape[0] == 1 and kernel.shape[1] == 1: + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) s[conv].set_scope("local") if latest_blocked == latest and output != latest: diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py index 8c62f11c2fe5..d10acb73123d 100644 --- a/python/tvm/topi/adreno/conv2d_winograd_common.py +++ b/python/tvm/topi/adreno/conv2d_winograd_common.py @@ -451,6 +451,8 @@ def schedule_conv2d_winograd(cfg, s, output, pre_computed): autotvm.GLOBAL_SCOPE.in_tuning or isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + and kernel.shape[2] == 1 + and kernel.shape[3] == 1 ): BB = s.cache_read(kernel_pack, get_texture_storage(kernel_pack.shape), [OL]) bind_data_copy(s[BB]) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index 8549399fb0d0..7fae354dee0e 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -254,8 +254,9 @@ def 
schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): # create cache stage for tuning only or in case of 4d case AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) + if kernel.shape[2] == 1 and kernel.shape[3] == 1: + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) # tile and bind spatial axes n, fc, y, x, fb = s[latest_blocked].op.axis diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index 82e128443e85..f224fe3c88dc 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -250,8 +250,9 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): # create cache stage for tuning only or in case of 4d case AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) + if kernel.shape[0] == 1 and kernel.shape[1] == 1: + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) # tile and bind spatial axes n, y, x, fc, fb = s[latest_blocked].op.axis diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index 9b700bef2a46..9dbd631ad32d 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -41,6 +41,7 @@ #include #include +#include #include "../op/memory/device_copy.h" #include "../op/memory/memory.h" @@ -90,22 +91,28 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { for (const auto& a : storage_info.args_to_vars_) { if (storage_map.count(a.first)) { for (const auto& v : a.second) { - storage_map.Set(v, storage_map[a.first]); - if (storage_map[a.first][Expr()][0] == "global" && - storage_info.accept_textures_.count(v)) { + if (storage_info.buffers_params.find(v) != storage_info.buffers_params.end()) { Map> ent; - ent.Set(Expr(), storage_info.accept_textures_[v][Expr()]); + ent.Set(Expr(), Array{"global"}); storage_map.Set(v, ent); - for (const auto& calls : storage_info.accept_textures_[v]) { - if (calls.first != Expr()) { - if (storage_map.count(a.first)) { - Map> ent_call = storage_map[a.first]; - ent_call.Set(calls.first, calls.second); - storage_map.Set(a.first, ent_call); - } else { - Map> ent_call; - ent_call.Set(calls.first, calls.second); - storage_map.Set(a.first, ent_call); + } else { + storage_map.Set(v, storage_map[a.first]); + if (storage_map[a.first][Expr()][0] == "global" && + storage_info.accept_textures_.count(v)) { + Map> ent; + ent.Set(Expr(), storage_info.accept_textures_[v][Expr()]); + storage_map.Set(v, ent); + for (const auto& calls : storage_info.accept_textures_[v]) { + if (calls.first != Expr()) { + if (storage_map.count(a.first)) { + Map> ent_call = storage_map[a.first]; + ent_call.Set(calls.first, calls.second); + storage_map.Set(a.first, ent_call); + } else { + Map> ent_call; + ent_call.Set(calls.first, calls.second); + storage_map.Set(a.first, ent_call); + } } } } @@ -160,11 +167,20 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { storage_scope_[call].push_back("global.texture"); } } + const int weights_pos = 1; for (size_t i = 0; i < fn->params.size(); i++) { args_to_vars_[call->args[i]].push_back(fn->params[i]); // adding info about arguments if 
they can be converted to texture for (const auto& ttype : FlattenTupleType(fn->params[i]->checked_type())) { std::string scope = Scope(ttype->shape, GetVirtualDevice(GetRef(call))); + if (expr_attrib.as() || expr_attrib.as()) { + if ((i == weights_pos) && !ttype->dtype.is_float16() && + CanUseBuffers(call->args[i], ttype->shape, fn->attrs)) { + buffers_params.insert(fn->params[i]); + buffers_args.insert(call->args[i]); + scope = "global"; + } + } if (scope.find("global.texture") != std::string::npos) { if (accept_textures_.count(fn->params[i])) { Map> ent = accept_textures_[fn->params[i]]; @@ -193,13 +209,15 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { } } } - if (!primitive_supports_texture_) { + expr_attrib = call->attrs; primitive_supports_texture_ = SupportsTextureStorage(call); } for (auto& arg : call->args) { - Visit(arg); + if (buffers_args.find(arg) == buffers_args.end()) { + Visit(arg); + } } // We have all callees filled into storage_scope_ if they support textures // We need to verify if this call expects texture and if it does not, remove from @@ -398,6 +416,28 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { return supports_texture_storage; } + bool CanUseBuffers(const Expr param, const Array shape, + const tvm::DictAttrs param_attrs) const { + bool use_buffer = false; + if (param.as() && shape.size() == 5) { + auto kernel_layout = param_attrs.GetAttr("kernel_layout"); + if (kernel_layout == "HWOI4o" || kernel_layout == "HWIO4o") { + int a0 = shape[0].as()->value; + int a1 = shape[1].as()->value; + if (a0 != 1 && a1 != 1) { + use_buffer = true; + } + } else if (kernel_layout == "OIHW4o") { + int a2 = shape[2].as()->value; + int a3 = shape[3].as()->value; + if (a2 != 1 && a3 != 1) { + use_buffer = true; + } + } + } + return use_buffer; + } + /*! \brief Temporary state for marking whether a visited function * primitive supports texture storage scope */ bool primitive_supports_texture_ = false; @@ -409,6 +449,12 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { std::unordered_map, ObjectPtrHash, ObjectPtrEqual> args_to_vars_; /*! \brief mapping of arguments that can be converted to texture*/ Map>> accept_textures_; + /*! \brief main attribute for expression*/ + tvm::Attrs expr_attrib; + /*! \brief parameters that filter out from storage_map to use buffers*/ + std::unordered_set buffers_params; + /*! 
\brief arguments in expression that will use buffers*/ + std::unordered_set buffers_args; }; } // namespace diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py index d3fff68ae7cb..c5b58a7a8a31 100644 --- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py +++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py @@ -592,12 +592,12 @@ def test_residual_block(remote, target, dtype): static_memory_scope = [ "", "global.texture", - "global.texture-weight", + "global", "global.texture-weight", "global.texture", "global.texture-weight", "global.texture", - "global.texture-weight", + "global", "", "", ] @@ -834,11 +834,11 @@ def test_pooling_branching_texture_params(remote, target, dtype): "global.texture-weight", "global.texture", "global.texture", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", "global.texture", - "global.texture-weight", + "global", "global.texture", "", "", @@ -960,11 +960,11 @@ def test_branching_texture_params(remote, target, dtype): "global.texture", "global.texture-weight", "global.texture", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", "global.texture", - "global.texture-weight", + "global", "global.texture", "", "", @@ -1179,9 +1179,9 @@ def test_injective_nwo_inputs1(remote, target, dtype): static_memory_scope = [ "", "global.texture", - "global.texture-nhwc", + "global", "global.texture", - "global.texture-nhwc", + "global", "global.texture", "global", "global", @@ -1277,10 +1277,10 @@ def test_injective_nwo_inputs2(remote, target, dtype): static_memory_scope = [ "", "global.texture", - "global.texture-nhwc", + "global", "global.texture", "global", - "global.texture-nhwc", + "global", "global.texture", "global", ] From b447022feea87a9af37c30f6f2e1d8da6d2bd041 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Thu, 8 Dec 2022 08:24:00 +0000 Subject: [PATCH 034/286] [bfloat16] Fixed dtype conversion in the arm_cpu injective schedule (#13417) Following on from the example conversion in ndarray.py to broaden the support for bfloat16 in the absence of its full support, this commit converts bfloat16 to uint16 in the injective schedule to enable full compilation of Conv2D via arm_cpu schedules. 
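The underlying issue, shown as a standalone sketch (not taken from the patch itself): NumPy has no native bfloat16 dtype, so computing the vector width from `np.dtype(x.dtype).itemsize` fails for bfloat16; uint16 has the same 2-byte storage width, which is all the split factor depends on.

    import numpy as np

    dtype = "bfloat16"
    try:
        np.dtype(dtype)  # NumPy has no bfloat16 dtype, so this raises TypeError
    except TypeError:
        pass

    # The schedule's workaround: substitute uint16, which matches bfloat16's
    # 2-byte width, purely for the vectorization-factor arithmetic.
    np_dtype = "uint16" if dtype == "bfloat16" else dtype
    assert 16 // np.dtype(np_dtype).itemsize == 8  # vectorize over 8 lanes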
--- python/tvm/topi/arm_cpu/injective.py | 3 +- python/tvm/topi/nn/winograd_util.py | 1 + tests/python/integration/test_arm_aprofile.py | 74 +++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/python/integration/test_arm_aprofile.py diff --git a/python/tvm/topi/arm_cpu/injective.py b/python/tvm/topi/arm_cpu/injective.py index 7c3ea5261f5e..5c63e5a513db 100644 --- a/python/tvm/topi/arm_cpu/injective.py +++ b/python/tvm/topi/arm_cpu/injective.py @@ -68,7 +68,8 @@ def schedule_injective(outs): if list(s[x].op.axis): # do not vectorize for broadcast - (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(x.dtype).itemsize) + dtype = "uint16" if x.dtype == "bfloat16" else x.dtype + (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(dtype).itemsize) s[x].vectorize(ii) tvm.te.schedule.AutoInlineInjective(s) diff --git a/python/tvm/topi/nn/winograd_util.py b/python/tvm/topi/nn/winograd_util.py index c0f7097a6315..4bee06fcfaf8 100644 --- a/python/tvm/topi/nn/winograd_util.py +++ b/python/tvm/topi/nn/winograd_util.py @@ -169,6 +169,7 @@ def winograd_transform_matrices(tile_size, kernel_size, out_dtype): intp_pts = _interpolation_points(degree) A_data, B_data, G_data = _cook_toom_convolution(intp_pts, tile_size, kernel_size) + out_dtype = "uint16" if out_dtype == "bfloat16" else out_dtype return ( const_matrix(A_data.astype(out_dtype), "A"), const_matrix(B_data.astype(out_dtype), "B"), diff --git a/tests/python/integration/test_arm_aprofile.py b/tests/python/integration/test_arm_aprofile.py new file mode 100644 index 000000000000..c38217a1b1c0 --- /dev/null +++ b/tests/python/integration/test_arm_aprofile.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Tests for Arm(R) A-Profile Architecture."""
+import os
+import numpy as np
+import pytest
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.relay.transform import ToMixedPrecision, FoldConstant
+from tvm.relay.build_module import bind_params_by_name
+
+
+def get_mattr(dtype):
+    mattr = "+v8.2a,+neon"
+    if dtype == "float16":
+        mattr += ",+fullfp16"
+    elif dtype == "bfloat16":
+        mattr += ",+bf16"
+    return mattr
+
+
+@tvm.testing.skip_if_32bit(reason="skipping test for i386.")
+@pytest.mark.parametrize("dtype", ["float32", "float16", "bfloat16"])
+def test_conv2d(dtype):
+    """Test if Conv2d cross-compiles with TVM schedules."""
+    # Build the graph in fp32; float16/bfloat16 are produced below via
+    # ToMixedPrecision, so the parametrized `dtype` must not be shadowed here.
+    model_dtype = "float32"
+    ishape = [1, 28, 28, 3]  # NHWC
+    kernel_size = (3, 3)
+    wshape = (kernel_size[0], kernel_size[1], ishape[-1], 2)  # HWIO
+    weight_data = np.random.uniform(-128, 127, wshape).astype(model_dtype)
+    invar = relay.var("data", relay.TensorType(ishape, model_dtype))
+    weight = relay.const(weight_data, model_dtype)
+    out = relay.op.nn.conv2d(
+        invar,
+        weight,
+        kernel_size=kernel_size,
+        strides=(1, 1),
+        padding=(0, 0),
+        dilation=(1, 1),
+        data_layout="NHWC",
+        kernel_layout="HWIO",
+        out_dtype=model_dtype,
+        out_layout="NHWC",
+    )
+    mod = tvm.IRModule.from_expr(relay.Function([invar], out))
+    params = {}
+
+    prefixed_network_name = dtype + ".conv2d"
+    lib_path = os.getcwd() + "/" + prefixed_network_name + ".mod.so"
+    target = "llvm -mtriple=aarch64-linux-gnu -mattr=" + get_mattr(dtype)
+
+    mod["main"] = bind_params_by_name(mod["main"], params)
+    if dtype in ["float16", "bfloat16"]:
+        mod = ToMixedPrecision(dtype)(mod)
+        mod = FoldConstant()(mod)
+
+    with tvm.transform.PassContext(opt_level=3):
+        lib = tvm.relay.build(mod, target=target, params=params)
+    lib.export_library(lib_path, cc="aarch64-linux-gnu-gcc")
From b1437c9813440deb8b5b1267adcec73ddad77b8d Mon Sep 17 00:00:00 2001
From: Andrey Malyshev
Date: Thu, 8 Dec 2022 12:29:00 +0200
Subject: [PATCH 035/286] [Adreno] Add global pooling schedule (#13573)

* [Adreno] Add global pooling schedule

The parallelism opportunities in the case of global pooling are limited
by the number of channels, so the schedule is changed to parallelize
over the reduction axis via rfactor.

* address pylint hits

* address PR comments

* switch spatial axis to blk binding
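To make the rfactor idea concrete, here is a deliberately simplified, standalone TE sketch (it is not the schedule added below; the shape and split factor are arbitrary assumptions): splitting the fused reduction axis and rfactoring its inner part materializes per-thread partial results, so the reduction itself, rather than the few output channels, provides the parallelism.

    import tvm
    from tvm import te

    n = 25600  # e.g. 160 * 160 spatial elements reduced into one output value
    data = te.placeholder((n,), name="data")
    k = te.reduce_axis((0, n), name="k")
    out = te.compute((1,), lambda i: te.sum(data[k], axis=k), name="out")

    s = te.create_schedule(out.op)
    _, ki = s[out].split(out.op.reduce_axis[0], factor=256)
    rf = s.rfactor(out, ki)  # 256 independent partial sums, one per thread
    s[rf].compute_at(s[out], s[out].op.reduce_axis[0])
    s[out].bind(s[out].op.reduce_axis[0], te.thread_axis("threadIdx.y"))
    s[out].bind(s[out].op.axis[0], te.thread_axis("blockIdx.x"))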
import tag +from .utils import get_div + + +def schedule_adaptive_pool(outs, layout="NCHW"): + """Schedule for adaptive_pool. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of adaptive_pool + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for adaptive_pool. + """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _schedule_global(Pool, layout): + # examples of latest pool op is global max pool and non latest is global avg pooling + # OL - an Expr will be used for rfactor + # Out - programming of the parallelizm on the global level + # shared is not required, local could be enough but shared scope gives quite significant + # perf boost + if Pool.op in s.outputs: + Out = Pool + OL = s.cache_write(Pool, "shared") + else: + Out = outs[0].op.output(0) + s[Pool].set_scope("shared") + OL = Pool + + PaddedInput = Pool.op.input_tensors[0] + + # detect axis for later reorder and binding of batch/channel to blocks and + # spatial to threads + if layout in ("NCHW", "NCHW4c"): + channel_index = 1 + height_index = 2 + width_index = 3 + else: + channel_index = 3 + height_index = 1 + width_index = 2 + + if isinstance(PaddedInput.op, tvm.te.ComputeOp): + s[PaddedInput].compute_inline() + + fused_reduce = s[OL].fuse(*s[OL].op.reduce_axis) + + spatial = PaddedInput.shape[height_index].value * PaddedInput.shape[width_index].value + # below values were selected empirically assuming that we should have some work in each + # thread (currently from 25-49) and number of threads not exceeding some threshold that + # was selected as 256 from performance point of view after experiments on Adreno 660 + max_threads = spatial // 25 if spatial > 25 else 1 + max_threads = 256 if max_threads > 256 else max_threads + num_thread = get_div(spatial, max_threads) + + thread_y = te.thread_axis((0, num_thread), "threadIdx.y") + + _, ki = s[OL].split(fused_reduce, factor=num_thread) + data_out_rf = s.rfactor(OL, ki) + s[data_out_rf].compute_at(s[OL], s[OL].op.reduce_axis[0]) + s[OL].bind(s[OL].op.reduce_axis[0], thread_y) + + naxis = s[Out].op.axis[0] + caxis = s[Out].op.axis[channel_index] + haxis = s[Out].op.axis[height_index] + waxis = s[Out].op.axis[width_index] + + if layout in ("NHWC4c", "NCHW4c"): + texture_axis = s[Out].op.axis[-1] + s[Out].reorder(naxis, caxis, haxis, waxis, texture_axis) + s[Out].vectorize(texture_axis) + else: + texture_axis = None + s[Out].reorder(naxis, caxis, haxis, waxis) + + bx = s[Out].fuse(naxis, caxis, haxis, waxis) + s[Out].bind(bx, te.thread_axis("blockIdx.x")) + + s[OL].compute_at(s[Out], bx) + + scheduled_ops = [] + + def traverse(OP): + """Internal traverse function""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_injective(OP.tag): + if OP not in s.outputs: + s[OP].compute_inline() + for tensor in OP.input_tensors: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: + traverse(tensor.op) + # schedule global_pool + elif OP.tag.startswith("adaptive_pool"): + Pool = OP.output(0) + _schedule_global(Pool, layout) + else: + raise RuntimeError("Unsupported operator: %s" % OP.tag) + + scheduled_ops.append(OP) + + traverse(outs[0].op) + return s def schedule_pool(outs, layout): diff --git a/tests/python/relay/opencl_texture/test_pool_texture.py b/tests/python/relay/opencl_texture/test_pool_texture.py new file mode 100644 index 000000000000..faeb121c800c --- /dev/null 
+++ b/tests/python/relay/opencl_texture/test_pool_texture.py
@@ -0,0 +1,135 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import relay
+from utils.adreno_utils import build_run_compare
+
+
+dtype = tvm.testing.parameter("float32")
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nchw_wide(remote, target, dtype):
+    """
+    Use case of NCHW global pooling with big spatial values
+    """
+    input_shape = (1, 32, 160, 160)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A)
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nchw4c_wide(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c global pooling with big spatial values
+    """
+    input_shape = (1, 8, 160, 160, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A, layout="NCHW4c")
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nchw_deep(remote, target, dtype):
+    """
+    Use case of NCHW deep global pooling
+    """
+    input_shape = (1, 2048, 20, 20)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A)
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nchw4c_deep(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c deep global pooling
+    """
+    input_shape = (1, 512, 20, 20, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A, layout="NCHW4c")
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nhwc(remote, target, dtype):
+    """
+    Use case of NHWC global pooling with big spatial values
+    """
+    input_shape = (1, 160, 160, 32)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A, layout="NHWC")
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_pool2d_nhwc4c(remote, target, dtype):
+    """
+    Use case of blocked NHWC4c global pooling with big spatial values
+    """
+    input_shape = (1, 160, 160, 8, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_avg_pool2d(A, layout="NHWC4c")
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_max_pool2d_nchw_wide(remote, target, dtype):
+    """
+    Use case of NCHW global pooling with big spatial values
+    """
+    input_shape = (1, 32, 160, 160)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_max_pool2d(A)
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_global_max_pool2d_nchw4c_wide(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c global pooling with big spatial values
+    """
+    input_shape = (1, 8, 160, 160, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    C = relay.nn.global_max_pool2d(A, layout="NCHW4c")
+    mod = relay.Function([A], C)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)

From e11695ddd6530d95d459f02ecb1794eac01f0450 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Thu, 8 Dec 2022 14:02:29 -0600
Subject: [PATCH 036/286] [Hexagon] Use get_hexagon_target in test, NFC (#13584)

---
 tests/python/contrib/test_hexagon/test_launcher.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 76d5cba60a1f..95c6c1e19805 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -449,8 +449,7 @@ def get_conv2d_nchw(d_shape, w_shape, padding, strides=(1, 1)):
         out_dtype=out_dtype,
     )
 
-    target_hexagon = tvm.target.hexagon("v68")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    target = get_hexagon_target("v68")
     I, O, H, W = 64, 256, 56, 56
     kH = kW = 3
     padding = (1, 1)
@@ -515,8 +514,7 @@ def test_dense_relay_vrmpy(hexagon_session, data_dtype, weight_dtype):
     if data_dtype == "int8" and weight_dtype == "uint8":
         pytest.skip("(i8, u8) input pair is not supported")
 
-    target_hexagon = tvm.target.hexagon("v68")
-    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    target = get_hexagon_target("v68")
     M = 128
     N = 1000

From c2a0d5a39a0990a6db47f9718aaf25458661be23 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Thu, 8 Dec 2022 14:14:43 -0600
Subject: [PATCH 037/286] [Hexagon] Lookup intrinsic by name instead of using enum value (#13583)

---
 tests/python/contrib/test_hexagon/test_async_dma_pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
index a35eefd1a300..6be761b7d0d1 100644
--- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
+++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py
@@ -471,7 +471,7 @@ def main(
                 b_i8x128 = b_buffer[0, 0:128]
                 b_i32x32: T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32")
                 c_buffer[0:32] = T.call_llvm_pure_intrin(
-                    4217,
+                    T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"),
                     T.uint32(3),
                     c_buffer[0:32],
                     T.broadcast(a_i32, 32),
@@ -626,7 +626,7 @@ def main(
                 b_i8x128 = b_buffer[0, 0:128]
                 b_i32x32:
T.int32x32 = T.reinterpret(b_i8x128, dtype="int32x32") c_buffer[0:32] = T.call_llvm_pure_intrin( - 4217, + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), T.uint32(3), c_buffer[0:32], T.broadcast(a_i32, 32), From 093f72301bf8318f35ef212698bd88daf71b77bd Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 8 Dec 2022 14:25:48 -0600 Subject: [PATCH 038/286] [Test] Make tests work with older numpy versions (#13582) Add explicit "constant" as `mode` argument in `pad` and replace `default_rng` with `randint`. --- python/tvm/topi/testing/depthwise_conv2d_python.py | 2 +- .../contrib/test_hexagon/test_async_dma_pipeline.py | 9 ++++----- .../test_hexagon/topi/slice_op/test_conv2d_slice.py | 4 +++- tests/python/contrib/test_hexagon/topi/test_pad.py | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py index a6247e9f92cc..69f4bbbb4533 100644 --- a/python/tvm/topi/testing/depthwise_conv2d_python.py +++ b/python/tvm/topi/testing/depthwise_conv2d_python.py @@ -64,7 +64,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): for j in range(out_channel): apad = input_np[i, j // channel_multiplier, :, :] if pad_h or pad_w: - apad = np.pad(apad, [(pad_top, pad_bottom), (pad_left, pad_right)]) + apad = np.pad(apad, [(pad_top, pad_bottom), (pad_left, pad_right)], "constant") conv = _convolve2d( apad, diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 6be761b7d0d1..51427f18f6f4 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -20,7 +20,6 @@ import numpy as np import pytest import tvm -from numpy.random import default_rng from tvm.script import tir as T VRMPY_SIZE_B = 128 @@ -184,11 +183,11 @@ class TestAsyncDMAPipeline: @tvm.testing.fixture def input_a(self, size_a): - return default_rng().integers(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") + return np.random.randint(0, 8, (size_a, VRMPY_SIZE_B), dtype="uint8") @tvm.testing.fixture def input_w(self, size_w): - return default_rng().integers(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") + return np.random.randint(0, 8, (size_w, VRMPY_SIZE_B), dtype="uint8") @tvm.testing.fixture def expected_output(self, size_a, size_w, input_a, input_w): @@ -657,8 +656,8 @@ def test_meta(hexagon_session): if tvm.testing.utils.IS_IN_CI: pytest.skip("Skipping test since it takes too long in CI.") - a_data = default_rng().integers(1, 8, (1, 1, 230, 230, 4), dtype="uint8") - w_data = default_rng().integers(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") + a_data = np.random.randint(1, 8, (1, 1, 230, 230, 4), dtype="uint8") + w_data = np.random.randint(1, 8, (2, 1, 7, 7, 1, 32, 4), dtype="int8") c_data = np.zeros((1, 2, 112, 112, 32), dtype="int32") sch = tvm.tir.Schedule(ModuleBase) diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py index e06636cde365..dcc926addcab 100644 --- a/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_conv2d_slice.py @@ -110,7 +110,9 @@ def padded_filt_shape(filt_shape): def weights_np_padded(weights_np, filt_shape, padded_filt_shape): pad_in_channels = padded_filt_shape[2] - filt_shape[2] pad_out_channels = padded_filt_shape[3] - 
filt_shape[3] - filt_padded = np.pad(weights_np, ((0, 0), (0, 0), (0, pad_in_channels), (0, pad_out_channels))) + filt_padded = np.pad( + weights_np, ((0, 0), (0, 0), (0, pad_in_channels), (0, pad_out_channels)), "constant" + ) return filt_padded diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py index 18a392e5b1ac..f44f228a01d3 100644 --- a/tests/python/contrib/test_hexagon/topi/test_pad.py +++ b/tests/python/contrib/test_hexagon/topi/test_pad.py @@ -50,7 +50,7 @@ def test_nn_pad(hexagon_session: Session): mod["pad"](a, b) # Reference numpy pad output - ref_out = np.pad(data_in, pad_width=((0, 0), (1, 1), (1, 1), (0, 0))) + ref_out = np.pad(data_in, pad_width=((0, 0), (1, 1), (1, 1), (0, 0)), mode="constant") tvm.testing.assert_allclose(b.numpy(), ref_out) From deeb2e7aed56f8403b6a58200cc9e96465c5a387 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 8 Dec 2022 17:08:51 -0700 Subject: [PATCH 039/286] [ci][docker] Read docker image tags during CI runs (#13572) Hardcoding the Docker images in the Jenkinsfiles has the downside that only trusted changes are actually run in CI. Since Docker image updates are pretty frequent, this makes for a pretty bad UX (e.g. CI runs but doesn't actually test the PR). The fix here makes it so the images are read from a different file during CI execution so updates from anyone are picked up. This PR eliminates the need to use the batch jobs in Jenkins to update Docker images. Docker images are built as part of post-merge CI on `main` and anyone can trigger a validation run by filing a PR with changed Docker tags. --- ci/jenkins/docker-images.ini | 29 ++++++++++++++++ ci/jenkins/generated/arm_jenkinsfile.groovy | 26 +++++++------- .../generated/cortexm_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/cpu_jenkinsfile.groovy | 26 +++++++------- .../generated/docker_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/gpu_jenkinsfile.groovy | 26 +++++++------- .../generated/hexagon_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/i386_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/lint_jenkinsfile.groovy | 26 +++++++------- .../generated/minimal_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/riscv_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/generated/wasm_jenkinsfile.groovy | 26 +++++++------- ci/jenkins/templates/utils/Prepare.groovy.j2 | 2 +- ci/jenkins/templates/utils/base.groovy.j2 | 23 +++++++------ ci/scripts/jenkins/determine_docker_images.py | 25 ++++++++++---- docker/bash.sh | 5 +-- docs/contribute/ci.rst | 34 +++++++++++++++---- tests/lint/check_file_type.py | 2 ++ tests/python/ci/test_ci.py | 12 +++++-- 19 files changed, 257 insertions(+), 161 deletions(-) create mode 100644 ci/jenkins/docker-images.ini diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini new file mode 100644 index 000000000000..119a43218642 --- /dev/null +++ b/ci/jenkins/docker-images.ini @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
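As a quick illustration of the numpy-compatibility commit above (the one replacing `default_rng` and adding an explicit `mode`), the following standalone sketch shows the older-numpy-safe spellings; the shapes and values here are illustrative, not taken from the patch.

```python
import numpy as np

# np.random.default_rng() only exists in newer numpy; randint works everywhere
a = np.random.randint(0, 8, (4, 128), dtype="uint8")

# older numpy requires `mode` to be passed to np.pad explicitly;
# newer numpy defaults `mode` to "constant"
x = np.arange(6, dtype="float32").reshape(2, 3)
padded = np.pad(x, ((1, 1), (2, 2)), "constant")
assert padded.shape == (4, 7)
```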
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This data file is read when Jenkins runs jobs, to determine which docker images to use.
+[jenkins]
+ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea
+ci_cortexm: tlcpack/ci-cortexm:20221013-060115-61c9742ea
+ci_cpu: tlcpack/ci-cpu:20221013-060115-61c9742ea
+ci_gpu: tlcpack/ci-gpu:20221019-060125-0b4836739
+ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea
+ci_i386: tlcpack/ci-i386:20221013-060115-61c9742ea
+ci_lint: tlcpack/ci-lint:20221013-060115-61c9742ea
+ci_minimal: tlcpack/ci-minimal:20221013-060115-61c9742ea
+ci_riscv: tlcpack/ci-riscv:20221013-060115-61c9742ea
+ci_wasm: tlcpack/ci-wasm:20221013-060115-61c9742ea
diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy
index f387687528c0..f1bcc786b72e 100644
--- a/ci/jenkins/generated/arm_jenkinsfile.groovy
+++ b/ci/jenkins/generated/arm_jenkinsfile.groovy
@@ -60,19 +60,21 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-12-05T14:48:42.092397
+// Generated at 2022-12-06T20:56:42.365592
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
-ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea'
-ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739'
-ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea'
-ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea'
-ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea'
-ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea'
-ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea'
-ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea'
-ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea'
-ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea'
+// These are set at runtime from data in ci/jenkins/docker-images.yml, update
+// image tags in that file
+ci_lint = ''
+ci_gpu = ''
+ci_cpu = ''
+ci_minimal = ''
+ci_wasm = ''
+ci_i386 = ''
+ci_cortexm = ''
+ci_arm = ''
+ci_hexagon = ''
+ci_riscv = ''
 
 // Parameters to allow overriding (in Jenkins UI), the images
 // to be used by a given build.
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/cortexm_jenkinsfile.groovy b/ci/jenkins/generated/cortexm_jenkinsfile.groovy index 76dbbbb7a3d8..4b5ba2e104f4 100644 --- a/ci/jenkins/generated/cortexm_jenkinsfile.groovy +++ b/ci/jenkins/generated/cortexm_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:41.929980 +// Generated at 2022-12-06T20:56:42.204393 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy index ad168c591872..378b20db51b0 100644 --- a/ci/jenkins/generated/cpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.120032 +// Generated at 2022-12-06T20:56:42.393957 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy index 246b40c6e9c0..050ef2983e43 100644 --- a/ci/jenkins/generated/docker_jenkinsfile.groovy +++ b/ci/jenkins/generated/docker_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T21:25:49.429894 +// Generated at 2022-12-07T07:10:24.637792 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy index ef357f0d7c0b..48a6619cbab1 100644 --- a/ci/jenkins/generated/gpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:30:23.035868 +// Generated at 2022-12-07T07:10:24.840515 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy index 6296d0c5c868..e5397eee3a9c 100644 --- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy +++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.065368 +// Generated at 2022-12-06T20:56:42.338377 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy index f0170f586721..876670acebba 100644 --- a/ci/jenkins/generated/i386_jenkinsfile.groovy +++ b/ci/jenkins/generated/i386_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.016799 +// Generated at 2022-12-06T20:56:42.288840 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/lint_jenkinsfile.groovy b/ci/jenkins/generated/lint_jenkinsfile.groovy index ee63a1008b13..3aaea4436fcb 100644 --- a/ci/jenkins/generated/lint_jenkinsfile.groovy +++ b/ci/jenkins/generated/lint_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.041376 +// Generated at 2022-12-06T20:56:42.313954 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/minimal_jenkinsfile.groovy b/ci/jenkins/generated/minimal_jenkinsfile.groovy index 4c9f469b3bb6..f8a59ef5734d 100644 --- a/ci/jenkins/generated/minimal_jenkinsfile.groovy +++ b/ci/jenkins/generated/minimal_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T23:21:03.010229 +// Generated at 2022-12-06T20:56:42.235080 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/riscv_jenkinsfile.groovy b/ci/jenkins/generated/riscv_jenkinsfile.groovy index b485e9906f4c..eb62c3731f79 100644 --- a/ci/jenkins/generated/riscv_jenkinsfile.groovy +++ b/ci/jenkins/generated/riscv_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.170796 +// Generated at 2022-12-06T20:56:42.442689 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/generated/wasm_jenkinsfile.groovy b/ci/jenkins/generated/wasm_jenkinsfile.groovy index 0c7c2ccf2aaa..d43c7f9d24e4 100644 --- a/ci/jenkins/generated/wasm_jenkinsfile.groovy +++ b/ci/jenkins/generated/wasm_jenkinsfile.groovy @@ -60,19 +60,21 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-05T14:48:42.147157 +// Generated at 2022-12-06T20:56:42.420989 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils -ci_lint = 'tlcpack/ci-lint:20221013-060115-61c9742ea' -ci_gpu = 'tlcpack/ci-gpu:20221019-060125-0b4836739' -ci_cpu = 'tlcpack/ci-cpu:20221013-060115-61c9742ea' -ci_minimal = 'tlcpack/ci-minimal:20221013-060115-61c9742ea' -ci_wasm = 'tlcpack/ci-wasm:20221013-060115-61c9742ea' -ci_i386 = 'tlcpack/ci-i386:20221013-060115-61c9742ea' -ci_cortexm = 'tlcpack/ci-cortexm:20221013-060115-61c9742ea' -ci_arm = 'tlcpack/ci-arm:20221013-060115-61c9742ea' -ci_hexagon = 'tlcpack/ci-hexagon:20221013-060115-61c9742ea' -ci_riscv = 'tlcpack/ci-riscv:20221013-060115-61c9742ea' +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence @@ -322,7 +324,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/templates/utils/Prepare.groovy.j2 b/ci/jenkins/templates/utils/Prepare.groovy.j2 index 099bde5bc770..b295bb430853 100644 --- a/ci/jenkins/templates/utils/Prepare.groovy.j2 +++ b/ci/jenkins/templates/utils/Prepare.groovy.j2 @@ -194,7 +194,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./${jenkins_scripts_root}/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}", + script: "./${jenkins_scripts_root}/determine_docker_images.py {% for image in images %}{{ image.name }} {% endfor %}", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py diff --git a/ci/jenkins/templates/utils/base.groovy.j2 b/ci/jenkins/templates/utils/base.groovy.j2 index 0854091c7a65..bb00f99300ff 100644 --- a/ci/jenkins/templates/utils/base.groovy.j2 +++ b/ci/jenkins/templates/utils/base.groovy.j2 @@ -49,16 +49,19 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils {% import 'utils/macros.j2' as m with context -%} -ci_lint = '{{ ci_lint }}' -ci_gpu = '{{ ci_gpu }}' -ci_cpu = '{{ ci_cpu }}' -ci_minimal = '{{ ci_minimal }}' -ci_wasm = '{{ ci_wasm }}' -ci_i386 = '{{ ci_i386 }}' -ci_cortexm = '{{ ci_cortexm }}' -ci_arm = '{{ ci_arm }}' -ci_hexagon = '{{ ci_hexagon }}' -ci_riscv = '{{ ci_riscv }}' + +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' // Parameters to allow overriding (in Jenkins UI), the images // to be used by a given build. 
When provided, they take precedence diff --git a/ci/scripts/jenkins/determine_docker_images.py b/ci/scripts/jenkins/determine_docker_images.py index 82acf2ea46b4..78da9a354629 100755 --- a/ci/scripts/jenkins/determine_docker_images.py +++ b/ci/scripts/jenkins/determine_docker_images.py @@ -20,17 +20,18 @@ import json import logging import urllib.error -from pathlib import Path +import configparser +from pathlib import Path from typing import Dict, Any - from http_utils import get -from cmd_utils import init_log +from cmd_utils import init_log, REPO_ROOT DOCKER_API_BASE = "https://hub.docker.com/v2/" PAGE_SIZE = 25 TEST_DATA = None +IMAGE_TAGS_FILE = REPO_ROOT / "ci" / "jenkins" / "docker-images.ini" def docker_api(url: str, use_pagination: bool = False) -> Dict[str, Any]: @@ -77,6 +78,10 @@ def image_exists(spec: str) -> bool: "--testing-docker-data", help="(testing only) JSON data to mock response from Docker Hub API", ) + parser.add_argument( + "--testing-images-data", + help=f"(testing only) JSON data to mock contents of {IMAGE_TAGS_FILE}", + ) parser.add_argument( "--base-dir", default=".docker-image-names", @@ -85,10 +90,18 @@ def image_exists(spec: str) -> bool: args, other = parser.parse_known_args() name_dir = Path(args.base_dir) + if args.testing_images_data: + repo_image_tags = json.loads(args.testing_images_data) + else: + config = configparser.ConfigParser() + config.read(IMAGE_TAGS_FILE) + repo_image_tags = {} + for name in other: + repo_image_tags[name] = config.get("jenkins", name) + images = {} - for item in other: - name, tag = item.split("=") - images[name] = tag + for name in other: + images[name] = repo_image_tags[name] if args.testing_docker_data is not None: TEST_DATA = json.loads(args.testing_docker_data) diff --git a/docker/bash.sh b/docker/bash.sh index 6919ce9edb65..bd5ca70f43d6 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -306,16 +306,13 @@ source "$(dirname $0)/dev_common.sh" || exit 2 DOCKER_MOUNT=( ) DOCKER_DEVICES=( ) - - # If the user gave a shortcut defined in the Jenkinsfile, use it. EXPANDED_SHORTCUT=$(lookup_image_spec "${DOCKER_IMAGE_NAME}") if [ -n "${EXPANDED_SHORTCUT}" ]; then if [ "${CI+x}" == "x" ]; then DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}" else - python3 ci/scripts/jenkins/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null - echo "HERE HERE HERE" + python3 ci/scripts/jenkins/determine_docker_images.py "$DOCKER_IMAGE_NAME" 2> /dev/null DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME") if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2 diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index 1284fd95fbea..428319a99a0a 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -168,13 +168,35 @@ Docker Images ^^^^^^^^^^^^^ Each CI job runs most of its work inside a Docker container, built from files -in the `docker/ `_ folder. These -files are built nightly in Jenkins via the `docker-images-ci `_ job. -The images for these containers are hosted in the `tlcpack Docker Hub `_ -and referenced in the `Jenkinsfile.j2 `_. These can be inspected and run -locally via standard Docker commands. +in the `docker/ `_ folder. -Adding a new Docker image + +Updating a Docker Image Tag +""""""""""""""""""""""""""" + +To update a tag, a new image needs to be built and uploaded to Docker Hub, then +the image tags in `docker-images.ini `_ +need to be updated to match the image tags on Docker Hub. 
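As a rough sketch of the resolution flow described here (the exact logic lives in `determine_docker_images.py`; the underscore fallback naming rule below is inferred from the test data in this PR, and the existence check is left abstract), a CI-side consumer could resolve an image like so:

```python
import configparser

def resolve_image(name: str, image_exists) -> str:
    # read the pinned tag for e.g. "ci_arm" from ci/jenkins/docker-images.ini
    config = configparser.ConfigParser()
    config.read("ci/jenkins/docker-images.ini")
    spec = config.get("jenkins", name)  # e.g. "tlcpack/ci-arm:20221013-..."
    if image_exists(spec):  # e.g. a Docker Hub API lookup
        return spec
    # not yet promoted: fall back to the staging account with the same tag
    # (staging repos use underscores, e.g. tlcpackstaging/ci_arm)
    repo, tag = spec.split("/", 1)[1].split(":", 1)
    return "tlcpackstaging/{}:{}".format(repo.replace("-", "_"), tag)
```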
+ +Docker images are built automatically nightly via the `docker-images-ci `_, +which uploads the built images to https://hub.docker.com/u/tlcpackstaging once +they have passed CI. Post-merge CI runs on ``main`` build Docker images ad-hoc +and upload them to the ``tlcpackstaging`` Docker Hub account as well. There is an +auto-promotion process for ``tlcpackstaging`` Docker images to be moved to the +``tlcpack`` account. This means that image tags from ``tlcpackstaging`` can be +used in CI and they will be automatically moved to ``tlcpack`` after a successful +post-merge CI run on ``main``. So the steps to update the image are: + +1. Merge a PR that changes the Dockerfiles under ``docker/`` or scripts in ``docker/install``. +2. Do either of: + + a. Wait for the post-merge CI build from the PR to complete and upload the newly built image to the `tlcpackstaging `_ Docker Hub. + b. Wait for the nightly Docker image build to complete and upload the newly built image to the `tlcpackstaging `_ Docker Hub. + +3. Find the newly uploaded image tag on the `tlcpackstaging `_ Docker Hub, for example ``20221208-070144-22ff38dff`` and update the tag in ``ci/jenkins/docker-images.ini`` to use the tlcpackstaging tag but under the tlcpack account, e.g. ``tlcpack/ci-arm:20221208-070144-22ff38dff``. Send in a PR with these changes and wait for it to run through CI to ensure the new images are valid. +4. Merge the ``docker-images.ini`` update PR. Once post-merge CI finishes running on ``main`` the ``tlcpackstaging`` tag will be re-uploaded to ``tlcpack`` automatically. + +Adding a New Docker Image """"""""""""""""""""""""" New docker images can be added to test TVM on a variety of platforms. Here are the steps for adding diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index f5d5a2f0a370..56fad7a3a820 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -91,6 +91,8 @@ "j2", # Jenkinsfiles "groovy", + # Python-parseable config files + "ini", } # List of file names allowed diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index cf31b50b63ec..f83fa40c618d 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -1287,14 +1287,14 @@ def test_open_docker_update_pr( @parameterize_named( use_tlcpack=dict( - images=["ci_arm=tlcpack/ci-arm:abc-abc-123", "ci_lint=tlcpack/ci-lint:abc-abc-234"], + images=["ci_arm", "ci_lint"], expected={ "ci_arm": "tlcpack/ci-arm:abc-abc-123", "ci_lint": "tlcpack/ci-lint:abc-abc-234", }, ), use_staging=dict( - images=["ci_arm2=tlcpack/ci-arm2:abc-abc-123"], + images=["ci_arm2"], expected={ "ci_arm2": "tlcpackstaging/ci_arm2:abc-abc-123", }, @@ -1311,11 +1311,19 @@ def test_determine_docker_images(tmpdir_factory, images, expected): "repositories/tlcpack/ci-lint/tags/abc-abc-234": {}, } + images_data = { + "ci_arm": "tlcpack/ci-arm:abc-abc-123", + "ci_lint": "tlcpack/ci-lint:abc-abc-234", + "ci_arm2": "tlcpack/ci-arm2:abc-abc-123", + } + run_script( [ script, "--testing-docker-data", json.dumps(docker_data), + "--testing-images-data", + json.dumps(images_data), "--base-dir", git_dir, ] From af02ed1a25440e1faa4fa437d25ee20105d009f2 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 9 Dec 2022 09:37:42 +0900 Subject: [PATCH 040/286] [MetaSchedule] Restore `num_threads` parameter in tuning API (#13561) * [MetaSchedule] Restore num_threads argument in tune_relay * pass num_threads to XGBModel * fix default * pass num_threads as max_workers to Builder and Runner * add test * clean up * fix kwarg * 
num_threads -> num_tuning_cores * typo * num_threads -> num_tuning_cores in contrib/torch * typo in document --- python/tvm/contrib/hexagon/meta_schedule.py | 21 +++++---- python/tvm/contrib/torch/as_torch.py | 4 +- .../meta_schedule/cost_model/cost_model.py | 5 +++ .../tvm/meta_schedule/cost_model/xgb_model.py | 7 ++- python/tvm/meta_schedule/relay_integration.py | 12 ++++-- python/tvm/meta_schedule/runner/runner.py | 2 + python/tvm/meta_schedule/tir_integration.py | 8 ++-- python/tvm/meta_schedule/tune.py | 12 ++++-- .../metaschedule_e2e/test_resnet50_int8.py | 43 ++++++++++--------- .../test_hexagon/test_meta_schedule.py | 7 ++- .../test_meta_schedule_relay_integration.py | 1 + 11 files changed, 77 insertions(+), 45 deletions(-) diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py index dcc7d232d8c4..6e1541e498a9 100644 --- a/python/tvm/contrib/hexagon/meta_schedule.py +++ b/python/tvm/contrib/hexagon/meta_schedule.py @@ -128,7 +128,9 @@ def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path return costs -def get_hexagon_local_builder(pass_context: tvm.transform.PassContext = None): +def get_hexagon_local_builder( + pass_context: tvm.transform.PassContext = None, max_workers: Optional[int] = None +): """Return Hexagon-compatible Builder for meta schedule.""" def export_func(mod): @@ -143,13 +145,19 @@ def default_build_with_context( return tvm_build(mod, target=target) if pass_context is not None: - return LocalBuilder(f_build=default_build_with_context, f_export=export_func) + return LocalBuilder( + f_build=default_build_with_context, f_export=export_func, max_workers=max_workers + ) else: - return LocalBuilder(f_export=export_func) + return LocalBuilder(f_export=export_func, max_workers=max_workers) def get_hexagon_rpc_runner( - hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100 + hexagon_launcher: HexagonLauncherRPC, + number=3, + repeat=1, + min_repeat_ms=100, + max_workers: Optional[int] = None, ): """Return Hexagon-compatible RPC Runner for meta schedule. 
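The commit boils down to one user-facing knob that is threaded through the builder, runner, and cost model. A hypothetical end-to-end use might look like the sketch below, where `mod`, `params`, `target`, and `hexagon_launcher` are placeholders and the argument values are illustrative:

```python
from tvm import meta_schedule as ms
from tvm.contrib.hexagon.meta_schedule import (
    get_hexagon_local_builder,
    get_hexagon_rpc_runner,
)

num_cores = 4  # cap the CPU cores used by tuning
database = ms.relay_integration.tune_relay(
    mod=mod,
    params=params,
    target=target,
    work_dir="./tuning_logs",
    max_trials_global=128,
    builder=get_hexagon_local_builder(max_workers=num_cores),
    runner=get_hexagon_rpc_runner(hexagon_launcher, max_workers=num_cores),
    num_tuning_cores=num_cores,  # also sizes XGBModel's nthread internally
)
```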
@@ -177,7 +185,4 @@ def get_hexagon_rpc_runner( enable_cpu_cache_flush=False, ) - return HexagonRPCRunner( - hexagon_launcher, - evaluator_config, - ) + return HexagonRPCRunner(hexagon_launcher, evaluator_config, max_workers=max_workers) diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py index 918ce3ff3b6a..c4ca88adf738 100644 --- a/python/tvm/contrib/torch/as_torch.py +++ b/python/tvm/contrib/torch/as_torch.py @@ -67,7 +67,7 @@ def tune( space: ms.SpaceGenerator.SpaceGeneratorType = "post-order-apply", strategy: ms.SearchStrategy.SearchStrategyType = "replay-trace", task_name: str = "main", - num_threads: Union[Literal["physical", "logical"], int] = "physical", + num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical", seed: Optional[int] = None, ) -> None: """ @@ -100,7 +100,7 @@ def tune( space=space, strategy=strategy, task_name=task_name, - num_threads=num_threads, + num_tuning_cores=num_tuning_cores, seed=seed, ) sch = ms.tir_integration.compile_tir(database, self.ir_module, target) diff --git a/python/tvm/meta_schedule/cost_model/cost_model.py b/python/tvm/meta_schedule/cost_model/cost_model.py index f139fcc4e4b3..c0f6ea5fb9e1 100644 --- a/python/tvm/meta_schedule/cost_model/cost_model.py +++ b/python/tvm/meta_schedule/cost_model/cost_model.py @@ -126,6 +126,11 @@ def create( if kind == "xgb": return XGBModel(*args, **kwargs) # type: ignore + + if "num_tuning_cores" in kwargs: + # num_tuning_cores is only relevant for XGBModel. + kwargs.pop("num_tuning_cores") + if kind == "random": return RandomModel(*args, **kwargs) # type: ignore if kind == "mlp": diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py index 0a2786c6abe0..901e18ce3fa5 100644 --- a/python/tvm/meta_schedule/cost_model/xgb_model.py +++ b/python/tvm/meta_schedule/cost_model/xgb_model.py @@ -333,6 +333,7 @@ def __init__( verbose_eval: int = 25, average_peak_n: int = 32, adaptive_training: bool = True, + num_tuning_cores: Optional[int] = None, ): super().__init__() if not isinstance(extractor, FeatureExtractor): @@ -342,7 +343,11 @@ def __init__( # model-related if config.nthread is None: # use physical core number - config = config._replace(nthread=cpu_count(logical=False)) + if num_tuning_cores is None: + config = config._replace(nthread=cpu_count(logical=False)) + else: + config = config._replace(nthread=num_tuning_cores) + self.config = config # behavior of randomness self.num_warmup_samples = num_warmup_samples diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index df76684d2d42..0b8705aafea9 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -180,7 +180,7 @@ def extracted_tasks_to_tune_contexts( work_dir: str, space: SpaceGenerator.SpaceGeneratorType = "post-order-apply", strategy: SearchStrategy.SearchStrategyType = "evolutionary", - num_threads: Union[Literal["physical", "logical"], int] = "physical", + num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical", seed: Optional[int] = None, ) -> Tuple[List[TuneContext], List[float]]: """Convert ExtractedTask to TuneContext. @@ -195,8 +195,8 @@ def extracted_tasks_to_tune_contexts( The space generator to use. strategy : SearchStrategy.SearchStrategyType The search strategy to use. - num_threads : Union[Literal["physical", "logical"], int] - The number of threads to use in multi-threaded search algorithm. 
+ num_tuning_cores : Union[Literal["physical", "logical"], int] + The number of CPU cores to use during tuning. seed : Optional[int] The random seed to use. @@ -223,7 +223,7 @@ def extracted_tasks_to_tune_contexts( task_name=task.task_name, logger=logger, rand_state=rand_state, - num_threads=num_threads, + num_threads=num_tuning_cores, ).clone() ) task_weights.append(task.weight) @@ -249,6 +249,7 @@ def tune_relay( strategy: SearchStrategy.SearchStrategyType = "evolutionary", seed: Optional[int] = None, module_equality: str = "structural", + num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical", ) -> Database: """Tune a Relay program. @@ -296,6 +297,8 @@ def tune_relay( given module. The "ignore-ndarray" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. + num_tuning_cores : Union[Literal["physical", "logical"], int] + The number of CPU cores to use during tuning. Returns ------- @@ -308,6 +311,7 @@ def tune_relay( space=space, strategy=strategy, seed=seed, + num_tuning_cores=num_tuning_cores, ) return tune_tasks( tasks=tasks, diff --git a/python/tvm/meta_schedule/runner/runner.py b/python/tvm/meta_schedule/runner/runner.py index 1753d8b4abf9..1a8f78414e91 100644 --- a/python/tvm/meta_schedule/runner/runner.py +++ b/python/tvm/meta_schedule/runner/runner.py @@ -194,6 +194,8 @@ def create( # pylint: disable=keyword-arg-before-vararg from . import LocalRunner, RPCRunner # pylint: disable=import-outside-toplevel if kind == "local": + if "max_workers" in kwargs: + kwargs.pop("max_workers") return LocalRunner(*args, **kwargs) # type: ignore elif kind == "rpc": return RPCRunner(*args, **kwargs) # type: ignore diff --git a/python/tvm/meta_schedule/tir_integration.py b/python/tvm/meta_schedule/tir_integration.py index 975987ebcb67..f3d505c28b0e 100644 --- a/python/tvm/meta_schedule/tir_integration.py +++ b/python/tvm/meta_schedule/tir_integration.py @@ -54,7 +54,7 @@ def tune_tir( space: SpaceGenerator.SpaceGeneratorType = "post-order-apply", strategy: SearchStrategy.SearchStrategyType = "evolutionary", task_name: str = "main", - num_threads: Union[Literal["physical", "logical"], int] = "physical", + num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical", seed: Optional[int] = None, ) -> Database: """Tune a TIR function. @@ -89,8 +89,8 @@ def tune_tir( The search strategy. task_name : str The name of the task. - num_threads : Union[Literal["physical", "logical"], int] - The number of threads to use. + num_tuning_cores : Union[Literal["physical", "logical"], int] + The number of CPU cores to use during tuning. seed : Optional[int] The seed for the random number generator. @@ -111,7 +111,7 @@ def tune_tir( task_name=task_name, logger=logger, rand_state=seed, - num_threads=num_threads, + num_threads=num_tuning_cores, ).clone() ], task_weights=[1.0], diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index a69c8f126272..0c4035844c71 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -86,22 +86,28 @@ def tune_tasks( database : Database The database with all tuning records """ + if len(tasks) == 0: + raise ValueError("No tasks to tune.") + if len(tasks) != len(task_weights): raise ValueError( f"Length of tasks ({len(tasks)}) and task_weights ({len(task_weights)}) do not match." 
     )
+
+    num_cores = tasks[0].num_threads
+
     if max_trials_per_task is None:
         max_trials_per_task = max_trials_global
     if not isinstance(builder, Builder):
-        builder = Builder.create(builder)
+        builder = Builder.create(builder, max_workers=num_cores)
     if not isinstance(runner, Runner):
-        runner = Runner.create(runner)
+        runner = Runner.create(runner, max_workers=num_cores)
     if database == "json":
         database = Database.create(database, work_dir=work_dir, module_equality=module_equality)
     elif not isinstance(database, Database):
         database = Database.create(database, module_equality=module_equality)
     if not isinstance(cost_model, CostModel):
-        cost_model = CostModel.create(cost_model)
+        cost_model = CostModel.create(cost_model, num_tuning_cores=num_cores)
     if isinstance(measure_callbacks, MeasureCallback):
         measure_callbacks = [measure_callbacks]
     elif measure_callbacks == "default":
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index e15b0a4e7ddb..1e01cb28a749 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -33,6 +33,7 @@
     get_hexagon_rpc_runner,
 )
 from tvm.meta_schedule import postproc, schedule_rule
+from tvm.meta_schedule.utils import cpu_count
 from tvm.tir.schedule import BlockRV, Schedule
 from tvm.tir.schedule.analysis import has_block
 from tvm.tir.tensor_intrin.hexagon import (
@@ -44,10 +45,24 @@ from ..infrastructure import get_hexagon_target
 
 MODEL_JSON = "resnet50_int8.json"
+MODEL_PARAMS = "resnet50_int8.params"
 EXECUTOR = relay.backend.Executor("graph", {"link-params": True})
 TARGET_LLVM = tvm.target.Target("llvm")
 TARGET_HEXAGON = get_hexagon_target("v68")
-MODEL_PARAMS = "resnet50_int8.params"
+
+
+def load_model():
+    """Load resnet50 model."""
+    if not os.path.exists(MODEL_JSON):
+        pytest.skip(msg="Run python export_models.py first.")
+
+    with open(MODEL_JSON, "r") as file:
+        mod = tvm.ir.load_json(file.read())
+
+    with open(MODEL_PARAMS, "rb") as file:
+        params = relay.load_param_dict(file.read())
+
+    return mod, params
 
 
 def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
@@ -110,6 +125,8 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
     # task extraction and relay.build(...).
     mod = mod.with_attr("executor", EXECUTOR)
 
+    num_cores = cpu_count(logical=False)
+
     with tempfile.TemporaryDirectory() as work_dir:
         database = ms.relay_integration.tune_relay(
             mod=mod,
@@ -125,8 +142,8 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
             # num_trials_per_iter=32,
             # max_trials_per_task=128,
             # strategy="evolutionary",
-            builder=get_hexagon_local_builder(),
-            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            builder=get_hexagon_local_builder(max_workers=num_cores),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20, max_workers=num_cores),
             space=ms.space_generator.PostOrderApply(
                 sch_rules=sch_rules,
                 postprocs=postprocs,
@@ -137,6 +154,7 @@ def tune_vrmpy_auto_tensorize(mod, params, hexagon_launcher):
             # It reduces the number of conv2d tuning tasks in the int8 resnet50 model
             # from 36 to 23, with negligible performance difference.
module_equality="anchor-block", + num_tuning_cores=num_cores, ) return ms.relay_integration.compile_relay( database=database, @@ -156,11 +174,8 @@ def test_resnet50(hexagon_launcher): if not os.path.exists(MODEL_JSON): pytest.skip(msg="Run python export_models.py first.") - with open(MODEL_JSON, "r") as file: - mod = tvm.ir.load_json(file.read()) + mod, params = load_model() - with open(MODEL_PARAMS, "rb") as file: - params = relay.load_param_dict(file.read()) inp = np.random.randn(1, 3, 224, 224).astype("float32") input_name = "image" @@ -231,20 +246,6 @@ def evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, in np.testing.assert_allclose(ref_result, output, atol=1e-4, rtol=1e-5) -def load_model(): - """Load renset50 model.""" - if not os.path.exists(MODEL_JSON): - pytest.skip(msg="Run python export_models.py first.") - - with open(MODEL_JSON, "r") as file: - mod = tvm.ir.load_json(file.read()) - - with open(MODEL_PARAMS, "rb") as file: - params = relay.load_param_dict(file.read()) - - return mod, params - - def _schedule_packed_8x8x32_conv2d(): """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc, using 8x8x32 packed layout. diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index a83a3b279a7f..1089f0f03589 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -73,8 +73,11 @@ def test_builder_runner(hexagon_launcher): mod = MatmulModule - builder = get_hexagon_local_builder() - runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0) + max_workers = 4 + builder = get_hexagon_local_builder(max_workers=max_workers) + runner = get_hexagon_rpc_runner( + hexagon_launcher, number=1, repeat=1, min_repeat_ms=0, max_workers=max_workers + ) (builder_result,) = builder.build([BuilderInput(mod, get_hexagon_target("v68"))]) assert builder_result.artifact_path is not None diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index 021db0f86ad2..062da0b00ca3 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -742,6 +742,7 @@ def _test_anchor_tuning(target): max_trials_global=4, strategy="replay-trace", module_equality=module_equality, + num_tuning_cores=4, ) lib = ms.relay_integration.compile_relay(database, mod, target, params) From afea5b5a50c78aed5764606fdb3bfc2f9ee975f4 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 8 Dec 2022 16:37:51 -0800 Subject: [PATCH 041/286] [TIR] Add preserve_unit_iters option to blockize/tensorize (#13579) * [TIR] Add preserve_unit_iters option to blockize/tensorize * fix --- include/tvm/arith/iter_affine_map.h | 5 +- include/tvm/tir/schedule/schedule.h | 11 +- python/tvm/arith/iter_affine_map.py | 15 +- python/tvm/tir/schedule/schedule.py | 17 +- src/arith/iter_affine_map.cc | 31 +- src/tir/schedule/concrete_schedule.cc | 16 +- src/tir/schedule/concrete_schedule.h | 6 +- src/tir/schedule/primitive.h | 6 +- .../schedule/primitive/blockize_tensorize.cc | 51 ++-- src/tir/schedule/schedule.cc | 6 +- src/tir/schedule/traced_schedule.cc | 20 +- src/tir/schedule/traced_schedule.h | 6 +- .../unittest/test_arith_iter_affine_map.py | 29 ++ ..._meta_schedule_schedule_rule_mlt_intrin.py | 30 +- ...test_meta_schedule_schedule_rule_mlt_tc.py | 41 +-- 
.../test_meta_schedule_trace_apply.py | 278 +++++++++--------- .../unittest/test_tir_schedule_blockize.py | 29 +- 17 files changed, 352 insertions(+), 245 deletions(-) diff --git a/include/tvm/arith/iter_affine_map.h b/include/tvm/arith/iter_affine_map.h index 6b98d84fdf17..0d8bd574ae6e 100644 --- a/include/tvm/arith/iter_affine_map.h +++ b/include/tvm/arith/iter_affine_map.h @@ -396,6 +396,8 @@ Map InverseAffineIterMap(const Array& iter_map, * \param predicate The predicate constraints on the input iterators * \param check_level The iter mapping checking level. * \param analyzer Analyzer used to get context information. + * \param simplify_trivial_iterators If true, iterators with extent of + * 1 will be replaced with a constant value. * * \return The result list has length len(bindings) + 1 [0, len(bindings)): The iter map matching result. The inner list is of length 2. @@ -407,7 +409,8 @@ Map InverseAffineIterMap(const Array& iter_map, Array> SubspaceDivide(const Array& bindings, const Map& input_iters, const Array& sub_iters, const PrimExpr& predicate, - IterMapLevel check_level, arith::Analyzer* analyzer); + IterMapLevel check_level, arith::Analyzer* analyzer, + bool simplify_trivial_iterators = true); /*! * \brief Given an expression that may contain IterMapExpr, transform it to normal PrimExpr. diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 5dbc1b5af395..c4838f2eb8aa 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -563,21 +563,26 @@ class ScheduleNode : public runtime::Object { /*! * \brief Convert the subtree rooted at a specific loop into a block. * \param loop_rv the root of the subtree + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings * \return the new block */ - virtual BlockRV Blockize(const LoopRV& loop_rv) = 0; + virtual BlockRV Blockize(const LoopRV& loop_rv, bool preserve_unit_iters = true) = 0; /*! * \brief Tensorize the computation enclosed by loop with the tensor intrin. * \param loop_rv The loop to be tensorized * \param intrin Name of the tensor intrinsic + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings */ - virtual void Tensorize(const LoopRV& loop_rv, const String& intrin) = 0; + virtual void Tensorize(const LoopRV& loop_rv, const String& intrin, + bool preserve_unit_iters = true) = 0; /*! * \brief Tensorize the computation enclosed by loop with the tensor intrin. * \param block_rv The block to be tensorized * \param intrin Name of the tensor intrinsic + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings */ - virtual void Tensorize(const BlockRV& block_rv, const String& intrin) = 0; + virtual void Tensorize(const BlockRV& block_rv, const String& intrin, + bool preserve_unit_iters = true) = 0; /******** Schedule: Annotation ********/ /*! 
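The header-level entry points above only declare the new preserve_unit_iters flag; the Python bindings follow next. As a minimal sketch of how the flag would be driven from the user-facing schedule API, assuming a toy copy kernel whose (1, 16) shape and names are illustrative only and not part of this patch:

import tvm
from tvm.script import tir as T


@T.prim_func
def copy(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (1, 16), "float32")
    B = T.match_buffer(b, (1, 16), "float32")
    for i, j in T.grid(1, 16):
        with T.block("copy"):
            vi, vj = T.axis.remap("SS", [i, j])
            B[vi, vj] = A[vi, vj]


sch = tvm.tir.Schedule(copy)
i, j = sch.get_loops(sch.get_block("copy"))
# Blockize at j, leaving the unit-extent loop i above the new outer block.
# With preserve_unit_iters=True (the new default) the extent-1 block binding
# stays expressed in terms of i; with False it may be simplified to the
# constant 0, as in the pre-change behavior.
outer_block = sch.blockize(j, preserve_unit_iters=True)
print(sch.mod.script())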
diff --git a/python/tvm/arith/iter_affine_map.py b/python/tvm/arith/iter_affine_map.py index 77d6f418b853..54dbcef32590 100644 --- a/python/tvm/arith/iter_affine_map.py +++ b/python/tvm/arith/iter_affine_map.py @@ -173,7 +173,12 @@ def normalize_iter_map_to_expr(expr): def subspace_divide( - bindings, input_iters, sub_iters, predicate=True, check_level=IterMapLevel.Surjective + bindings, + input_iters, + sub_iters, + predicate=True, + check_level=IterMapLevel.Surjective, + simplify_trivial_iterators=True, ): """Detect if bindings can be written as [a_0*e_0 + b_0 + c_0, a_1*e_1 + b_1, ..., a_n*e_n + b_n] @@ -206,6 +211,10 @@ def subspace_divide( check_level : Union[str, IterMapLevel] Checking level of iteration mapping + simplify_trivial_iterators: bool + If true, iterators with extent of 1 will be replaced with a + constant value. + Returns ------- results : List[List[PrimExpr]] @@ -218,7 +227,9 @@ def subspace_divide( """ if isinstance(check_level, str): check_level = IterMapLevel.from_str(check_level) - return _ffi_api.SubspaceDivide(bindings, input_iters, sub_iters, predicate, check_level) + return _ffi_api.SubspaceDivide( + bindings, input_iters, sub_iters, predicate, check_level, simplify_trivial_iterators + ) def inverse_affine_iter_map(iter_map, outputs): diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 91c42f2a8d1d..5ff9d7131396 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2186,13 +2186,15 @@ def after_set_scope( ########## Schedule: Blockize & Tensorize ########## @type_checked - def blockize(self, loop: LoopRV) -> BlockRV: + def blockize(self, loop: LoopRV, preserve_unit_iters: bool = True) -> BlockRV: """Convert the subtree rooted at a specific loop into a block. Parameters ---------- loop : LoopRV The root of the subtree. + preserve_unit_iters : bool + Whether or not to preserve unit iterators in block bindings Returns ------- @@ -2257,10 +2259,15 @@ def after_blockize( block are divisible by the subspace represented by the loops starting at the given loop. """ - return _ffi_api.ScheduleBlockize(self, loop) # type: ignore # pylint: disable=no-member + return _ffi_api.ScheduleBlockize(self, loop, preserve_unit_iters) # type: ignore # pylint: disable=no-member @type_checked - def tensorize(self, block_or_loop: Union[BlockRV, LoopRV], tensor_intrin: str) -> None: + def tensorize( + self, + block_or_loop: Union[BlockRV, LoopRV], + tensor_intrin: str, + preserve_unit_iters: bool = True, + ) -> None: """Tensorize the computation enclosed by loop with the tensor intrinsic. Parameters @@ -2269,6 +2276,8 @@ def tensorize(self, block_or_loop: Union[BlockRV, LoopRV], tensor_intrin: str) - The loop to be tensorized. tensor_intrin : str The tensor intrin or the name of the tensor intrin. + preserve_unit_iters : bool + Whether or not to preserve unit iterators in block bindings Examples -------- @@ -2402,7 +2411,7 @@ def after_tensorize( ) """ _ffi_api.ScheduleTensorize( # type: ignore # pylint: disable=no-member - self, block_or_loop, tensor_intrin + self, block_or_loop, tensor_intrin, preserve_unit_iters ) ########## Schedule: Annotation ########## diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index adba61632fb2..03a36e803be8 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -1812,18 +1812,26 @@ class SubspaceDivider { // extent of inner PrimExpr inner_extent; + // The kind of the division result. 
+ enum class Kind { + kInner, // Indicates the division result is totally in inner subspace. + kOuter, // Indicates the division result is totally in outer subspace. + kMixed, // Indicates the division result is mixed in both subspaces. + } kind; + DivisionResult(IterMapExpr outer, PrimExpr outer_extent, IterMapExpr inner, - PrimExpr inner_extent) + PrimExpr inner_extent, Kind kind = Kind::kMixed) : outer(std::move(outer)), inner(std::move(inner)), outer_extent(std::move(outer_extent)), - inner_extent(std::move(inner_extent)) {} + inner_extent(std::move(inner_extent)), + kind(kind) {} // whether the division result is totally in outer subspace - bool IsOuter() const { return is_one(inner_extent); } + bool IsOuter() const { return kind == Kind::kOuter; } // whether the division result is totally in inner subspace - bool IsInner() const { return is_one(outer_extent); } + bool IsInner() const { return kind == Kind::kInner; } IterSplitExpr GetOuterAsSplit() const { return GetAsSplit(outer, outer_extent); } @@ -1832,13 +1840,13 @@ class SubspaceDivider { static DivisionResult Inner(const IterMapExpr& iter, const PrimExpr& extent) { auto dtype = iter.dtype(); return DivisionResult(IterSumExpr({}, make_const(dtype, 0)), make_const(dtype, 1), iter, - extent); + extent, Kind::kInner); } static DivisionResult Outer(const IterMapExpr& iter, const PrimExpr& extent) { auto dtype = iter.dtype(); return DivisionResult(iter, extent, IterSumExpr({}, make_const(dtype, 0)), - make_const(dtype, 1)); + make_const(dtype, 1), Kind::kOuter); } // Special value to indicate the division is not possible @@ -2066,9 +2074,11 @@ class SubspaceDivider { Array> SubspaceDivide(const Array& bindings, const Map& input_iters, const Array& sub_iters, const PrimExpr& predicate, - IterMapLevel check_level, arith::Analyzer* analyzer) { + IterMapLevel check_level, arith::Analyzer* analyzer, + bool simplify_trivial_iterators) { if (!IterRangeSanityCheck(input_iters)) return Array>(); - auto res = DetectIterMap(bindings, input_iters, predicate, check_level, analyzer); + auto res = DetectIterMap(bindings, input_iters, predicate, check_level, analyzer, + simplify_trivial_iterators); const Array& maps = res->indices; if (maps.empty()) return {}; @@ -2096,10 +2106,11 @@ Array> SubspaceDivide(const Array& bindings, TVM_REGISTER_GLOBAL("arith.SubspaceDivide") .set_body_typed([](const Array& bindings, const Map& root_iters, - const Array& sub_iters, const PrimExpr& predicate, int check_level) { + const Array& sub_iters, const PrimExpr& predicate, int check_level, + bool simplify_trivial_iterators) { arith::Analyzer ana; return SubspaceDivide(bindings, root_iters, sub_iters, predicate, IterMapLevel(check_level), - &ana); + &ana, simplify_trivial_iterators); }); class InverseAffineIterMapTransformer { diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index a0d29a00f886..7ae0185b425c 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -690,25 +690,29 @@ BlockRV ConcreteScheduleNode::RFactor(const LoopRV& loop_rv, int factor_axis) { } /******** Schedule: Blockize & Tensorize ********/ -BlockRV ConcreteScheduleNode::Blockize(const LoopRV& loop_rv) { +BlockRV ConcreteScheduleNode::Blockize(const LoopRV& loop_rv, bool preserve_unit_iters) { StmtSRef result{nullptr}; TVM_TIR_SCHEDULE_BEGIN(); - result = tir::Blockize(state_, this->GetSRef(loop_rv)); + result = tir::Blockize(state_, this->GetSRef(loop_rv), preserve_unit_iters); this->state_->DebugVerify(); 
TVM_TIR_SCHEDULE_END("blockize", this->error_render_level_); return CreateRV(result); } -void ConcreteScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin) { +void ConcreteScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin, + bool preserve_unit_iters) { TVM_TIR_SCHEDULE_BEGIN(); - tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin).value()); + tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin).value(), + preserve_unit_iters); this->state_->DebugVerify(); TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_); } -void ConcreteScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin) { +void ConcreteScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin, + bool preserve_unit_iters) { TVM_TIR_SCHEDULE_BEGIN(); - tir::Tensorize(state_, this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin).value()); + tir::Tensorize(state_, this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin).value(), + preserve_unit_iters); this->state_->DebugVerify(); TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_); } diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 66fca107715b..2381870760a0 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -137,9 +137,9 @@ class ConcreteScheduleNode : public ScheduleNode { int offset) override; void SetScope(const BlockRV& block_rv, int buffer_index, const String& storage_scope) override; /******** Schedule: Blockize & Tensorize ********/ - BlockRV Blockize(const LoopRV& loop_rv) override; - void Tensorize(const BlockRV& block_rv, const String& intrin) override; - void Tensorize(const LoopRV& loop_rv, const String& intrin) override; + BlockRV Blockize(const LoopRV& loop_rv, bool preserve_unit_iters) override; + void Tensorize(const BlockRV& block_rv, const String& intrin, bool preserve_unit_iters) override; + void Tensorize(const LoopRV& loop_rv, const String& intrin, bool preserve_unit_iters) override; /******** Schedule: Annotation ********/ void Annotate(const LoopRV& loop_rv, const String& ann_key, const ObjectRef& ann_val) override; void Unannotate(const LoopRV& loop_rv, const String& ann_key) override; diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index af1988eaaf36..38931aa27147 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -452,18 +452,20 @@ TVM_DLL void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, in * \brief Convert the subtree rooted at a specific loop into a block. * \param self The state of the schedule * \param loop_sref The root of the subtree + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings * \return The new block */ -TVM_DLL StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref); +TVM_DLL StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref, bool preserve_unit_iters); /*! * \brief Tensorize the computation enclosed by loop with the tensor intrinsic. * \param self The state of the schedule * \param block_or_loop_sref The block or loop to be tensorized. * \param intrin The tensor intrinsic. + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings */ TVM_DLL void Tensorize(ScheduleState self, const StmtSRef& block_or_loop_sref, - const TensorIntrin& intrin); + const TensorIntrin& intrin, bool preserve_unit_iters); /******** Schedule: Annotation ********/ /*! 
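Underneath, Blockize forwards the flag to the arith-level SubspaceDivide used in the implementation that follows. A small sketch of that switch, mirroring the test_subspace_divide_trivial_iters case added later in this patch (the rng helper is illustrative, not part of the patch):

import tvm

x = tvm.tir.Var("x", "int32")
y = tvm.tir.Var("y", "int32")


def rng(extent):
    # Iteration domain [0, extent) for one iterator.
    return tvm.ir.Range.from_min_extent(0, extent)


# x has extent 1. With simplify_trivial_iterators=False the trivial iterator
# survives as a genuine outer component of the division instead of being
# folded into the constant 0; True restores the old simplifying behavior.
res = tvm.arith.subspace_divide(
    [x * 16 + y],
    {x: rng(1), y: rng(16)},
    [y],
    simplify_trivial_iterators=False,
)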
diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index 80a653c544b0..6860927c4d36 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -76,7 +76,7 @@ class SubspaceNotDivisibleError : public ScheduleError { * 1. The binding covers no inner loop vars. * 2. The binding covers only inner loop vars. * - * The bindings are not required to be quasi-affine. + * The bindings are not required to be quasi-affine. Trivial block iters are always preserved. * * \param iter_vars The input iterators * \param bindings The values of iter_vars @@ -146,12 +146,13 @@ Array> TrivialSubspaceDivision(const Array& iter * \param loop_sref The loop that is the root of the second subspace. * \param loops The loops that represents the second part of the subspace. * \param analyzer The arithmetic analyzer to use. + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings */ Array> SubspaceDivide(const BlockRealize& realize, const StmtSRef& block_sref, // const StmtSRef& loop_sref, // std::vector* loops, - arith::Analyzer* analyzer) { + arith::Analyzer* analyzer, bool preserve_unit_iters) { Array inner_vars; Array outer_vars; Map loop_var_domain; @@ -173,7 +174,8 @@ Array> SubspaceDivide(const BlockRealize& realize, } Array> result = arith::SubspaceDivide(realize->iter_values, loop_var_domain, inner_vars, realize->predicate, - arith::IterMapLevel::Surjective, analyzer); + arith::IterMapLevel::Surjective, analyzer, + /*simplify_trivial_iterators=*/!preserve_unit_iters); if (!result.empty()) { return result; } @@ -191,6 +193,7 @@ Array> SubspaceDivide(const BlockRealize& realize, * \param outer_bindings The outer block bindings. * \param inner_iter_vars The inner block iterators. * \param inner_bindings The inner block bindings. + * \param preserve_unit_iters Whether or not to preserve unit iterators in block bindings * \return A substitution plan to the iterators in the original inner block. */ Map DeriveBlockBinding(const Array& iter_vars, // @@ -198,7 +201,7 @@ Map DeriveBlockBinding(const Array& iter_vars, Array* outer_iter_vars, // Array* outer_bindings, // Array* inner_iter_vars, // - Array* inner_bindings) { + Array* inner_bindings, bool preserve_unit_iters) { using arith::IterMapExpr; using arith::IterMapExprNode; using arith::NormalizeIterMapToExpr; @@ -427,7 +430,8 @@ Stmt MakeLoopNest(Stmt stmt, const std::vector& loops) { } BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, - Map* block_sref_reuse, arith::Analyzer* analyzer) { + Map* block_sref_reuse, arith::Analyzer* analyzer, + bool preserve_unit_iters) { TVM_SREF_TO_FOR(loop_sref); // Step 1: Check and get the only block under `loop`. 
BlockRealize block_realize = CheckGetSingleChildBlockRealizeOnSRefTree(self, loop_sref); @@ -436,7 +440,7 @@ BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, // Step 2: Derive subspace division std::vector loops; Array> division = - SubspaceDivide(block_realize, block_sref, loop_sref, &loops, analyzer); + SubspaceDivide(block_realize, block_sref, loop_sref, &loops, analyzer, preserve_unit_iters); if (division.empty()) { throw SubspaceNotDivisibleError(self->mod, GetRef(loops.back()), block); } @@ -450,7 +454,8 @@ BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, Map block_var_subst = // DeriveBlockBinding(block->iter_vars, division, // &outer_iter_vars, &outer_bindings, // - &inner_iter_vars, &inner_bindings); + &inner_iter_vars, &inner_bindings, // + preserve_unit_iters); // Step 4: Do var substitution to adjust to the new block bindings Map inner_iter_dom; for (const IterVar& iter : inner_iter_vars) { @@ -494,10 +499,11 @@ BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, : Optional(NullOpt))); } -StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref) { +StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref, bool preserve_unit_iters) { arith::Analyzer analyzer; Map block_sref_reuse; - BlockRealize blockized = BlockizeImpl(self, loop_sref, &block_sref_reuse, &analyzer); + BlockRealize blockized = + BlockizeImpl(self, loop_sref, &block_sref_reuse, &analyzer, preserve_unit_iters); self->Replace(loop_sref, blockized, block_sref_reuse); StmtSRef result = self->stmt2ref.at(blockized->block.get()); StmtSRef scope_root = tir::GetScopeRoot(self, result, /*require_stage_pipeline=*/false); @@ -507,7 +513,8 @@ StmtSRef Blockize(ScheduleState self, const StmtSRef& loop_sref) { return result; } -void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& intrin) { +void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& intrin, + bool preserve_unit_iters) { // Step 1: Blockize the subtree rooted at the given loop if needed BlockRealize block_realize{nullptr}; Optional old_block = NullOpt; @@ -517,7 +524,7 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int } else if (sref->stmt->IsInstance()) { arith::Analyzer analyzer; Map block_sref_reuse; - block_realize = BlockizeImpl(self, sref, &block_sref_reuse, &analyzer); + block_realize = BlockizeImpl(self, sref, &block_sref_reuse, &analyzer, preserve_unit_iters); } else { LOG(FATAL) << "TypeError: Tensorize only support For or Block, but gets: " << GetRef(sref->stmt); @@ -617,16 +624,17 @@ struct BlockizeTraits : public UnpackedInstTraits { private: static constexpr size_t kNumInputs = 1; - static constexpr size_t kNumAttrs = 0; + static constexpr size_t kNumAttrs = 1; static constexpr size_t kNumDecisions = 0; - static BlockRV UnpackedApplyToSchedule(Schedule sch, LoopRV loop_rv) { - return sch->Blockize(loop_rv); + static BlockRV UnpackedApplyToSchedule(Schedule sch, LoopRV loop_rv, Bool preserve_unit_iters) { + return sch->Blockize(loop_rv, preserve_unit_iters.operator bool()); } - static String UnpackedAsPython(Array outputs, String loop_rv) { + static String UnpackedAsPython(Array outputs, String loop_rv, Bool preserve_unit_iters) { PythonAPICall py("blockize"); py.Input("loop", loop_rv); + py.Input("preserve_unit_iters", preserve_unit_iters.operator bool()); py.SingleOutput(outputs); return py.Str(); } @@ -641,24 +649,27 @@ struct TensorizeTraits : public UnpackedInstTraits 
{ private: static constexpr size_t kNumInputs = 1; - static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumAttrs = 2; static constexpr size_t kNumDecisions = 0; - static void UnpackedApplyToSchedule(Schedule sch, ObjectRef block_or_loop_rv, String intrin) { + static void UnpackedApplyToSchedule(Schedule sch, ObjectRef block_or_loop_rv, String intrin, + Bool preserve_unit_iters) { if (const auto* block = block_or_loop_rv.as()) { - sch->Tensorize(GetRef(block), intrin); + sch->Tensorize(GetRef(block), intrin, preserve_unit_iters.operator bool()); } else if (const auto* loop = block_or_loop_rv.as()) { - sch->Tensorize(GetRef(loop), intrin); + sch->Tensorize(GetRef(loop), intrin, preserve_unit_iters.operator bool()); } else { LOG(FATAL) << "TypeError: Expected Block or Loop, but gets: " << block_or_loop_rv->GetTypeKey(); } } - static String UnpackedAsPython(Array outputs, String block_or_loop_rv, String intrin) { + static String UnpackedAsPython(Array outputs, String block_or_loop_rv, String intrin, + Bool preserve_unit_iters) { PythonAPICall py("tensorize"); py.Input("block_or_loop", block_or_loop_rv); py.Input("tensor_intrin", intrin); + py.Input("preserve_unit_iters", preserve_unit_iters.operator bool()); return py.Str(); } diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index 3fe81c9f433b..d008f3639c78 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -211,11 +211,11 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleSetScope") TVM_REGISTER_GLOBAL("tir.schedule.ScheduleBlockize") .set_body_method(&ScheduleNode::Blockize); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleTensorize") - .set_body_typed([](Schedule self, ObjectRef rv, String intrin) { + .set_body_typed([](Schedule self, ObjectRef rv, String intrin, bool preserve_unit_iters) { if (const auto* block_rv = rv.as()) { - self->Tensorize(GetRef(block_rv), intrin); + self->Tensorize(GetRef(block_rv), intrin, preserve_unit_iters); } else if (const auto* loop_rv = rv.as()) { - self->Tensorize(GetRef(loop_rv), intrin); + self->Tensorize(GetRef(loop_rv), intrin, preserve_unit_iters); } else { LOG(FATAL) << "TypeError: Cannot evaluate the random variable of type: " << rv->GetTypeKey() << ". 
Its value is: " << rv; diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 010730f66c60..00941b48575d 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -442,34 +442,36 @@ void TracedScheduleNode::SetScope(const BlockRV& block_rv, int buffer_index, /******** Schedule: Blockize & Tensorize ********/ -BlockRV TracedScheduleNode::Blockize(const LoopRV& loop_rv) { - BlockRV new_block = ConcreteScheduleNode::Blockize(loop_rv); +BlockRV TracedScheduleNode::Blockize(const LoopRV& loop_rv, bool preserve_unit_iters) { + BlockRV new_block = ConcreteScheduleNode::Blockize(loop_rv, preserve_unit_iters); static const InstructionKind& kind = InstructionKind::Get("Blockize"); trace_->Append(/*inst=*/Instruction( /*kind=*/kind, /*inputs=*/{loop_rv}, - /*attrs=*/{}, + /*attrs=*/{Bool(preserve_unit_iters)}, /*outputs=*/{new_block})); return new_block; } -void TracedScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin) { - ConcreteScheduleNode::Tensorize(loop_rv, intrin); +void TracedScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin, + bool preserve_unit_iters) { + ConcreteScheduleNode::Tensorize(loop_rv, intrin, preserve_unit_iters); static const InstructionKind& kind = InstructionKind::Get("Tensorize"); trace_->Append(/*inst=*/Instruction( /*kind=*/kind, /*inputs=*/{loop_rv}, - /*attrs=*/{intrin}, + /*attrs=*/{intrin, Bool(preserve_unit_iters)}, /*outputs=*/{})); } -void TracedScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin) { - ConcreteScheduleNode::Tensorize(block_rv, intrin); +void TracedScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin, + bool preserve_unit_iters) { + ConcreteScheduleNode::Tensorize(block_rv, intrin, preserve_unit_iters); static const InstructionKind& kind = InstructionKind::Get("Tensorize"); trace_->Append(/*inst=*/Instruction( /*kind=*/kind, /*inputs=*/{block_rv}, - /*attrs=*/{intrin}, + /*attrs=*/{intrin, Bool(preserve_unit_iters)}, /*outputs=*/{})); } diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index cea2096d20a6..80257f644f6b 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -96,9 +96,9 @@ class TracedScheduleNode : public ConcreteScheduleNode { int offset) final; void SetScope(const BlockRV& block_rv, int buffer_index, const String& storage_scope) final; /******** Schedule: Blockize & Tensorize ********/ - BlockRV Blockize(const LoopRV& loop_rv) final; - void Tensorize(const BlockRV& block_rv, const String& intrin) final; - void Tensorize(const LoopRV& loop_rv, const String& intrin) final; + BlockRV Blockize(const LoopRV& loop_rv, bool preserve_unit_iters) final; + void Tensorize(const BlockRV& block_rv, const String& intrin, bool preserve_unit_iters) final; + void Tensorize(const LoopRV& loop_rv, const String& intrin, bool preserve_unit_iters) final; /******** Schedule: Annotation ********/ void Annotate(const LoopRV& loop_rv, const String& ann_key, const ObjectRef& ann_val) override; void Unannotate(const LoopRV& loop_rv, const String& ann_key) override; diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index 6a2fdbbb3f1c..7ae5c58a9507 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -670,6 +670,35 @@ def test_subspace_division(): assert len(res) == 0 +def test_subspace_divide_trivial_iters(): + x 
= tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + z = tvm.tir.Var("z", "int32") + + # trivial 1.1 + res = tvm.arith.subspace_divide( + [x * 16 + y], var_dom([(x, 1), (y, 16)]), [y], simplify_trivial_iterators=False + ) + res = convert_division(res) + assert len(res) == 2 + tvm.ir.assert_structural_equal(res[0][0], x) + tvm.ir.assert_structural_equal(res[0][1], y) + + # trivial 1.2 + res = tvm.arith.subspace_divide( + [x, y], + var_dom([(x, 1), (y, 1)]), + [y], + simplify_trivial_iterators=False, + ) + res = convert_division(res) + assert len(res) == 3 + tvm.ir.assert_structural_equal(res[0][0], x) + tvm.ir.assert_structural_equal(res[0][1], 0) + tvm.ir.assert_structural_equal(res[1][0], 0) + tvm.ir.assert_structural_equal(res[1][1], y) + + def test_complex(): n0 = create_iter("n0", 2) n1 = create_iter("n1", 4) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py index e70f7cb2c618..54f342c3a5d8 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -74,16 +74,16 @@ def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1): for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): with T.block("conv2d_NCHWc_int8_o"): - n = T.axis.spatial(1, 0) + n = T.axis.spatial(1, i0_2 + i0_3 + i0_0 + i0_1) oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(1, 0) - kw = T.axis.reduce(1, 0) + oc_block_o = T.axis.spatial(1, i4_0_2 + i4_0_3 + i4_0_0 + i4_0_1) + kh = T.axis.reduce(1, i5_1 + i5_0) + kw = T.axis.reduce(1, i6_0 + i6_1) ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) @@ -119,16 +119,16 @@ def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac for i0_0, i1_0, i2_0, i3_0, i4_0_0 in T.grid(1, 8, 28, 56, 1): for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): with T.block("conv2d_NCHWc_int8_o"): - n = T.axis.spatial(1, 0) + n = T.axis.spatial(1, i0_2 + i0_3 + i0_0 + i0_1) oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(1, 0) - kw = T.axis.reduce(1, 0) + oc_block_o = T.axis.spatial(1, i4_0_2 + i4_0_3 + i4_0_0 + i4_0_1) + kh = T.axis.reduce(1, i5_1 + i5_0) + kw = T.axis.reduce(1, i6_0 + i6_1) ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) 
ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) @@ -162,16 +162,16 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): with T.block("conv2d_NCHWc_int8_o"): - n = T.axis.spatial(1, 0) + n = T.axis.spatial(1, i0_2 + i0_3 + i0_0 + i0_1) oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(1, 0) - kw = T.axis.reduce(1, 0) + oc_block_o = T.axis.spatial(1, i4_0_2 + i4_0_3 + i4_0_0 + i4_0_1) + kh = T.axis.reduce(1, i5_1 + i5_0) + kw = T.axis.reduce(1, i6_0 + i6_1) ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) - ic_s_inner_o = T.axis.reduce(1, 0) + ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py index acc626b904a1..73b2c990f08a 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py @@ -117,7 +117,7 @@ def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "f for ax0_0, ax1_0 in T.grid(2, 1): with T.block("B_reindex_shared_wmma.matrix_b_o"): v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0) - v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0) T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) @@ -152,7 +152,7 @@ def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "f for ax0_0, ax1_0 in T.grid(2, 1): with T.block("C_reindex_shared_wmma.accumulator_o"): v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) - v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0) T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) 
T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) @@ -396,7 +396,8 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, for ax2_0_1 in T.serial(18): for ax0_0, ax1_0 in T.grid(1, 1): with T.block("PadInput_reindex_shared_wmma.matrix_a_o"): - v0_o, v1_o = T.axis.remap("SS", [ax0_0_1_ax1_0_1_fused, ax2_0_1]) + v0_o = T.axis.spatial(16, ax0_0_1_ax1_0_1_fused + ax0_0) + v1_o = T.axis.spatial(18, ax2_0_1 + ax1_0) T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) @@ -408,7 +409,8 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] for ax0_0, ax1_0 in T.grid(1, 1): with T.block("weight_reindex_shared_wmma.matrix_b_o"): - v0_o, v1_o = T.axis.remap("SS", [ax2_0_1, ax0_0_0_ax1_0_0_fused]) + v0_o = T.axis.spatial(18, ax2_0_1 + ax0_0) + v1_o = T.axis.spatial(2, ax0_0_0_ax1_0_0_fused + ax1_0) T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) @@ -442,7 +444,8 @@ def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") for ax0_0, ax1_0 in T.grid(1, 1): with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): - v0_o, v1_o = T.axis.remap("SS", [ax0_0_1_ax1_0_1_fused, ax0_0_0_ax1_0_0_fused]) + v0_o = T.axis.spatial(16, ax0_0_1_ax1_0_1_fused + ax0_0) + v1_o = T.axis.spatial(2, ax0_0_0_ax1_0_0_fused + ax1_0) T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) @@ -560,7 +563,7 @@ def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, for ax0_0, ax1_0 in T.grid(2, 1): with T.block("A_reindex_shared_wmma.matrix_a_o"): v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0) - v1_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1) + v1_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1 + ax1_0) T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) @@ -572,7 +575,7 @@ def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] for ax0_0, ax1_0 in T.grid(1, 2): with T.block("B_reindex_shared_wmma.matrix_b_o"): - v0_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1) + v0_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1 + ax0_0) 
v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0) T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) @@ -706,7 +709,7 @@ def matmul_relu_global_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 1 for ax2_0_1 in T.serial(2): for ax0_0, ax1_0 in T.grid(1, 2): with T.block("A_reindex_shared_wmma.matrix_a_o"): - v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2) + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0) v1_o = T.axis.spatial(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax1_0) T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) @@ -754,7 +757,7 @@ def matmul_relu_global_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 1 C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") for ax0_0, ax1_0 in T.grid(1, 4): with T.block("C_reindex_wmma.accumulator_o"): - v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2) + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0) v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0) T.reads(C_reindex_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(C[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) @@ -875,7 +878,7 @@ def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 1 for ax0_0, ax1_0 in T.grid(2, 1): with T.block("B_reindex_shared_wmma.matrix_b_o"): v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0) - v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0) T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) @@ -910,7 +913,7 @@ def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 1 for ax0_0, ax1_0 in T.grid(2, 1): with T.block("C_reindex_shared_wmma.accumulator_o"): v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) - v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0) T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) @@ -1001,7 +1004,7 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[ for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 1): for ax0_0_1, ax1_0_1 in T.grid(1, 4): with T.block("PadInput_reindex_shared_wmma.matrix_a_o"): - v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused) + v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused + ax0_0_1) v1_o = T.axis.spatial(4, ax1_0_1) 
T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) @@ -1014,10 +1017,8 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[ PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 4, 1): with T.block("weight_reindex_shared_wmma.matrix_b_o"): - v0 = T.axis.spatial(1, 0) - v1 = T.axis.spatial(1, 0) - v2_o = T.axis.spatial(4, ax2_0) - v3_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused) + v0, v1, v2_o = T.axis.remap("SSS", [ax0, ax1, ax2_0]) + v3_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused + ax3_0) T.reads(weight_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) T.writes(weight_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) @@ -1029,8 +1030,8 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[ weight_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] = weight_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 4, 1, 1): with T.block("conv2d_nhwc_o"): - v0 = T.axis.reduce(1, 0) - v1 = T.axis.reduce(1, 0) + v0 = T.axis.reduce(1, ax0_2 + ax0_0 + ax0_1) + v1 = T.axis.reduce(1, ax1_1 + ax1_2 + ax1_0) v2_o = T.axis.spatial(16, ax2_0_4 + ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused + ax2_0_3) v3_o = T.axis.spatial(4, ax3_0_4 + ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused + ax3_0_3) v4_o = T.axis.reduce(4, ax4_0_0 * 4 + ax4_0_1 * 4 + ax4_0_2) @@ -1053,8 +1054,8 @@ def conv2d_1x1_0(inputs: T.Buffer[(1, 16, 16, 64), "float16"], weight: T.Buffer[ conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v0, v1, v4_o * 16 + v4_i, v3_o * 16 + v3_i], "float32") for ax0_0, ax1_0 in T.grid(1, 1): with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): - v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused) - v1_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused) + v0_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused // 2 * 2 + ax2_0_1_ax3_0_1_fused + ax0_0) + v1_o = T.axis.spatial(4, ax2_0_0_ax3_0_0_fused % 2 * 2 + ax2_0_2_ax3_0_2_fused + ax1_0) T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index c8e6bf6a0c73..9a62207fa261 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -635,26 +635,26 @@ class Conv2dInt8_tensorcore_scheduled: def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), "int8"], p2: 
T.Buffer[(1, 1, 1, 256), "int32"], p3: T.Buffer[(1, 1, 1, 256), "int32"], p4: T.Buffer[(1, 1, 1, 256), "int64"], p5: T.Buffer[(1, 1, 1, 256), "int64"], p6: T.Buffer[(1, 1, 1, 256), "int64"], p7: T.Buffer[(), "int32"], p8: T.Buffer[1, "int32"], p9: T.Buffer[(16, 56, 56, 256), "int32"], compute: T.Buffer[(16, 56, 56, 256), "uint8"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - a0 = T.var("int32") - a1 = T.var("int32") - b0 = T.var("int32") - b1 = T.var("int32") - c0 = T.var("int32") - c1 = T.var("int32") - d0 = T.var("int32") - d0_1 = T.var("int32") - d0_2 = T.var("int32") - d0_3 = T.var("int32") - d1 = T.var("int32") - d1_1 = T.var("int32") - d1_2 = T.var("int32") - d1_3 = T.var("int32") - s0 = T.var("int32") - s0_1 = T.var("int32") - s0_2 = T.var("int32") - s1 = T.var("int32") - s1_1 = T.var("int32") - s1_2 = T.var("int32") + A_s0 = T.var("int32") + A_s0_1 = T.var("int32") + A_s0_2 = T.var("int32") + A_s0_3 = T.var("int32") + A_s1 = T.var("int32") + A_s1_1 = T.var("int32") + A_s1_2 = T.var("int32") + A_s1_3 = T.var("int32") + B_s0 = T.var("int32") + B_s1 = T.var("int32") + C_s0 = T.var("int32") + C_s0_1 = T.var("int32") + C_s0_2 = T.var("int32") + C_s0_3 = T.var("int32") + C_s0_4 = T.var("int32") + C_s1 = T.var("int32") + C_s1_1 = T.var("int32") + C_s1_2 = T.var("int32") + C_s1_3 = T.var("int32") + C_s1_4 = T.var("int32") # body # with T.block("root") conv2d_nhwc_reindex_shared = T.alloc_buffer([50176, 256], dtype="int32", scope="shared") @@ -666,83 +666,81 @@ def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), " for ax2_0_0_ax3_0_0_fused in T.thread_binding(3136, thread="blockIdx.x", annotations={"pragma_auto_unroll_max_step":512, "pragma_unroll_explicit":1}): for ax2_0_1_ax3_0_1_fused in T.thread_binding(1, thread="vthread.x"): for ax2_0_2_ax3_0_2_fused in T.thread_binding(16, thread="threadIdx.x"): - for ax0_0, ax1_0 in T.grid(1, 1): - for ax2_0_3_init, ax3_0_3_init, ax2_0_4_init, ax3_0_4_init in T.grid(1, 1, 1, 1): - with T.block("conv2d_nhwc_o_init"): - v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3_init + ax2_0_4_init) - v3_o = T.axis.spatial(16, ax3_0_4_init + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3_init) - T.reads() - T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) - T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1}) - C = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[d1, d0], scope="wmma.accumulator", offset_factor=16) - T.evaluate(T.tvm_fill_fragment(C.data, 16, 16, 16, C.elem_offset // d1 // 16 * (d1 // 16) + C.elem_offset % d1 // 16, T.float32(0), dtype="handle")) - for ax4_0_0 in T.serial(2): - for ax0_ax1_fused_0 in T.serial(16): - for ax0_ax1_fused_1 in T.thread_binding(16, thread="threadIdx.x"): - for ax0_ax1_fused_2 in T.vectorized(16): - with T.block("pad_temp_reindex_shared"): - v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 8 * 128 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) // 32) - v1 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) % 32) - T.reads(p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1]) - T.writes(pad_temp_reindex_shared[v0, v1]) - T.block_attr({"buffer_dim_align":[[0, 0, 32, 16]]}) - 
pad_temp_reindex_shared[v0, v1] = p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1] - for ax0_ax1_ax2_ax3_fused_0 in T.serial(8): - for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(16, thread="threadIdx.x"): - for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(8): - with T.block("p1_reindex_shared"): - v0 = T.axis.spatial(1, 0) - v1 = T.axis.spatial(1, 0) - v2 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 8 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) // 32) - v3 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) % 32) - T.reads(p1[v2, v0, v1, v3]) - T.writes(p1_reindex_shared[v0, v1, v2, v3]) - T.block_attr({"buffer_dim_align":[[0, 2, 32, 16]]}) - p1_reindex_shared[v0, v1, v2, v3] = p1[v2, v0, v1, v3] - for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 1): - for ax0_0_1, ax1_0_1 in T.grid(1, 2): - with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"): - v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2) - v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax1_0_1) - T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) - T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) - A = T.match_buffer(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[s1, s0], scope="shared", offset_factor=16) - C_1 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[d1_1, d0_1], scope="wmma.matrix_a", offset_factor=16) - T.evaluate(T.tvm_load_matrix_sync(C_1.data, 16, 16, 16, C_1.elem_offset // d1_1 // 16 * (d1_1 // 16) + C_1.elem_offset % d1_1 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A.data, A.elem_offset, s1 * 16, 1, dtype="handle"), s1, "row_major", dtype="handle")) - for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 1, 2): - with T.block("p1_reindex_shared_wmma.matrix_b_o"): + for ax2_0_3_init, ax3_0_3_init, ax2_0_4_init, ax3_0_4_init in T.grid(1, 1, 1, 1): + with T.block("conv2d_nhwc_o_init"): + v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3_init + ax2_0_4_init) + v3_o = T.axis.spatial(16, ax3_0_4_init + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3_init) + T.reads() + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1}) + C = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[C_s0, C_s1], scope="wmma.accumulator", offset_factor=16) + T.tvm_fill_fragment(C.data, 16, 16, 16, C.elem_offset // C_s0 // 16 * (C_s0 // 16) + C.elem_offset % C_s0 // 16, T.float32(0), dtype="handle") + for ax0_0, ax1_0, ax4_0_0 in T.grid(1, 1, 2): + for ax0_ax1_fused_0 in T.serial(16): + for ax0_ax1_fused_1 in T.thread_binding(16, thread="threadIdx.x"): + for ax0_ax1_fused_2 in T.vectorized(16): + with T.block("pad_temp_reindex_shared"): + v0 = T.axis.spatial(50176, ax2_0_0_ax3_0_0_fused // 8 * 128 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) // 32) + v1 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_fused_0 * 256 + ax0_ax1_fused_1 * 16 + ax0_ax1_fused_2) % 32) + T.reads(p0[v0 // 
3136, v0 % 3136 // 56, v0 % 56, v1]) + T.writes(pad_temp_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 16]]}) + pad_temp_reindex_shared[v0, v1] = p0[v0 // 3136, v0 % 3136 // 56, v0 % 56, v1] + for ax0_ax1_ax2_ax3_fused_0 in T.serial(8): + for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(16, thread="threadIdx.x"): + for ax0_ax1_ax2_ax3_fused_2 in T.vectorized(8): + with T.block("p1_reindex_shared"): v0 = T.axis.spatial(1, 0) v1 = T.axis.spatial(1, 0) - v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2) - v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax3_0) - T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) - T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) - A_1 = T.match_buffer(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[s1_1, s0_1], scope="shared", offset_factor=16) - C_2 = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[d1_2, d0_2], scope="wmma.matrix_b", offset_factor=16) - T.evaluate(T.tvm_load_matrix_sync(C_2.data, 16, 16, 16, C_2.elem_offset // d1_2 // 16 * (d1_2 // 16) + C_2.elem_offset % d1_2 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A_1.data, A_1.elem_offset, s1_1 * 16, 1, dtype="handle"), s1_1, "col_major", dtype="handle")) - for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 2, 1, 1): - with T.block("conv2d_nhwc_o_update"): - v0 = T.axis.reduce(1, 0) - v1 = T.axis.reduce(1, 0) - v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4) - v3_o = T.axis.spatial(16, ax3_0_4 + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3) - v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 * 2 + ax4_0_2) - T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16]) - T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) - T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1}) - A_2 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[a1, a0], scope="wmma.matrix_a", offset_factor=16) - B = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[b1, b0], scope="wmma.matrix_b", offset_factor=16) - C_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[c1, c0], scope="wmma.accumulator", offset_factor=16) - T.evaluate(T.tvm_mma_sync(C_3.data, C_3.elem_offset // c1 // 16 * (c1 // 16) + C_3.elem_offset % c1 // 16, A_2.data, A_2.elem_offset // a1 // 16 * (a1 // 16) + A_2.elem_offset % a1 // 16, B.data, B.elem_offset // b1 // 16 * (b1 // 16) + B.elem_offset % b1 // 16, C_3.data, C_3.elem_offset // c1 // 16 * (c1 // 16) + C_3.elem_offset % c1 // 16, dtype="handle")) + v2 = T.axis.spatial(256, ax2_0_0_ax3_0_0_fused % 8 * 32 + 
(ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) // 32) + v3 = T.axis.spatial(64, ax4_0_0 * 32 + (ax0_ax1_ax2_ax3_fused_0 * 128 + ax0_ax1_ax2_ax3_fused_1 * 8 + ax0_ax1_ax2_ax3_fused_2) % 32) + T.reads(p1[v2, v0, v1, v3]) + T.writes(p1_reindex_shared[v0, v1, v2, v3]) + T.block_attr({"buffer_dim_align":[[0, 2, 32, 16]]}) + p1_reindex_shared[v0, v1, v2, v3] = p1[v2, v0, v1, v3] + for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 1): + for ax0_0_1, ax1_0_1 in T.grid(1, 2): + with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax0_0_1) + v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax1_0_1) + T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + A = T.match_buffer(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[A_s0, A_s1], scope="shared", offset_factor=16) + C_1 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int8", strides=[C_s0_1, C_s1_1], scope="wmma.matrix_a", offset_factor=16) + T.tvm_load_matrix_sync(C_1.data, 16, 16, 16, C_1.elem_offset // C_s0_1 // 16 * (C_s0_1 // 16) + C_1.elem_offset % C_s0_1 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A.data, A.elem_offset, A_s0 * 16, 1, dtype="handle"), A_s0, "row_major", dtype="handle") + for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 1, 2): + with T.block("p1_reindex_shared_wmma.matrix_b_o"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax2_0) + v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax3_0) + T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) + T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) + A_1 = T.match_buffer(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[A_s0_1, A_s1_1], scope="shared", offset_factor=16) + C_2 = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int8", strides=[C_s0_2, C_s1_2], scope="wmma.matrix_b", offset_factor=16) + T.tvm_load_matrix_sync(C_2.data, 16, 16, 16, C_2.elem_offset // C_s0_2 // 16 * (C_s0_2 // 16) + C_2.elem_offset % C_s0_2 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int8"), A_1.data, A_1.elem_offset, A_s0_1 * 16, 1, dtype="handle"), A_s0_1, "col_major", dtype="handle") + for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 2, 1, 1): + with T.block("conv2d_nhwc_o_update"): + v0 = T.axis.reduce(1, ax0_2 + ax0_0 + ax0_1) + v1 = T.axis.reduce(1, ax1_1 + ax1_2 + ax1_0) + v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4) + v3_o = T.axis.spatial(16, ax3_0_4 + ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax3_0_3) + v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 * 2 + ax4_0_2) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16]) 
+ T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) + T.block_attr({"meta_schedule.thread_extent_high_inclusive":1024, "meta_schedule.thread_extent_low_inclusive":32, "warp_execution":1}) + A_2 = T.match_buffer(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 : v2_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[A_s0_2, A_s1_2], scope="wmma.matrix_a", offset_factor=16) + B = T.match_buffer(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 : v3_o * 16 + 16, v4_o * 16 : v4_o * 16 + 16], [16, 16], dtype="int8", strides=[B_s0, B_s1], scope="wmma.matrix_b", offset_factor=16) + C_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], [16, 16], dtype="int32", strides=[C_s0_3, C_s1_3], scope="wmma.accumulator", offset_factor=16) + T.tvm_mma_sync(C_3.data, C_3.elem_offset // C_s0_3 // 16 * (C_s0_3 // 16) + C_3.elem_offset % C_s0_3 // 16, A_2.data, A_2.elem_offset // A_s0_2 // 16 * (A_s0_2 // 16) + A_2.elem_offset % A_s0_2 // 16, B.data, B.elem_offset // B_s0 // 16 * (B_s0 // 16) + B.elem_offset % B_s0 // 16, C_3.data, C_3.elem_offset // C_s0_3 // 16 * (C_s0_3 // 16) + C_3.elem_offset % C_s0_3 // 16, dtype="handle") for ax0_0, ax1_0 in T.grid(1, 1): with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): - v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2) - v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2) + v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 8 * 8 + ax2_0_2_ax3_0_2_fused // 2 + ax0_0) + v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 8 * 2 + ax2_0_2_ax3_0_2_fused % 2 + ax1_0) T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) - A_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[d1_3, d0_3], scope="wmma.accumulator", offset_factor=16) - C_4 = T.match_buffer(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[s1_2, s0_2], scope="shared", offset_factor=16) - T.evaluate(T.tvm_store_matrix_sync(A_3.data, 16, 16, 16, A_3.elem_offset // d1_3 // 16 * (d1_3 // 16) + A_3.elem_offset % d1_3 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int32"), C_4.data, C_4.elem_offset, s1_2 * 16, 2, dtype="handle"), s1_2, "row_major", dtype="handle")) + A_3 = T.match_buffer(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[A_s0_3, A_s1_3], scope="wmma.accumulator", offset_factor=16) + C_4 = T.match_buffer(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16], [16, 16], dtype="int32", strides=[C_s0_4, C_s1_4], scope="shared", offset_factor=16) + T.tvm_store_matrix_sync(A_3.data, 16, 16, 16, A_3.elem_offset // A_s0_3 // 16 * (A_s0_3 // 16) + A_3.elem_offset % A_s0_3 // 16, T.tvm_access_ptr(T.type_annotation(dtype="int32"), C_4.data, C_4.elem_offset, C_s0_4 * 16, 2, dtype="handle"), C_s0_4, "row_major", dtype="handle") for ax0, ax1_0 in T.grid(128, 2): for ax1_1 in T.thread_binding(16, thread="threadIdx.x"): with T.block("conv2d_nhwc_reindex_shared"): @@ -1145,45 +1143,44 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 
1, conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32") for i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused in T.parallel(128, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): for i2_1, i3_1, i4_0_1 in T.grid(7, 1, 1): - for i5_0, i6_0 in T.grid(1, 1): - for i1_2_init, i2_2_init, i3_2_init, i1_3_init, i2_3_init, i3_3_init in T.grid(1, 1, 1, 1, 1, 7): - with T.block("conv2d_NCHWc_int8_o_init"): - n = T.axis.spatial(1, 0) - oc_chunk = T.axis.spatial(128, i1_2_init + i1_3_init + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) - oh = T.axis.spatial(7, i2_1 + i2_2_init + i2_3_init) - ow = T.axis.spatial(7, i3_1 * 7 + i3_2_init * 7 + i3_3_init) - oc_block_o = T.axis.spatial(1, 0) - T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - for i4_1 in T.vectorized(16): - with T.block("conv2d_NCHWc_int8_init"): - oc_block_i_init = T.axis.spatial(16, i4_1) - T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 - for i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 7, 1): - with T.block("conv2d_NCHWc_int8_o_update"): - n = T.axis.spatial(1, 0) - oc_chunk = T.axis.spatial(128, i1_2 + i1_3 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) - oh = T.axis.spatial(7, i2_1 + i2_2 + i2_3) - ow = T.axis.spatial(7, i3_1 * 7 + i3_2 * 7 + i3_3) - oc_block_o = T.axis.spatial(1, 0) - kh = T.axis.reduce(1, 0) - kw = T.axis.reduce(1, 0) - ic_outer = T.axis.reduce(32, i7_0 * 8 + i7_1) - ic_f_inner = T.axis.reduce(4, i8_1 + i8_0) - ic_s_inner_o = T.axis.reduce(1, 0) - T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) - B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) - C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x64: T.int8x64 = B[0, 0:64] - B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") - C_i32x16: T.int32x16 = C[0:16] - C[0:16] = T.call_llvm_pure_intrin(intrin_id, T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + for i0_2_init, i1_2_init, i2_2_init, i3_2_init, i4_0_2_init, i0_3_init, i1_3_init, i2_3_init, i3_3_init, i4_0_3_init in T.grid(1, 1, 1, 1, 1, 1, 1, 1, 7, 1): + with T.block("conv2d_NCHWc_int8_o_init"): + n = T.axis.spatial(1, i0_3_init + i0_2_init) + oc_chunk = T.axis.spatial(128, i1_2_init + i1_3_init + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) + oh = T.axis.spatial(7, i2_1 + i2_2_init + i2_3_init) + ow = T.axis.spatial(7, i3_1 * 7 + i3_2_init * 7 + i3_3_init) + oc_block_o = T.axis.spatial(1, i4_0_3_init + i4_0_1 + i4_0_2_init) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) + for i4_1 in T.vectorized(16): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = 
T.axis.spatial(16, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 7, 1): + with T.block("conv2d_NCHWc_int8_o_update"): + n = T.axis.spatial(1, i0_3 + i0_2) + oc_chunk = T.axis.spatial(128, i1_2 + i1_3 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) + oh = T.axis.spatial(7, i2_1 + i2_2 + i2_3) + ow = T.axis.spatial(7, i3_1 * 7 + i3_2 * 7 + i3_3) + oc_block_o = T.axis.spatial(1, i4_0_3 + i4_0_1 + i4_0_2) + kh = T.axis.reduce(1, i5_0 + i5_1) + kw = T.axis.reduce(1, i6_1 + i6_0) + ic_outer = T.axis.reduce(32, i7_0 * 8 + i7_1) + ic_f_inner = T.axis.reduce(4, i8_1 + i8_0) + ic_s_inner_o = T.axis.reduce(1, i9_0_0 + i9_0_1) + T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) + A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) + B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) + C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x64: T.int8x64 = B[0, 0:64] + B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") + C_i32x16: T.int32x16 = C[0:16] + C[0:16] = T.call_llvm_pure_intrin(T.uint32(intrin_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): @@ -1740,8 +1737,8 @@ def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), " for ax0_1, ax1_1, ax4_0_1 in T.grid(1, 1, 2): for ax0_0_1, ax1_0_1 in T.grid(1, 1): with T.block("pad_temp_reindex_shared_wmma.matrix_a_o"): - v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2) - v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1) + v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2 + ax0_0_1) + v1_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1 + ax1_0_1) T.reads(pad_temp_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_a"}) @@ -1753,10 +1750,9 @@ def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), " pad_temp_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = pad_temp_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] for ax0, ax1, ax2_0, ax3_0 in T.grid(1, 1, 2, 1): with T.block("p1_reindex_shared_wmma.matrix_b_o"): - v0 = T.axis.spatial(1, 0) - v1 = T.axis.spatial(1, 0) + v0, v1 = T.axis.remap("SS", [ax0, ax1]) v2_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax2_0) - v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1) + v3_o = T.axis.spatial(4, ax4_0_0 * 2 + ax4_0_1 + 
ax3_0) T.reads(p1_reindex_shared[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) T.writes(p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 : v2_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16]) T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_s8_b_trans"}) @@ -1768,8 +1764,8 @@ def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), " p1_reindex_shared_wmma_matrix_b[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] = p1_reindex_shared[v0, v1, v2_o * 16 + v2_i, v3_o * 16 + v3_i] for ax2_0_3, ax3_0_3, ax0_2, ax1_2, ax4_0_2, ax2_0_4, ax3_0_4 in T.grid(1, 1, 1, 1, 1, 1, 2): with T.block("conv2d_nhwc_o"): - v0 = T.axis.reduce(1, 0) - v1 = T.axis.reduce(1, 0) + v0 = T.axis.reduce(1, ax0_2 + ax0_0 + ax0_1) + v1 = T.axis.reduce(1, ax1_1 + ax1_2 + ax1_0) v2_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2 + ax2_0_3 + ax2_0_4) v3_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax3_0_3 * 2 + ax3_0_4) v4_o = T.axis.reduce(4, ax4_0_0 * 2 + ax4_0_1 + ax4_0_2) @@ -1789,10 +1785,10 @@ def main(p0: T.Buffer[(16, 56, 56, 64), "int8"], p1: T.Buffer[(256, 1, 1, 64), " T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i], pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i]) T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i]) T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) - conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.cast(pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i], "int32") * T.cast(p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i], "int32") + conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v2_o * 16 + v2_i, v3_o * 16 + v3_i] + T.Cast("int32", pad_temp_reindex_shared_wmma_matrix_a[v2_o * 16 + v2_i, v4_o * 16 + v4_i]) * T.Cast("int32", p1_reindex_shared_wmma_matrix_b[v0, v1, v3_o * 16 + v3_i, v4_o * 16 + v4_i]) for ax0_0, ax1_0 in T.grid(1, 2): with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): - v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2) + v0_o = T.axis.spatial(3136, ax2_0_0_ax3_0_0_fused // 4 * 392 + ax2_0_1_ax3_0_1_fused * 2 + ax2_0_2_ax3_0_2_fused // 2 + ax0_0) v1_o = T.axis.spatial(16, ax2_0_0_ax3_0_0_fused % 4 * 4 + ax2_0_2_ax3_0_2_fused % 2 * 2 + ax1_0) T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) @@ -2478,7 +2474,7 @@ def apply_trace(sch): l311, l312, ) = sch.get_loops(block=b296) - b313 = sch.decompose_reduction(block=b296, loop=l302) + b313 = sch.decompose_reduction(block=b296, loop=l300) sch.unannotate(block_or_loop=b313, ann_key="meta_schedule.auto_tensorize") sch.annotate( block_or_loop=b313, @@ -2723,7 +2719,7 @@ def apply_trace(sch): l188, l189, ) = sch.get_loops(block=b165) - b190 = sch.decompose_reduction(block=b165, loop=l172) + b190 = sch.decompose_reduction(block=b165, loop=l170) sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize") 
 sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="")
 b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main")
diff --git a/tests/python/unittest/test_tir_schedule_blockize.py b/tests/python/unittest/test_tir_schedule_blockize.py
index 12836cdb9e68..a68170009bb5 100644
--- a/tests/python/unittest/test_tir_schedule_blockize.py
+++ b/tests/python/unittest/test_tir_schedule_blockize.py
@@ -20,6 +20,7 @@
 from tvm import tir
 from tvm.script import tir as T
 from tvm.tir.schedule.testing import verify_trace_roundtrip
+import pytest

 # fmt: off
 # pylint: disable=no-member,invalid-name,unused-variable,line-too-long,redefined-outer-name,unexpected-keyword-arg,too-many-nested-blocks
@@ -247,7 +248,8 @@ def after_rowsum_blockize(
     verify_trace_roundtrip(sch=s, mod=rowsum)


-def test_blockize_outer_int64_shape():
+@pytest.mark.parametrize("preserve_unit_iters", [True, False])
+def test_blockize_outer_int64_shape(preserve_unit_iters):
     @T.prim_func
     def single_elementwise_int64(
         A: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
@@ -275,10 +277,31 @@ def after_single_elementwise_int64_blockize(
                     vi_i, vj_o * T.int64(16) + vj_i
                 ] + T.float32(1)

+    @T.prim_func
+    def after_single_elementwise_int64_blockize_preserve_unit_iters(
+        A: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+        B: T.Buffer[(T.int64(16), T.int64(128)), "float32"],
+    ) -> None:
+        for i0, j0 in T.grid(T.int64(1), T.int64(8)):
+            with T.block("B_o"):
+                vi_o = T.axis.spatial(T.int64(1), i0)
+                vj_o = T.axis.spatial(T.int64(8), j0)
+                for i1, j1 in T.grid(T.int64(16), T.int64(16)):
+                    with T.block("B"):
+                        vi_i, vj_i = T.axis.remap("SS", [i1, j1])
+                        B[vi_i, vj_o * T.int64(16) + vj_i] = A[
+                            vi_i, vj_o * T.int64(16) + vj_i
+                        ] + T.float32(1)
+
     s = tir.Schedule(single_elementwise_int64, debug_mask="all")
     _, _, i1, _ = s.get_loops(s.get_block("B"))
-    s.blockize(i1)
-    tvm.ir.assert_structural_equal(s.mod["main"], after_single_elementwise_int64_blockize)
+    s.blockize(i1, preserve_unit_iters=preserve_unit_iters)
+    expected = (
+        after_single_elementwise_int64_blockize_preserve_unit_iters
+        if preserve_unit_iters
+        else after_single_elementwise_int64_blockize
+    )
+    tvm.ir.assert_structural_equal(s.mod["main"], expected)
     verify_trace_roundtrip(sch=s, mod=single_elementwise_int64)

From e946e999cc9ef9c253dc3278bd699d26daacce3a Mon Sep 17 00:00:00 2001
From: "Fred.Jia"
Date: Fri, 9 Dec 2022 08:38:34 +0800
Subject: [PATCH 042/286] [TIR][Transform] Keep the order of allocated buffers
 after updating the buffer allocation location (#13560)

[TIR][Transform] Keep the order of allocated buffers after updating the buffer allocation location
---
 .../plan_update_buffer_allocation_location.cc | 63 ++++++++++++++++---
 ..._plan_update_buffer_allocation_location.py | 25 ++++++--
 2 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/src/tir/transforms/plan_update_buffer_allocation_location.cc b/src/tir/transforms/plan_update_buffer_allocation_location.cc
index 90150ebd3cdf..4c63d3393fd8 100644
--- a/src/tir/transforms/plan_update_buffer_allocation_location.cc
+++ b/src/tir/transforms/plan_update_buffer_allocation_location.cc
@@ -48,10 +48,53 @@ class CollectUnmanagedAllocations : public StmtExprVisitor {
   std::unordered_set<const VarNode*> unmanaged_allocations;
 };

+/*! \brief Collect the buffer allocation order.
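+ * Buffers are recorded in a stable order: parameters from the function's
+ * buffer_map come first, followed by every buffer that is allocated by a
+ * block or first touched by a load/store, in visit order. (Descriptive
+ * note added for clarity; it restates what Collect() below implements.)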
+ */
+class BufferAllocateOrderCollector : public StmtExprVisitor {
+ public:
+  static Array<Buffer> Collect(const PrimFunc& func) {
+    BufferAllocateOrderCollector collector;
+    for (const auto& kv : func->buffer_map) {
+      collector.buffer_alloc_recorder_.push_back(kv.second);
+    }
+    collector(func->body);
+    return std::move(collector.buffer_alloc_recorder_);
+  }
+
+ private:
+  void VisitStmt_(const BlockNode* op) final {
+    for (const Buffer& buffer : op->alloc_buffers) {
+      buffer_alloc_recorder_.push_back(buffer);
+    }
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
+  void VisitExpr_(const BufferLoadNode* op) final {
+    if (std::find(buffer_alloc_recorder_.begin(), buffer_alloc_recorder_.end(), op->buffer) ==
+        buffer_alloc_recorder_.end()) {
+      buffer_alloc_recorder_.push_back(op->buffer);
+    }
+    StmtExprVisitor::VisitExpr_(op);
+  }
+
+  void VisitStmt_(const BufferStoreNode* op) final {
+    if (std::find(buffer_alloc_recorder_.begin(), buffer_alloc_recorder_.end(), op->buffer) ==
+        buffer_alloc_recorder_.end()) {
+      buffer_alloc_recorder_.push_back(op->buffer);
+    }
+    StmtExprVisitor::VisitStmt_(op);
+  }
+
+  /*! \brief The recorder of the buffer allocation order. */
+  Array<Buffer> buffer_alloc_recorder_;
+};
+
 class BufferAllocationLocator : public StmtExprMutator {
  public:
   explicit BufferAllocationLocator(const PrimFunc& func) {
     Map<Buffer, Optional<Stmt>> buffer_lca = DetectBufferAccessLCA(func);
+    // The buffer_alloc_recorder Array is used to keep the buffer allocation order
+    // since the buffer_lca Map is unordered.
+    Array<Buffer> buffer_alloc_recorder = BufferAllocateOrderCollector::Collect(func);
     std::unordered_set<const VarNode*> arg_buffer_vars;
     CollectUnmanagedAllocations collector;
     collector(func->body);
@@ -63,16 +106,18 @@ class BufferAllocationLocator : public StmtExprMutator {
       buffer_data_to_buffer_.Set(buffer->data, buffer);
     }
     // create buffers to be allocated at each stmts
-    for (const auto& kv : buffer_lca) {
-      const Buffer& buffer = kv.first;
-      const StmtNode* stmt = kv.second.get();
-      if (arg_buffer_vars.count(buffer->data.get())) {
-        continue;
+    for (const auto& buffer : buffer_alloc_recorder) {
+      auto it = buffer_lca.find(buffer);
+      if (it != buffer_lca.end()) {
+        const StmtNode* stmt = (*it).second.get();
+        if (arg_buffer_vars.count(buffer->data.get())) {
+          continue;
+        }
+        if (!unmanaged_allocations_.count(buffer->data.get())) {
+          alloc_buffers_[stmt].push_back(buffer);
+        }
+        buffer_data_to_buffer_.Set(buffer->data, buffer);
       }
-      if (!unmanaged_allocations_.count(buffer->data.get())) {
-        alloc_buffers_[stmt].push_back(buffer);
-      }
-      buffer_data_to_buffer_.Set(buffer->data, buffer);
     }
   }

diff --git a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
index 34d82f86a422..92e3cbd66e2f 100644
--- a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
+++ b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py
@@ -245,11 +245,13 @@ def test_lower_te():

 def test_loop_carried_dependency():
     """The buffer allocation should be above opaque iter var's loop scopes
-    such that buffer accesses with loop carried dependencies are covered."""
+    such that buffer accesses with loop carried dependencies are covered,
+    and the allocated buffers should keep their order."""

     @T.prim_func
     def before(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]):
         C = T.alloc_buffer([8, 8, 8], dtype="int32")
+        D = T.alloc_buffer([8, 8, 8], dtype="int32")
         for i in T.serial(8):
             for j in T.serial(8):
                 for k in
T.serial(8): @@ -258,10 +260,16 @@ def before(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]): C[vi, vj, vk] = A[vi, vj, vk] + 1 for k in T.serial(8): with T.block("b1"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + D[vi, vj, vk] = A[vi, vj, vk] + 2 + for k in T.serial(8): + with T.block("b2"): vi, vk = T.axis.remap("SS", [i, k]) vj = T.axis.opaque(8, j) - B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else( - 0 < vj, C[vi, vj - 1, vk], 0, dtype="int32" + B[vi, vj, vk] = ( + C[vi, vj, vk] + + T.if_then_else(0 < vj, C[vi, vj - 1, vk], 0, dtype="int32") + + D[vi, vj, vk] ) @T.prim_func @@ -271,6 +279,7 @@ def after(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]) -> N T.reads(A[i, 0:8, 0:8]) T.writes(B[i, 0:8, 0:8]) C = T.alloc_buffer([8, 8, 8], dtype="int32") + D = T.alloc_buffer([8, 8, 8], dtype="int32") for j in T.serial(8): for k in T.serial(8): with T.block("b0"): @@ -278,10 +287,16 @@ def after(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]) -> N C[vi, vj, vk] = A[vi, vj, vk] + 1 for k in T.serial(8): with T.block("b1"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + D[vi, vj, vk] = A[vi, vj, vk] + 2 + for k in T.serial(8): + with T.block("b2"): vi, vk = T.axis.remap("SS", [i, k]) vj = T.axis.opaque(8, j) - B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else( - 0 < vj, C[vi, vj - 1, vk], 0, dtype="int32" + B[vi, vj, vk] = ( + C[vi, vj, vk] + + T.if_then_else(0 < vj, C[vi, vj - 1, vk], 0, dtype="int32") + + D[vi, vj, vk] ) _check(before, after) From 8bcef00361cda6a46c68f907b92bcb9ccc351812 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 9 Dec 2022 03:22:25 -0600 Subject: [PATCH 043/286] [Hexagon] Skip test if "onnx" module not available (#13585) --- tests/python/contrib/test_hexagon/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/contrib/test_hexagon/test_models.py b/tests/python/contrib/test_hexagon/test_models.py index f91b660766ed..007a70495462 100644 --- a/tests/python/contrib/test_hexagon/test_models.py +++ b/tests/python/contrib/test_hexagon/test_models.py @@ -28,7 +28,7 @@ def get_mobilenet(): """Download and import mobilenet model with ONNX""" - import onnx # pylint: disable=import-outside-toplevel + onnx = pytest.importorskip("onnx") model_url = "https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx" # pylint: disable=line-too-long model_path = tvm.contrib.download.download_testdata( From 5e20a14cab3397387c010a6eab2f19d5c9114e68 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 9 Dec 2022 20:00:57 +0900 Subject: [PATCH 044/286] [FQ2I] Support converting `dense` -> `add` to `qnn.dense` -> `add` -> `requantize` (#13578) * wip * hack to convert size-1 scale and zp tensors to scalar * fix to binary op fast path * check output zp * add assert * add comment * lint * clean up beta handling * use regular binary op only for 32 bit add (bias addition) * do float(beta) when we know that beta is not None * restore original beta handling code to avoid mul by 1 * add comment on overflow --- python/tvm/relay/frontend/onnx.py | 5 ++- .../transform/fake_quantization_to_integer.py | 31 +++++++++++++ .../fake_quantization_to_integer.cc | 2 +- .../test_pass_fake_quantization_to_integer.py | 43 ++++++++++++++++--- 4 files changed, 72 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index d185d143c7a6..62f0f4b2dd25 100644 --- a/python/tvm/relay/frontend/onnx.py +++ 
b/python/tvm/relay/frontend/onnx.py
@@ -1406,7 +1406,10 @@ def _impl_v1(cls, inputs, attr, params):
             inputs[0] *= _expr.const(alpha, dtype=dtype)
         out = _op.nn.dense(inputs[0], inputs[1], units=channels)
         if len(inputs) == 3:
-            out = out + _expr.const(beta, dtype=dtype) * inputs[2]
+            if beta != 1.0:
+                out += _expr.const(float(beta), dtype=dtype) * inputs[2]
+            else:
+                out += inputs[2]
         return out

diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py
index 46bdd94ace1a..84b1f33e98cc 100644
--- a/python/tvm/relay/transform/fake_quantization_to_integer.py
+++ b/python/tvm/relay/transform/fake_quantization_to_integer.py
@@ -502,6 +502,37 @@ def register_binary_qnn(op_name, op):

     def binary(expr, type_map):
         left, right, left_t, right_t, out_t = get_binary_types(expr, type_map)
+
+        if (
+            op_name == "add"
+            and approx_equal(left_t.scale, right_t.scale)
+            and approx_equal(left_t.zero_point, right_t.zero_point)
+            and tvm.ir.structural_equal(left_t.dtype, right_t.dtype)
+            and left_t.dtype == "int32"
+            and approx_equal(left_t.scale, out_t.scale)
+            and approx_equal(left_t.zero_point, out_t.zero_point)
+            and np.all(out_t.zero_point.data.numpy() == 0)
+        ):
+            # If this add op comes after conv2d or dense, out_t.scale and out_t.zero_point
+            # can be a vector, which is not supported by QNN binary operators.
+            # In particular, the pattern of an `add` op following `dense`, where the addition is
+            # really a bias addition, can come up often. We identify that pattern and convert it to
+            # `qnn.dense` -> `add`.
+            # To avoid overflow, we do this conversion only when the input data type is 32 bit (bias
+            # addition is typically done in 32 bit).
+            return [left + right, left_t]
+
+        assert (
+            len(out_t.scale.data.shape) == 0
+        ), "The output scale needs to be a scalar, but got a tensor of shape {}".format(
+            out_t.scale.data.shape
+        )
+        assert (
+            len(out_t.zero_point.data.shape) == 0
+        ), "The output zero point needs to be a scalar, but got a tensor of shape {}".format(
+            out_t.zero_point.data.shape
+        )
+
         out = op(
             left,
             right,
diff --git a/src/relay/transforms/fake_quantization_to_integer.cc b/src/relay/transforms/fake_quantization_to_integer.cc
index eb176df5c978..31353d5aa25e 100644
--- a/src/relay/transforms/fake_quantization_to_integer.cc
+++ b/src/relay/transforms/fake_quantization_to_integer.cc
@@ -193,7 +193,7 @@ class SubgraphMutator : public ExprMutator {
       return Mutate(expr);
     } catch (std::exception& e) {
       if (hard_fail_) {
-        throw e;
+        LOG(FATAL) << e.what();
       } else {
         DLOG(INFO) << "Ran into an error rewriting a subgraph, skipping" << expr << std::endl;
         return expr;
diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py
index 569bd9d7d653..d384635e42e5 100644
--- a/tests/python/relay/test_pass_fake_quantization_to_integer.py
+++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py
@@ -154,6 +154,41 @@ def test_fake_quantize_dense_per_channel():
     compare_fq_to_int(op, [x_np, w_np], allow_rounding_error=True)


+def test_fake_quantize_dense_bias():
+    out_dtype = "int8"
+    x = relay.var("x", shape=[128, 64], dtype="int8")
+    w = relay.var("w", shape=[256, 64], dtype="int8")
+    bias = relay.var("bias", shape=[256], dtype="int32")
+    one = relay.const(1.0)
+    zero = relay.const(0)
+    w_scale = np.random.random([256]).astype("float32")
+
+    op = relay.op.nn.dense(
+        relay.qnn.op.dequantize(x, relay.const(2.0), zero),
+        relay.qnn.op.dequantize(
+            w,
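+            # NOTE (editorial comment, not part of the upstream commit): w_scale is
+            # a per-channel scale vector (axis=0 below), and the bias is dequantized
+            # with 2.0 * w_scale = input_scale * weight_scale, so this test exercises
+            # the dense -> add bias pattern described in the commit message.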
relay.const(w_scale), + zero, + axis=0, + ), + units=256, + ) + + op += relay.qnn.op.dequantize( + bias, + relay.const(2.0 * w_scale), + zero, + ) + + op = relay.qnn.op.quantize(op, one, zero, out_dtype=out_dtype) + + x_np = np.random.randint(-128, 127, size=[128, 64], dtype="int8") + w_np = np.random.randint(-128, 127, size=[256, 64], dtype="int8") + bias_np = np.random.randint(-128, 127, size=[256], dtype="int32") + + compare_fq_to_int(op, [x_np, w_np, bias_np], allow_rounding_error=True) + + def test_fake_quantize_batch_matmul(): for out_dtype in ["int8", "uint8"]: x = relay.var("x", shape=[1, 128, 64], dtype="int8") @@ -976,15 +1011,9 @@ def test_fq_qat_positive_nothing_to_do(): op1 = relay.qnn.op.quantize( relay.const(1.0), relay.const(12.0), relay.const(0), out_dtype="int32" ) - op2 = relay.qnn.op.add( + op2 = relay.op.add( op0, op1, - relay.const(12.0), - relay.const(0), - relay.const(12.0), - relay.const(0), - relay.const(12.0), - relay.const(0), ) expected_expr = relay.qnn.op.requantize( op2, relay.const(12.0), relay.const(0), relay.const(1.0), relay.const(0), out_dtype="int8" From 9a3bec80421cce5e10b00230e8ace1be58fcdf5d Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 9 Dec 2022 21:15:28 -0700 Subject: [PATCH 045/286] [ci][docker] Allow usage of ECR images in PRs (#13590) This fixes `ecr_pull` so that `docker-images.ini` can be updated with Docker images from a previous CI run for testing purposes Example run: https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-cortexm/detail/PR-13590/4/pipeline/#step-80-log-9 --- ci/jenkins/generated/arm_jenkinsfile.groovy | 46 +++++++++- .../generated/cortexm_jenkinsfile.groovy | 46 +++++++++- ci/jenkins/generated/cpu_jenkinsfile.groovy | 46 +++++++++- .../generated/docker_jenkinsfile.groovy | 87 +++++++++---------- ci/jenkins/generated/gpu_jenkinsfile.groovy | 46 +++++++++- .../generated/hexagon_jenkinsfile.groovy | 46 +++++++++- ci/jenkins/generated/i386_jenkinsfile.groovy | 46 +++++++++- ci/jenkins/generated/lint_jenkinsfile.groovy | 46 +++++++++- .../generated/minimal_jenkinsfile.groovy | 46 +++++++++- ci/jenkins/generated/riscv_jenkinsfile.groovy | 46 +++++++++- ci/jenkins/generated/wasm_jenkinsfile.groovy | 46 +++++++++- .../templates/docker_jenkinsfile.groovy.j2 | 41 --------- ci/jenkins/templates/utils/Prepare.groovy.j2 | 44 +++++++++- ci/scripts/jenkins/determine_docker_images.py | 6 +- 14 files changed, 520 insertions(+), 118 deletions(-) diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy index f1bcc786b72e..0fc71b430ca0 100644 --- a/ci/jenkins/generated/arm_jenkinsfile.groovy +++ b/ci/jenkins/generated/arm_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.365592 +// Generated at 2022-12-09T15:39:24.387114 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: 
true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/cortexm_jenkinsfile.groovy b/ci/jenkins/generated/cortexm_jenkinsfile.groovy index 4b5ba2e104f4..25846f5b4b5e 100644 --- a/ci/jenkins/generated/cortexm_jenkinsfile.groovy +++ b/ci/jenkins/generated/cortexm_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.204393 +// Generated at 2022-12-09T15:39:24.437899 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy index 378b20db51b0..f9ede00399a2 100644 --- a/ci/jenkins/generated/cpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.393957 +// Generated at 2022-12-09T15:39:24.540570 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy index 050ef2983e43..9e1946c194e6 100644 --- a/ci/jenkins/generated/docker_jenkinsfile.groovy +++ b/ci/jenkins/generated/docker_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-07T07:10:24.637792 +// Generated at 2022-12-09T15:39:24.508775 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', @@ -544,47 +584,6 @@ def ecr_push(full_name) { return ecr_name } -def ecr_pull(full_name) { - aws_account_id = sh( - returnStdout: true, - script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', - label: 'Get AWS ID' - ).trim() - - try { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION=us-west-2', - "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { - sh( - script: ''' - set -eux - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO - ''', - label: 'Log in to ECR' - ) - sh( - script: """ - set -eux - . 
ci/scripts/retry.sh - retry 5 docker pull ${full_name} - """, - label: 'Pull image from ECR' - ) - } - } finally { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION=us-west-2', - "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { - sh( - script: 'docker logout $AWS_ECR_REPO', - label: 'Clean up login credentials' - ) - } - } -} - def build_image(image_name) { hash = sh( returnStdout: true, diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy index 48a6619cbab1..bebc0c4c22a5 100644 --- a/ci/jenkins/generated/gpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-07T07:10:24.840515 +// Generated at 2022-12-09T15:39:24.455336 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy index e5397eee3a9c..c2f39a0d084b 100644 --- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy +++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.338377 +// Generated at 2022-12-09T15:39:24.369191 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy index 876670acebba..ae66fbe3e48c 100644 --- a/ci/jenkins/generated/i386_jenkinsfile.groovy +++ b/ci/jenkins/generated/i386_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.288840 +// Generated at 2022-12-09T15:39:24.421467 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/lint_jenkinsfile.groovy b/ci/jenkins/generated/lint_jenkinsfile.groovy index 3aaea4436fcb..f8dccc863590 100644 --- a/ci/jenkins/generated/lint_jenkinsfile.groovy +++ b/ci/jenkins/generated/lint_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.313954 +// Generated at 2022-12-09T15:39:24.476946 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/minimal_jenkinsfile.groovy b/ci/jenkins/generated/minimal_jenkinsfile.groovy index f8a59ef5734d..6c4abb0bd5af 100644 --- a/ci/jenkins/generated/minimal_jenkinsfile.groovy +++ b/ci/jenkins/generated/minimal_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.235080 +// Generated at 2022-12-09T15:39:24.492813 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/riscv_jenkinsfile.groovy b/ci/jenkins/generated/riscv_jenkinsfile.groovy index eb62c3731f79..7b9bbe7ad399 100644 --- a/ci/jenkins/generated/riscv_jenkinsfile.groovy +++ b/ci/jenkins/generated/riscv_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.442689 +// Generated at 2022-12-09T15:39:24.405262 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . 
${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/generated/wasm_jenkinsfile.groovy b/ci/jenkins/generated/wasm_jenkinsfile.groovy index d43c7f9d24e4..8c8ee0388699 100644 --- a/ci/jenkins/generated/wasm_jenkinsfile.groovy +++ b/ci/jenkins/generated/wasm_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-06T20:56:42.420989 +// Generated at 2022-12-09T15:39:24.526394 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -205,8 +205,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -219,6 +218,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 index db3e6159b82a..07ae49811337 100644 --- a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 @@ -61,47 +61,6 @@ def ecr_push(full_name) { return ecr_name } -def ecr_pull(full_name) { - aws_account_id = sh( - returnStdout: true, - script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', - label: 'Get AWS ID' - ).trim() - - try { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION={{ aws_default_region }}', - "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { - sh( - script: ''' - set -eux - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO - ''', - label: 'Log in to ECR' - ) - sh( - script: """ - set -eux - . 
ci/scripts/retry.sh - retry 5 docker pull ${full_name} - """, - label: 'Pull image from ECR' - ) - } - } finally { - withEnv([ - "AWS_ACCOUNT_ID=${aws_account_id}", - 'AWS_DEFAULT_REGION={{ aws_default_region }}', - "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { - sh( - script: 'docker logout $AWS_ECR_REPO', - label: 'Clean up login credentials' - ) - } - } -} - def build_image(image_name) { hash = sh( returnStdout: true, diff --git a/ci/jenkins/templates/utils/Prepare.groovy.j2 b/ci/jenkins/templates/utils/Prepare.groovy.j2 index b295bb430853..d5aebdc07008 100644 --- a/ci/jenkins/templates/utils/Prepare.groovy.j2 +++ b/ci/jenkins/templates/utils/Prepare.groovy.j2 @@ -75,8 +75,7 @@ def docker_init(image) { if (image.contains("amazonaws.com")) { // If this string is in the image name it's from ECR and needs to be pulled // with the right credentials - // ecr_pull(image) - sh "echo Pulling from AWS is not implemented && exit 1" + ecr_pull(image) } else { sh( script: """ @@ -89,6 +88,47 @@ def docker_init(image) { } } +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION={{ aws_default_region }}', + "AWS_ECR_REPO=${aws_account_id}.{{ aws_ecr_url }}"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + def should_skip_slow_tests(pr_number) { withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', diff --git a/ci/scripts/jenkins/determine_docker_images.py b/ci/scripts/jenkins/determine_docker_images.py index 78da9a354629..41003958dd61 100755 --- a/ci/scripts/jenkins/determine_docker_images.py +++ b/ci/scripts/jenkins/determine_docker_images.py @@ -32,6 +32,7 @@ PAGE_SIZE = 25 TEST_DATA = None IMAGE_TAGS_FILE = REPO_ROOT / "ci" / "jenkins" / "docker-images.ini" +TVM_CI_ECR = "477529581014.dkr.ecr.us-west-2.amazonaws.com" def docker_api(url: str, use_pagination: bool = False) -> Dict[str, Any]: @@ -111,7 +112,10 @@ def image_exists(spec: str) -> bool: name_dir.mkdir(exist_ok=True) images_to_use = {} for filename, spec in images.items(): - if image_exists(spec): + if spec.startswith(TVM_CI_ECR): + logging.info(f"{spec} is from ECR") + images_to_use[filename] = spec + elif image_exists(spec): logging.info(f"{spec} found in tlcpack") images_to_use[filename] = spec else: From 3008e78fa12e7b03b2e3fb1c0c9c3f682e27e461 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Mon, 12 Dec 2022 11:53:11 +0800 Subject: [PATCH 046/286] [TIR][Schedule] Support for specific consumer block targeting in cache_write (#13510) Add optional consumer blocks to cache_write. 
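A minimal usage sketch (illustrative only; the kernel and block names below
are hypothetical, not part of this patch). Consumers listed in
`consumer_blocks` read directly from the new cache stage, while every other
consumer keeps reading the original buffer:

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def func(
        A: T.Buffer[(128, 128), "float32"],
        C0: T.Buffer[(128, 128), "float32"],
        C1: T.Buffer[(128, 128), "float32"],
    ) -> None:
        B = T.alloc_buffer([128, 128], dtype="float32")
        for i, j in T.grid(128, 128):
            with T.block("B"):
                vi, vj = T.axis.remap("SS", [i, j])
                B[vi, vj] = A[vi, vj] * T.float32(2)
        for i, j in T.grid(128, 128):
            with T.block("C0"):
                vi, vj = T.axis.remap("SS", [i, j])
                C0[vi, vj] = B[vi, vj] + T.float32(1)
        for i, j in T.grid(128, 128):
            with T.block("C1"):
                vi, vj = T.axis.remap("SS", [i, j])
                C1[vi, vj] = B[vi, vj] - T.float32(1)

    sch = tvm.tir.Schedule(func)
    # Only block "C0" is redirected to read from the new cache buffer;
    # block "C1" keeps reading B, which is written back from the cache
    # as before.
    sch.cache_write("B", 0, "local", consumer_blocks=["C0"])
    print(sch.mod.script())

This mirrors the `consumer_blocks` argument that `cache_read` already
accepts, so the two cache primitives stay symmetric.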
--- include/tvm/tir/schedule/schedule.h | 4 +- python/tvm/tir/schedule/schedule.py | 11 +- src/tir/schedule/concrete_schedule.cc | 11 +- src/tir/schedule/concrete_schedule.h | 4 +- src/tir/schedule/primitive.h | 4 +- .../schedule/primitive/cache_read_write.cc | 72 ++++++++++-- src/tir/schedule/traced_schedule.cc | 8 +- src/tir/schedule/traced_schedule.h | 4 +- .../test_tir_schedule_cache_read_write.py | 103 ++++++++++++++++++ 9 files changed, 198 insertions(+), 23 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index c4838f2eb8aa..8b22c173a3d8 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -399,10 +399,12 @@ class ScheduleNode : public runtime::Object { * \param block_rv The producer of the buffer * \param write_buffer_index The index of the buffer in block's write region * \param storage_scope The target storage scope + * \param consumer_blocks An optional list of consumers to read from cache directly. * \return The cache stage block. */ virtual BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, - const String& storage_scope) = 0; + const String& storage_scope, + const Array consumer_blocks = {}) = 0; /*! * \brief Create 2 blocks that read&write a buffer region into a read/write cache. * It requires the the target block both read & write the target buffer. diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 5ff9d7131396..48850012cbb7 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -1110,6 +1110,7 @@ def cache_write( block: Union[BlockRV, str], write_buffer_index: Union[int, str, Buffer], storage_scope: str, + consumer_blocks=None, ) -> BlockRV: """Create a block that reads a buffer region into a write cache. It requires: @@ -1130,6 +1131,9 @@ def cache_write( storage_scope: str The target storage scope. + consumer_blocks: Optional[List[Union[BlockRV, str]]] + An optional list of consumers that should read directly from the cache. + If not specified, all consumers will read from the original buffer. Returns ------- @@ -1179,6 +1183,11 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: B[vi, vj] = B_local[vi, vj] """ + if consumer_blocks is None: + consumer_blocks = [] + + # Convert any string block names into Block RVs. + consumer_blocks = [self._normalize_block_arg(b) for b in consumer_blocks] block = self._normalize_block_arg(block) if not isinstance(write_buffer_index, int): @@ -1186,7 +1195,7 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: block, write_buffer_index, required_buffer_type="write" ) return _ffi_api.ScheduleCacheWrite( # type: ignore # pylint: disable=no-member - self, block, write_buffer_index, storage_scope + self, block, write_buffer_index, storage_scope, consumer_blocks ) @type_checked diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 7ae0185b425c..163c72eb0777 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -552,10 +552,17 @@ BlockRV ConcreteScheduleNode::CacheRead(const BlockRV& block_rv, int read_buffer } BlockRV ConcreteScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buffer_index, - const String& storage_scope) { + const String& storage_scope, + const Array consumer_blocks) { StmtSRef result{nullptr}; + // Create a new array of SRefs from the consumer block list. 
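+  // (Each schedule-level BlockRV is resolved to its StmtSRef below, since the
+  // underlying TIR primitive identifies consumer blocks by sref.)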
+  Array<StmtSRef> consumer_block_refs = {};
+  for (BlockRV block : consumer_blocks) {
+    consumer_block_refs.push_back(this->GetSRef(block));
+  }
   TVM_TIR_SCHEDULE_BEGIN();
-  result = tir::CacheWrite(state_, this->GetSRef(block_rv), write_buffer_index, storage_scope);
+  result = tir::CacheWrite(state_, this->GetSRef(block_rv), write_buffer_index, storage_scope,
+                           consumer_block_refs);
   TVM_TIR_SCHEDULE_END("cache-write", this->error_render_level_);
   this->state_->DebugVerify();
   return CreateRV<BlockRV>(result);
diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h
index 2381870760a0..899775f2a15d 100644
--- a/src/tir/schedule/concrete_schedule.h
+++ b/src/tir/schedule/concrete_schedule.h
@@ -114,8 +114,8 @@ class ConcreteScheduleNode : public ScheduleNode {
   /******** Schedule: Insert cache stages ********/
   BlockRV CacheRead(const BlockRV& block_rv, int read_buffer_index, const String& storage_scope,
                     const Array<BlockRV> consumer_blocks = {}) override;
-  BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index,
-                     const String& storage_scope) override;
+  BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope,
+                     const Array<BlockRV> consumer_blocks = {}) override;
   Array<BlockRV> CacheInplace(const BlockRV& block_rv, int read_buffer_index,
                               const String& storage_scope) override;
   Array<BlockRV> CacheIndex(const BlockRV& block_rv, int write_buffer_index) override;
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index 38931aa27147..9e7f77f55ea5 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -263,10 +263,12 @@ TVM_DLL StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int r
 * \param block_sref The producer of the buffer
 * \param write_buffer_index The index of the buffer in block's write region
 * \param storage_scope The target storage scope
+ * \param consumer_blocks Array of blocks that consume the cache.
 * \return The cache stage block.
 */
TVM_DLL StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index,
-                            const String& storage_scope);
+                            const String& storage_scope,
+                            const Array<StmtSRef> consumer_blocks = {});
/*!
 * \brief Create 2 blocks that read&write a buffer region into a read/write cache.
 * It requires that the target block both reads & writes the target buffer.
diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 27244f157592..4174a6699e06 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -382,21 +382,34 @@ class CacheLocDetector : public StmtVisitor {
 * writer block of the buffer being applied cache_read or cache_write \param scope_sref The sref
 * of the scope block of the cached block \param info The cache stage info.
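+ * \tparam is_cache_read Whether the cache stage being inserted belongs to a
+ *         cache_read (true) or a cache_write (false).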
 */
+  template <bool is_cache_read>
   static void Detect(const ScheduleState& self, const StmtSRef& block_sref,
                      const StmtSRef& scope_sref, CacheStageInfo* info) {
     std::vector<StmtSRef> related_blocks;
     // If consumers are specified, skip detecting the others
-    if (info->consumer_blocks.size() > 0) {
-      for (StmtSRef consumer : info->consumer_blocks) {
-        related_blocks.emplace_back(consumer);
+    if (is_cache_read) {
+      if (info->consumer_blocks.size() > 0) {
+        for (StmtSRef consumer : info->consumer_blocks) {
+          related_blocks.emplace_back(consumer);
+        }
+      } else {
+        for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) {
+          if (def->kind == DepKind::kRAW) {
+            related_blocks.push_back(def->dst);
+          }
+        }
       }
     } else {
       for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) {
         if (def->kind == DepKind::kRAW) {
+          if (info->consumer_blocks.count(def->dst)) {
+            continue;
+          }
           related_blocks.push_back(def->dst);
         }
       }
     }
+
     if (!related_blocks.empty()) {
       CacheLocDetector detector(self, block_sref, scope_sref, related_blocks);
       detector(GetRef<Stmt>(scope_sref->stmt));
@@ -739,6 +752,30 @@ class CacheWriteRewriter : public StmtExprMutator {
   Stmt VisitStmt_(const BlockNode* block) final {
     Block old_stmt = GetRef<Block>(block);
+
+    // Check if this block is one of the specified cache consumers; if so,
+    // redirect its reads to the cache buffer.
+    for (StmtSRef consumer_sref : info_->consumer_blocks) {
+      const BlockNode* consumer_node = TVM_SREF_TO_BLOCK(consumer_sref);
+      Block consumer_block = GetRef<Block>(consumer_node);
+      if (old_stmt.same_as(consumer_block)) {
+        Array<BufferRegion> reads =
+            ReplaceBuffer(block->reads, info_->write_buffer, info_->read_buffer);
+        Array<MatchBufferRegion> match_buffers =
+            ReplaceBuffer(block->match_buffers, info_->write_buffer, info_->read_buffer);
+        if (!reads.same_as(block->reads) || !match_buffers.same_as(block->match_buffers)) {
+          auto n = CopyOnWrite(block);
+          n->reads = std::move(reads);
+          n->match_buffers = std::move(match_buffers);
+          n->body = VisitStmt(block->body);
+          Block new_consumer = Block(n);
+          info_->block_reuse.Set(old_stmt, new_consumer);
+          return std::move(new_consumer);
+        }
+        return std::move(old_stmt);
+      }
+    }
+
     // We only mutate the block which generates info->write_buffer
     if (block != writer_block_sref_->stmt && block != scope_sref_->stmt && !under_writer_block_) {
       return std::move(old_stmt);
@@ -1160,7 +1197,7 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
     StmtSRef parent_sref = GetRef<StmtSRef>(write_block_sref->parent);
     // Detect insert position
-    CacheLocDetector::Detect(self, write_block_sref, scope_sref, &info);
+    CacheLocDetector::Detect<true>(self, write_block_sref, scope_sref, &info);
     cache_region = RelaxBufferRegion(self, region, write_block_sref, parent_sref, info.loc_sref);
   } else {
     // Case 2. The buffer is the input block for the scope.
@@ -1190,7 +1227,7 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff
 }

 StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_buffer_index,
-                    const String& storage_scope) {
+                    const String& storage_scope, const Array<StmtSRef> consumer_blocks) {
   /*!
    * Check:
    * - The index is in the array of block reading region
@@ -1219,6 +1256,14 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
   // Create the corresponding buffer allocation
   info.alloc = info.read_buffer;

+  // info.consumer_blocks indicates which blocks should consume the cache.
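+  // Record each consumer together with its nested child blocks, so consumers
+  // sitting under loop nests are matched by the rewriter as well.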
+  for (auto consumer : consumer_blocks) {
+    info.consumer_blocks.insert(consumer);
+    for (auto child : tir::GetChildBlocks(self, consumer)) {
+      info.consumer_blocks.insert(child);
+    }
+  }
+
   // Step 3. Check the only writer block.
   ICHECK_EQ(block_sref.get(), GetOnlyWriteBlock(self, scope_sref, write_buffer).get());

@@ -1226,7 +1271,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu
   BufferRegion region = GetBufferRegionFromBuffer(block->writes, write_buffer).value();
   StmtSRef parent_sref = GetRef<StmtSRef>(block_sref->parent);
   // Detect insert position
-  CacheLocDetector::Detect(self, block_sref, scope_sref, &info);
+  CacheLocDetector::Detect<false>(self, block_sref, scope_sref, &info);

   BufferRegion cache_region =
       RelaxBufferRegion(self, region, block_sref, parent_sref, info.loc_sref);
@@ -1468,21 +1513,26 @@ struct CacheWriteTraits : public UnpackedInstTraits<CacheWriteTraits> {
   static constexpr bool kIsPure = false;

 private:
-  static constexpr size_t kNumInputs = 1;
+  static constexpr size_t kNumInputs = 2;
   static constexpr size_t kNumAttrs = 2;
   static constexpr size_t kNumDecisions = 0;

-  static BlockRV UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer write_buffer_index,
+  static BlockRV UnpackedApplyToSchedule(Schedule sch, BlockRV block,
+                                         Array<BlockRV> consumer_blocks, Integer write_buffer_index,
                                          String storage_scope) {
-    return sch->CacheWrite(block, write_buffer_index->value, storage_scope);
+    return sch->CacheWrite(block, write_buffer_index->value, storage_scope, consumer_blocks);
   }

-  static String UnpackedAsPython(Array<String> outputs, String block, Integer write_buffer_index,
-                                 String storage_scope) {
+  static String UnpackedAsPython(Array<String> outputs, String block,
+                                 Array<String> consumer_blocks, Integer write_buffer_index,
+                                 String storage_scope) {
    PythonAPICall py("cache_write");
    py.Input("block", block);
    py.Input("write_buffer_index", write_buffer_index->value);
    py.Input("storage_scope", storage_scope);
+    // Only write out consumer blocks if provided.
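+    // (Skipping the empty case keeps the printed trace of schedules that do
+    // not use this argument identical to before.)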
+ if (!consumer_blocks.empty()) { + py.Input("consumer_blocks", consumer_blocks); + } py.SingleOutput(outputs); return py.Str(); } diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 00941b48575d..70559608e789 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -296,12 +296,14 @@ BlockRV TracedScheduleNode::CacheRead(const BlockRV& block_rv, int read_buffer_i } BlockRV TracedScheduleNode::CacheWrite(const BlockRV& block_rv, int write_buffer_index, - const String& storage_scope) { - BlockRV result = ConcreteScheduleNode::CacheWrite(block_rv, write_buffer_index, storage_scope); + const String& storage_scope, + const Array consumer_blocks) { + BlockRV result = ConcreteScheduleNode::CacheWrite(block_rv, write_buffer_index, storage_scope, + consumer_blocks); static const InstructionKind& kind = InstructionKind::Get("CacheWrite"); trace_->Append(/*inst=*/Instruction(/*kind=*/kind, - /*inputs=*/{block_rv}, + /*inputs=*/{block_rv, consumer_blocks}, /*attrs=*/{Integer(write_buffer_index), storage_scope}, /*outputs=*/{result})); return result; diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index 80257f644f6b..c54574e9c9ff 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -74,8 +74,8 @@ class TracedScheduleNode : public ConcreteScheduleNode { /******** Schedule: Insert cache stages ********/ BlockRV CacheRead(const BlockRV& block_rv, int read_buffer_index, const String& storage_scope, const Array consumer_blocks = {}) final; - BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, - const String& storage_scope) final; + BlockRV CacheWrite(const BlockRV& block_rv, int write_buffer_index, const String& storage_scope, + const Array consumer_blocks = {}) final; Array CacheInplace(const BlockRV& block_rv, int read_buffer_index, const String& storage_scope) final; BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py index 3476ca083056..28c9a13700bf 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -858,6 +858,81 @@ def cache_write_multi_consumer() -> None: C[vi] = A[vi] +@T.prim_func +def cache_write_multi_consumer_B_consume_cache(): + A = T.alloc_buffer([128], dtype="float32") + B = T.alloc_buffer([128], dtype="float32") + C = T.alloc_buffer([128], dtype="float32") + A_global = T.alloc_buffer([128], dtype="float32") + for i in T.serial(8): + for j in T.serial(16): + with T.block("A"): + vi = T.axis.spatial(128, i * 16 + j) + A_global[vi] = 1.0 + for j in T.serial(16): + with T.block("B"): + vi = T.axis.spatial(128, i * 16 + j) + B[vi] = A_global[vi] + 1.0 + for ax0 in T.serial(128): + with T.block("A_global"): + v0 = T.axis.spatial(128, ax0) + A[v0] = A_global[v0] + for i in T.serial(128): + with T.block("C"): + vi = T.axis.spatial(128, i) + C[vi] = A[vi] + + +@T.prim_func +def cache_write_multi_consumer_C_consume_cache(): + A = T.alloc_buffer([128], dtype="float32") + B = T.alloc_buffer([128], dtype="float32") + C = T.alloc_buffer([128], dtype="float32") + A_global = T.alloc_buffer([128], dtype="float32") + for i in T.serial(8): + for j in T.serial(16): + with T.block("A"): + vi = T.axis.spatial(128, i * 16 + j) + A_global[vi] = T.float32(1) + for ax0 in T.serial(16): + with T.block("A_global"): + v0 = 
T.axis.spatial(128, i * 16 + ax0)
+                A[v0] = A_global[v0]
+        for j in T.serial(16):
+            with T.block("B"):
+                vi = T.axis.spatial(128, i * 16 + j)
+                B[vi] = A[vi] + T.float32(1)
+    for i in T.serial(128):
+        with T.block("C"):
+            vi = T.axis.spatial(128, i)
+            C[vi] = A_global[vi]
+
+
+@T.prim_func
+def cache_write_multi_consumer_all_consume_cache():
+    A = T.alloc_buffer([128], dtype="float32")
+    B = T.alloc_buffer([128], dtype="float32")
+    C = T.alloc_buffer([128], dtype="float32")
+    A_global = T.alloc_buffer([128], dtype="float32")
+    for i in T.serial(8):
+        for j in T.serial(16):
+            with T.block("A"):
+                vi = T.axis.spatial(128, i * 16 + j)
+                A_global[vi] = T.float32(1)
+        for j in T.serial(16):
+            with T.block("B"):
+                vi = T.axis.spatial(128, i * 16 + j)
+                B[vi] = A_global[vi] + T.float32(1)
+    for i in T.serial(128):
+        with T.block("C"):
+            vi = T.axis.spatial(128, i)
+            C[vi] = A_global[vi]
+    for ax0 in T.serial(128):
+        with T.block("A_global"):
+            v0 = T.axis.spatial(128, ax0)
+            A[v0] = A_global[v0]
+
+
 @T.prim_func
 def continuous_cache_write(a: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (128, 128))
@@ -1113,6 +1188,34 @@ def test_cache_write_location(use_block_name):
     tvm.ir.assert_structural_equal(cache_write_multi_consumer, sch.mod["main"])
     verify_trace_roundtrip(sch=sch, mod=func_multi_consumer)

+    # Test that specific consumer block targeting works.
+    # B reads the cache buffer and C reads the original output buffer.
+    sch = tir.Schedule(func_multi_consumer, debug_mask="all")
+    block_a = "A" if use_block_name else sch.get_block("A")
+    block_b = "B" if use_block_name else sch.get_block("B")
+    sch.cache_write(block_a, 0, "global", consumer_blocks=[block_b])
+    tvm.ir.assert_structural_equal(cache_write_multi_consumer_B_consume_cache, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=func_multi_consumer)
+
+    # Test that specific consumer block targeting works.
+    # B reads the original output buffer and C reads the cache buffer.
+    sch = tir.Schedule(func_multi_consumer, debug_mask="all")
+    block_a = "A" if use_block_name else sch.get_block("A")
+    block_c = "C" if use_block_name else sch.get_block("C")
+    sch.cache_write(block_a, 0, "global", consumer_blocks=[block_c])
+    tvm.ir.assert_structural_equal(cache_write_multi_consumer_C_consume_cache, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=func_multi_consumer)
+
+    # Test that specific consumer block targeting works.
+ # B and C read cache buffer + sch = tir.Schedule(func_multi_consumer, debug_mask="all") + block_a = "A" if use_block_name else sch.get_block("A") + block_b = "B" if use_block_name else sch.get_block("B") + block_c = "C" if use_block_name else sch.get_block("C") + sch.cache_write(block_a, 0, "global", consumer_blocks=[block_b, block_c]) + tvm.ir.assert_structural_equal(cache_write_multi_consumer_all_consume_cache, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=func_multi_consumer) + def test_continuous_cache_write(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") From f0f23d13de43070080ced139178265789d5fa50c Mon Sep 17 00:00:00 2001 From: wrongtest Date: Mon, 12 Dec 2022 18:53:15 +0800 Subject: [PATCH 047/286] [LLVM] Fix get tm allow_missing check pos (#13591) Fix get tm allow_missing check pos --- src/target/llvm/llvm_instance.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/llvm_instance.cc b/src/target/llvm/llvm_instance.cc index 44454fc6b92d..2aa190ad708e 100644 --- a/src/target/llvm/llvm_instance.cc +++ b/src/target/llvm/llvm_instance.cc @@ -297,9 +297,9 @@ llvm::TargetMachine* LLVMTargetInfo::GetOrCreateTargetMachine(bool allow_missing llvm_instance->createTargetMachine(triple_, cpu_, GetTargetFeatureString(), target_options_, reloc_model_, code_model_, opt_level_); target_machine_ = std::unique_ptr(tm); - if (!allow_missing) { - ICHECK(target_machine_ != nullptr) << error; - } + } + if (!allow_missing) { + ICHECK(target_machine_ != nullptr) << error; } return target_machine_.get(); } From ed52610dad15718a880e3e77352937385de0b2ff Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 12 Dec 2022 22:38:57 +0900 Subject: [PATCH 048/286] [Torch] Stable diffusion support (#13594) * add baddbmm conversion * fix * suppress lint --- python/tvm/relay/frontend/pytorch.py | 18 +++++++++++++++++- tests/python/frontend/pytorch/test_forward.py | 16 +++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 30f14b490b1b..b9d167ad2d86 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1863,6 +1863,13 @@ def chunk(self, inputs, input_types): return _op.split(data, indeces, axis) + def baddbmm(self, inputs, _): + input = inputs[0] + batch1, batch2 = inputs[1:3] + beta = _expr.const(float(inputs[3])) + alpha = _expr.const(float(inputs[4])) + return beta * input + alpha * _op.nn.batch_matmul(batch1, batch2, transpose_b=False) + def matmul(self, inputs, input_types): inputs_0 = inputs[0] @@ -2565,7 +2572,14 @@ def numel(self, inputs, input_types): return _op.ndarray_size(inputs[0]) def empty(self, inputs, input_types): - shape = inputs[0] + shape = [] + for s in inputs[0]: + if isinstance(s, _expr.Constant): + shape.append(s.data.numpy().item()) + else: + assert isinstance(s, int) + shape.append(s) + return _op.zeros(shape, _convert_dtype_value(inputs[1])) def empty_like(self, inputs, input_types): @@ -3621,6 +3635,7 @@ def create_convert_map(self): "aten::unsafe_chunk": self.chunk, "aten::matmul": self.matmul, "aten::bmm": self.matmul, + "aten::baddbmm": self.baddbmm, "aten::expand": self.expand, "aten::Int": self.int, "prim::NumToTensor": self.numtotensor, @@ -4587,6 +4602,7 @@ def from_pytorch( if inp.type().kind() == "TupleType" or inp.type().kind() == "ListType": enable_lower_all_tuples = False break + _run_jit_passes(graph, enable_lower_all_tuples) if custom_convert_map: diff --git 
a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 36bb5bede475..35242fbf7dde 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=import-self, invalid-name, unused-argument +# pylint: disable=import-self, invalid-name, unused-argument, missing-function-docstring """Unit tests for various models and operators""" import os import platform @@ -5038,5 +5038,19 @@ def _test_multinomial(num_samples): ) +@tvm.testing.uses_gpu +def test_baddbmm(): + def test_fn(alpha, beta): + return lambda inp, batch1, batch2: torch.baddbmm( + inp, batch1, batch2, beta=beta, alpha=alpha + ) + + M = torch.randn(10, 3, 5) + batch1 = torch.randn(10, 3, 4) + batch2 = torch.randn(10, 4, 5) + + verify_model(test_fn(0.5, 1.0), [M, batch1, batch2]) + + if __name__ == "__main__": tvm.testing.main() From dc9ac6d349999f6a72f8f3e06215a61d50424b99 Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Tue, 13 Dec 2022 00:02:27 +0300 Subject: [PATCH 049/286] [OpenCL][CI] Enable OpenCL cpp tests in CI (#13400) * [OpenCL][CI] Enable OpenCL cpp tests in CI * Add building gtest for OpenCL in GPU build * Fix CI build * Change OpenCL cpp tests build approach * Fix lint * Try to enable test in CI * Update version of gpu docker image * Change script mod --- CMakeLists.txt | 15 ----- ci/jenkins/docker-images.ini | 2 +- ci/jenkins/generated/gpu_jenkinsfile.groovy | 8 +++ .../templates/gpu_jenkinsfile.groovy.j2 | 8 +++ cmake/modules/OpenCL.cmake | 12 +++- tests/cpp-runtime/opencl/run_gtests.cc | 60 ------------------- .../contrib/test_opencl/test_run_gtests.py | 56 ----------------- tests/scripts/ci.py | 1 + tests/scripts/task_config_build_gpu.sh | 1 + tests/scripts/task_opencl_cpp_unittest.sh | 39 ++++++++++++ 10 files changed, 69 insertions(+), 133 deletions(-) delete mode 100644 tests/cpp-runtime/opencl/run_gtests.cc delete mode 100644 tests/python/contrib/test_opencl/test_run_gtests.py create mode 100755 tests/scripts/task_opencl_cpp_unittest.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cfa48fc045d..b59d5ab69185 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -642,18 +642,6 @@ if(BUILD_FOR_HEXAGON AND DEFINED USE_HEXAGON_GTEST AND EXISTS ${USE_HEXAGON_GTES include_directories("${USE_HEXAGON_GTEST}/include") endif() -if(USE_OPENCL AND DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST}) - include(FetchContent) - FetchContent_Declare(googletest SOURCE_DIR "${USE_OPENCL_GTEST}") - set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - FetchContent_MakeAvailable(googletest) - target_link_libraries(tvm_runtime PUBLIC gtest) - target_link_libraries(tvm PUBLIC gtest) - include_directories("${USE_OPENCL_GTEST}/include") - include_directories("${USE_OPENCL_GTEST}/googletest/include") - message(STATUS "Found OpenCL gtest at ${USE_OPENCL_GTEST}") -endif() - # Set flags for clang include(cmake/modules/ClangFlags.cmake) set(CRC16_INCLUDE_PATH "3rdparty/libcrc/include") @@ -719,9 +707,6 @@ install(TARGETS tvm_runtime EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_S if(BUILD_FOR_HEXAGON AND DEFINED USE_HEXAGON_GTEST AND EXISTS ${USE_HEXAGON_GTEST}) install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) endif() -if(USE_OPENCL AND DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST}) - install(TARGETS gtest EXPORT 
${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) -endif() if (INSTALL_DEV) install( diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index 119a43218642..40e1b8a1313f 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -20,7 +20,7 @@ ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea ci_cortexm: tlcpack/ci-cortexm:20221013-060115-61c9742ea ci_cpu: tlcpack/ci-cpu:20221013-060115-61c9742ea -ci_gpu: tlcpack/ci-gpu:20221019-060125-0b4836739 +ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea ci_i386: tlcpack/ci-i386:20221013-060115-61c9742ea ci_lint: tlcpack/ci-lint:20221013-060115-61c9742ea diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy b/ci/jenkins/generated/gpu_jenkinsfile.groovy index bebc0c4c22a5..a5609697af46 100644 --- a/ci/jenkins/generated/gpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy @@ -614,6 +614,14 @@ def shard_run_unittest_GPU_1_of_3() { make_standalone_crt(ci_gpu, 'build') make_cpp_tests(ci_gpu, 'build') cpp_unittest(ci_gpu) + sh ( + script: "${docker_run} ${ci_gpu} python3 ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --cmake-target opencl-cpptest --build-dir build", + label: 'Make OpenCL cpp unit tests', + ) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_opencl_cpp_unittest.sh", + label: 'Run OpenCL cpp unit tests', + ) micro_cpp_unittest(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh", diff --git a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 index 2a9e7236d26d..40698131a783 100644 --- a/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/gpu_jenkinsfile.groovy.j2 @@ -63,6 +63,14 @@ make_standalone_crt(ci_gpu, 'build') make_cpp_tests(ci_gpu, 'build') cpp_unittest(ci_gpu) + sh ( + script: "${docker_run} ${ci_gpu} python3 ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod --cmake-target opencl-cpptest --build-dir build", + label: 'Make OpenCL cpp unit tests', + ) + sh ( + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_opencl_cpp_unittest.sh", + label: 'Run OpenCL cpp unit tests', + ) micro_cpp_unittest(ci_gpu) {% else %} {{ m.download_artifacts(tag='gpu') }} diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake index e738df7c564c..1e1041efe386 100644 --- a/cmake/modules/OpenCL.cmake +++ b/cmake/modules/OpenCL.cmake @@ -59,9 +59,19 @@ if(USE_OPENCL) endif() if(DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST}) - file_glob_append(RUNTIME_OPENCL_SRCS + include(FetchContent) + FetchContent_Declare(googletest SOURCE_DIR "${USE_OPENCL_GTEST}") + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) + + message(STATUS "Found OpenCL gtest at ${USE_OPENCL_GTEST}") + + tvm_file_glob(GLOB_RECURSE OPENCL_TEST_SRCS "${CMAKE_SOURCE_DIR}/tests/cpp-runtime/opencl/*.cc" ) + add_executable(opencl-cpptest ${OPENCL_TEST_SRCS}) + target_link_libraries(opencl-cpptest PRIVATE gtest_main tvm_runtime) endif() list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS}) else() diff --git a/tests/cpp-runtime/opencl/run_gtests.cc b/tests/cpp-runtime/opencl/run_gtests.cc deleted file mode 100644 index ffe86a7f52c0..000000000000 --- a/tests/cpp-runtime/opencl/run_gtests.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under 
one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include - -#include -#include - -#include "../src/support/utils.h" - -namespace tvm { -namespace runtime { -namespace cl { - -TVM_REGISTER_GLOBAL("opencl.run_gtests").set_body([](TVMArgs args, TVMRetValue* rv) { - // gtest args are passed into this packed func as a singular string - // split gtest args using delimiter and build argument vector - std::vector parsed_args = tvm::support::Split(args[0], ' '); - std::vector argv; - - // add executable name - argv.push_back(const_cast("opencl_run_gtests")); - - // add parsed arguments - for (size_t i = 0; i < parsed_args.size(); ++i) { - argv.push_back(const_cast(parsed_args[i].data())); - } - - // end of parsed arguments - argv.push_back(nullptr); - - // set argument count - int argc = argv.size() - 1; - - // initialize gtest with arguments and run - ::testing::InitGoogleTest(&argc, argv.data()); - *rv = RUN_ALL_TESTS(); -}); - -} // namespace cl -} // namespace runtime -} // namespace tvm diff --git a/tests/python/contrib/test_opencl/test_run_gtests.py b/tests/python/contrib/test_opencl/test_run_gtests.py deleted file mode 100644 index ee59086b25f1..000000000000 --- a/tests/python/contrib/test_opencl/test_run_gtests.py +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import pytest -import numpy as np - -import tvm -from tvm import rpc - - -# use pytest -sv to observe gtest output -# use --gtest_args to pass arguments to gtest -# for example to run all "foo" tests twice and observe gtest output run -# pytest -sv --gtests_args="--gtest_filter=*foo* --gtest_repeat=2" -@tvm.testing.requires_opencl -@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to nvidia libOpencl in the CI") -def test_run_gtests(gtest_args): - if ( - "TVM_TRACKER_HOST" in os.environ - and "TVM_TRACKER_PORT" in os.environ - and "TVM_TRACKER_KEY" in os.environ - ): - rpc_tracker_host = os.environ["TVM_TRACKER_HOST"] - rpc_tracker_port = os.environ["TVM_TRACKER_PORT"] - rpc_tracker_port = int(rpc_tracker_port) - rpc_key = os.environ["TVM_TRACKER_KEY"] - tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port) - rpc_connection = tracker.request(rpc_key, priority=0, session_timeout=600) - else: - rpc_connection = rpc.LocalSession() - - try: - func = rpc_connection.get_function("opencl.run_gtests") - except: - print( - "This test requires TVM Runtime to be built with a OpenCL gtest version using OpenCL API cmake flag -DUSE_OPENCL_GTEST=/path/to/opencl/googletest/gtest" - ) - raise - - gtest_error_code = func(gtest_args) - np.testing.assert_equal(gtest_error_code, 0) diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 6799f68d43b7..b11ee538dc68 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -593,6 +593,7 @@ def add_subparser( "run unit tests", [ "./tests/scripts/task_java_unittest.sh", + "./tests/scripts/task_opencl_cpp_unittest.sh", "./tests/scripts/task_python_unittest_gpuonly.sh", "./tests/scripts/task_python_integration_gpuonly.sh", ], diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index ca5f3e935c08..90c91fb990be 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -29,6 +29,7 @@ echo set\(USE_CUDA ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_OPENGL ON\) >> config.cmake echo set\(USE_OPENCL ON\) >> config.cmake +echo set\(USE_OPENCL_GTEST \"/googletest\"\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake echo set\(USE_LLVM \"/usr/bin/llvm-config-9 --link-static\"\) >> config.cmake diff --git a/tests/scripts/task_opencl_cpp_unittest.sh b/tests/scripts/task_opencl_cpp_unittest.sh new file mode 100755 index 000000000000..7ea6ea470db7 --- /dev/null +++ b/tests/scripts/task_opencl_cpp_unittest.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
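+
+# Runs the OpenCL C++ unit tests: executes the `opencl-cpptest` gtest binary
+# produced by the GPU build from the build directory.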
+ +set -euxo pipefail + +if [ $# -gt 0 ]; then + BUILD_DIR="$1" +elif [ -n "${TVM_BUILD_PATH:-}" ]; then + # TVM_BUILD_PATH may contain multiple space-separated paths. If + # so, use the first one. + BUILD_DIR=$(IFS=" "; set -- $TVM_BUILD_PATH; echo $1) +else + BUILD_DIR=build +fi + + +# to avoid CI thread throttling. +export TVM_BIND_THREADS=0 +export OMP_NUM_THREADS=1 + +pushd "${BUILD_DIR}" +# run cpp test executable +./opencl-cpptest +popd From 4a5032b9e5c027714f42ef74d3331f63631d0a81 Mon Sep 17 00:00:00 2001 From: padreofthegame <97688606+padreofthegame@users.noreply.github.com> Date: Mon, 12 Dec 2022 23:17:33 +0100 Subject: [PATCH 050/286] [Relay] Bug fix in relay.squeeze function for issue #12400 (#12684) [Relay] Bug fix in relay.squeeze function. Also added functionality for parameter axis of type int --- python/tvm/relay/op/transform.py | 26 +++++++++++++++++++++----- tests/python/relay/test_op_level3.py | 7 ++++++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index e7ae5f7d8315..024da84cbfd8 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -204,23 +204,39 @@ def squeeze(data, axis=None): Parameters ---------- - data : tvm.relay.Expr + data : relay.Expr The input data to the operator. - axis : None or List[int] or Expr + axis : Union[None, int, Tuple[int], List[int]] or Expr The set of axes to remove. - If axis = None, remove all axis of dimensions 1. + If axis = None, remove all axes of dimension 1. If any specified axis has dimension that does not equal 1, it is an error. Returns ------- - result : tvm.relay.Expr + result : relay.Expr The squeezed result. """ if isinstance(axis, Constant): - axis = list(axis.data.numpy()) + if axis.data.shape: + axis = list(axis.data.numpy()) + else: + axis = [axis.data.numpy().item()] if isinstance(axis, Expr): return _dyn_make.squeeze(data, axis) + if isinstance(axis, int): + axis = [axis] + if isinstance(axis, (tuple, list)): + tempaxis = [] + for tmpax in axis: + if isinstance(tmpax, _expr.IntImm): + tempaxis.append(tmpax.value) + else: + try: + tempaxis.append(int(tmpax)) + except ValueError as err: + raise RuntimeError("Unrecognized axis type: %s" % err) + axis = tempaxis return _make.squeeze(data, axis) diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index c3b3215e84e4..c96bc940f920 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -210,13 +210,18 @@ class TestSqueeze: ((1, 3, 2, 5), "float32", None), ((1, 3, 1), "float32", [0]), ((1, 2, 1, 2, 1), "float32", [0, 2]), + ((1, 3, 1), "float32", 2), + ((1, 3, 1), "float32", []), ) def test_squeeze(self, shape, dtype, axis): x = relay.var("x", relay.TensorType(shape, dtype)) squeeze = relay.squeeze(x, axis=axis) - np_axis = tuple(axis) if axis is not None else None + if isinstance(axis, int): + np_axis = (axis,) + else: + np_axis = tuple(axis) if axis is not None else None data = np.random.random_sample(shape).astype(dtype) op_res = create_executor().evaluate(squeeze, {x: relay.const(data)}) From e3968c18e4b7ec2b91addc9d38a3653cb5d77344 Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 13 Dec 2022 14:31:36 +0900 Subject: [PATCH 051/286] [Relay] Fix `CombineParallelDense` slicing axis (#13597) The current implementation of `CombineParallelDense` is hardcoded to slice along the last axis after the combined dense. 
I hit an error using this pass on the stable diffusion UNet, since it has a combined group where the dense is followed by `expand_dims` which changes the slicing axis (see https://github.com/masahi/torchscript-to-tvm/blob/master/stable-diffusion/compile.py for repro) ``` %76 = concatenate(%74) /* ty=Tensor[(20160, 1280), float32] */; %79 = concatenate(%77) /* ty=Tensor[(20160), float32] */; %78 = nn.dense(%75, %76, units=20160) /* ty=Tensor[(2, 20160), float32] */; %80 = nn.bias_add(%78, %79, axis=-1) /* ty=Tensor[(2, 20160), float32] */; %81 = expand_dims(%80, axis=2) /* ty=Tensor[(2, 20160, 1), float32] */; %82 = expand_dims(%81, axis=3) /* ty=Tensor[(2, 20160, 1, 1), float32] */; ``` The correct way to generate `strided_slice`: ``` %84 = strided_slice(%82, begin=[0, 0, 0, 0], end=[-1, 320, -1, -1], strides=[1, 1, 1, 1], slice_mode="size", axes=None) /* ty=Tensor[(2, 320, 1, 1), float32] */; ``` As I documented in the code, this fix is probably not 100% fail-proof. I think this is a difficult problem, since it requires tracking how the original output-channel axis of the combined dense moves across shape-changing operations like `reshape /transpose / split`. But this is at least "more correct" than the current implementation, so I'm submitting this fix as is for now. With this fix, `CombineParallelDense` works successfully on the stable diffusion UNet, and it reduces the number of `nn.dense` from 184 to 100. --- .../transforms/combine_parallel_dense.cc | 43 +++++++++++----- .../relay/test_pass_combine_parallel_dense.py | 51 ++++++++++++++++--- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index 7cf102b5bcab..e5f7e0b975f4 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -195,23 +195,40 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { void UpdateGroupOutput(const Expr& data, const Group& branches, size_t depth, ExprSubstMap* subst_map) { int index = 0; + const auto dense_op = Op::Get("nn.dense"); for (const auto& branch : branches) { const CallNode* call = branch[depth]; auto& out_shape = call->type_as()->shape; - auto out_dims = tir::as_const_int(out_shape[out_shape.size() - 1]); - ICHECK(out_dims != nullptr); - Array begin; - Array end; - Array strides; - for (size_t k = 0; k < out_shape.size() - 1; ++k) { - begin.push_back(0); - end.push_back(-1); - strides.push_back(1); + + const CallNode* dense = branch[0]; + ICHECK(dense->op.same_as(dense_op)); + auto& dense_shape = dense->type_as()->shape; + auto dense_out_dims = tir::as_const_int(dense_shape[1]); + ICHECK(dense_out_dims != nullptr); + + // dense can be followed by shape-changing operations, so the slicing axis is + // not necessarily the last one. + // TODO(masahi): The following logic is incorrect if (1) there is no axis in + // out_shape[i] that directly corresponds to the output channel of dense or (2) there + // is another axis that happens to have the same size as the output channel of dense. + // Such cases might arise due to reshape / transpose / split etc. Revisit this logic + // when we encounter them in practice. 
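+      // Example: a dense output of shape (2, 20160) followed by two expand_dims
+      // yields (2, 20160, 1, 1); the axis to slice is then 1, not the last axis.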
+      auto slice_axis = -1;
+      for (int i = static_cast<int>(out_shape.size()) - 1; i >= 0; --i) {
+        ICHECK(tir::as_const_int(out_shape[i]));
+        if (*tir::as_const_int(out_shape[i]) == *dense_out_dims) {
+          slice_axis = i;
+          break;
+        }
       }
-      begin.push_back(index);
-      end.push_back(*out_dims);
-      strides.push_back(1);
-      index += *out_dims;
+      ICHECK(slice_axis != -1);
+
+      Array<Integer> begin(out_shape.size(), 0);
+      Array<Integer> end(out_shape.size(), -1);
+      Array<Integer> strides(out_shape.size(), 1);
+      begin.Set(slice_axis, index);
+      end.Set(slice_axis, *dense_out_dims);
+      index += *dense_out_dims;
       auto slice = MakeStridedSlice(data, begin, end, strides, "size");
       subst_map->insert({GetRef<Expr>(branch[depth]), slice});
     }
diff --git a/tests/python/relay/test_pass_combine_parallel_dense.py b/tests/python/relay/test_pass_combine_parallel_dense.py
index cd946ab593bf..2494c1a550cd 100644
--- a/tests/python/relay/test_pass_combine_parallel_dense.py
+++ b/tests/python/relay/test_pass_combine_parallel_dense.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
+import tvm.testing
 from tvm import relay
 from tvm.relay import transform

@@ -359,10 +359,47 @@ def check(i, j, k, scale1, scale2, newshape1, newshape2):
     check(100, 200, 300, 0.5, 0.25, (1, 1, 20000), (1, 1, 40000))

+def test_combine_parallel_dense_expand_dims():
+    """Verify that the correct slice axis is selected after the combined dense."""
+
+    def before(x, w1, w2):
+        args = [x, w1, w2]
+        y1 = relay.nn.dense(x, w1)
+        y1 = relay.expand_dims(y1, axis=2)
+
+        y2 = relay.nn.dense(x, w2)
+        y2 = relay.expand_dims(y2, axis=2)
+
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2):
+        args = [x, w1, w2]
+        w_stacked = relay.concatenate((w1, w2), axis=0)
+        y = relay.nn.dense(x, w_stacked, units=24)
+        y = relay.expand_dims(y, axis=2)
+
+        strides = [1, 1, 1]
+        y1 = relay.strided_slice(
+            y, begin=[0, 0, 0], end=[-1, 16, -1], strides=strides, slice_mode="size"
+        )
+        y2 = relay.strided_slice(
+            y, begin=[0, 16, 0], end=[-1, 8, -1], strides=strides, slice_mode="size"
+        )
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    x = relay.var("x", shape=(2, 32))
+    w1 = relay.var("w1", shape=(16, 32))
+    w2 = relay.var("w2", shape=(8, 32))
+
+    y_before = before(x, w1, w2)
+    combine_pass = transform.CombineParallelDense(min_num_branches=2, to_batch=False)
+    y = run_opt_pass(y_before, combine_pass)
+    y_expected = expected(x, w1, w2)
+    y_expected = run_opt_pass(y_expected, transform.InferType())
+    tvm.ir.assert_structural_equal(y, y_expected, map_free_vars=True)
+
+
 if __name__ == "__main__":
-    test_combine_parallel_dense()
-    test_combine_parallel_dense_biasadd()
-    test_combine_parallel_dense_biasadd_scale_reshape()
-    test_combine_parallel_dense_flat()
-    test_combine_parallel_dense_flat_biasadd()
-    test_combine_parallel_dense_flat_biasadd_scale_reshape()
+    tvm.testing.main()

From 4aa8c55d03a958b882c0be1f10987b393ab58188 Mon Sep 17 00:00:00 2001
From: Ruihang Lai
Date: Tue, 13 Dec 2022 02:05:01 -0500
Subject: [PATCH 052/286] [Fix] Task scheduler error prompt upon build/run
 failure (#13601)

---
 src/meta_schedule/task_scheduler/task_scheduler.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc
index 69a70f63c5c0..9d859947e4fe 100644
--- a/src/meta_schedule/task_scheduler/task_scheduler.cc
+++ b/src/meta_schedule/task_scheduler/task_scheduler.cc
@@ -120,7 +120,9 @@ void
TaskCleanUp(TaskRecordNode* self, int task_id, const Array& r std::string err = error_msg.value(); TVM_PY_LOG(INFO, logger) << std::fixed << std::setprecision(4) // << "[Task #" << task_id << ": " << name << "] Trial #" << trials - << ": Error in building:\n" + << ": Error in " + << (builder_result->error_msg.defined() ? "building" : "running") + << ":\n" << err << "\n" << tir::AsTVMScript(sch->mod()) << "\n" << Concat(sch->trace().value()->AsPython(false), "\n"); From c8ffabce5958e4f10f249d787af2431719a9373d Mon Sep 17 00:00:00 2001 From: masahi Date: Tue, 13 Dec 2022 20:10:13 +0900 Subject: [PATCH 053/286] [TIR] Fix PlanAndUpdateBufferAllocationLocation not visiting constant buffer (#13605) * Fix PlanAndUpdateBufferAllocationLocation not visiting constant buffer * add comment --- .../plan_update_buffer_allocation_location.cc | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/tir/transforms/plan_update_buffer_allocation_location.cc b/src/tir/transforms/plan_update_buffer_allocation_location.cc index 4c63d3393fd8..11d8330ec8fe 100644 --- a/src/tir/transforms/plan_update_buffer_allocation_location.cc +++ b/src/tir/transforms/plan_update_buffer_allocation_location.cc @@ -61,24 +61,35 @@ class BufferAllocateOrderCollector : public StmtExprVisitor { } private: + bool find(const Buffer& buf) { + return std::find(buffer_alloc_recorder_.begin(), buffer_alloc_recorder_.end(), buf) != + buffer_alloc_recorder_.end(); + } + void VisitStmt_(const BlockNode* op) final { for (const Buffer& buffer : op->alloc_buffers) { buffer_alloc_recorder_.push_back(buffer); } + // Also visit match_buffers to collect constant buffers associated with AllocateConst nodes. + // These buffers only appear in read and match_buffer regions. + for (const auto& region : op->match_buffers) { + if (!find(region->source->buffer)) { + buffer_alloc_recorder_.push_back(region->source->buffer); + } + } + StmtExprVisitor::VisitStmt_(op); } void VisitExpr_(const BufferLoadNode* op) final { - if (std::find(buffer_alloc_recorder_.begin(), buffer_alloc_recorder_.end(), op->buffer) == - buffer_alloc_recorder_.end()) { + if (!find(op->buffer)) { buffer_alloc_recorder_.push_back(op->buffer); } StmtExprVisitor::VisitExpr_(op); } void VisitStmt_(const BufferStoreNode* op) final { - if (std::find(buffer_alloc_recorder_.begin(), buffer_alloc_recorder_.end(), op->buffer) == - buffer_alloc_recorder_.end()) { + if (!find(op->buffer)) { buffer_alloc_recorder_.push_back(op->buffer); } StmtExprVisitor::VisitStmt_(op); From 0d4a2cde116d4b770989644630e7f55dc9b44272 Mon Sep 17 00:00:00 2001 From: Farshid Salemi Parizi Date: Tue, 13 Dec 2022 07:44:08 -0800 Subject: [PATCH 054/286] [Hexagon] Enable depthwise conv2d NHWC with an HWIO kernel layout (#13414) Enable depthwise conv2d NHWC with HWIO kernel layout. The default kernel layout is HWOI, matched to previous behavior. 
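A minimal TE-level sketch of the new option (shapes are hypothetical; in HWIO the channel multiplier occupies the third dimension):

```python
from tvm import te, topi

data = te.placeholder((1, 56, 56, 32), name="data")    # NHWC
kernel = te.placeholder((3, 3, 1, 32), name="kernel")  # HWIO: (H, W, multiplier, in_channel)

# kernel_layout defaults to "HWOI", so existing callers are unaffected.
out = topi.nn.depthwise_conv2d_nhwc(
    data, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1), kernel_layout="HWIO"
)
```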
--- python/tvm/relay/op/strategy/arm_cpu.py | 2 +- python/tvm/relay/op/strategy/hexagon.py | 3 +-- python/tvm/relay/op/strategy/x86.py | 3 +-- python/tvm/topi/nn/depthwise_conv2d.py | 19 ++++++++++++++++--- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 261b979dedaf..c8d51bc23c82 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -318,7 +318,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): else: logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") strategy.add_implementation( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True), wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic", ) diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py index c1d64f2fe143..f42503a1477c 100644 --- a/python/tvm/relay/op/strategy/hexagon.py +++ b/python/tvm/relay/op/strategy/hexagon.py @@ -86,9 +86,8 @@ def conv2d_strategy_hexagon(attrs, inputs, out_type, target): name="depthwise_conv2d_nchw.hexagon", ) elif layout == "NHWC": - assert kernel_layout == "HWOI" strategy.add_implementation( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True), wrap_topi_schedule(topi.hexagon.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.hexagon", ) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 3e59209f5822..7ff4dbc0ad1b 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -228,13 +228,12 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert _OIHWio_matcher.match(kernel_layout) # check if kernel is OIHWio return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": - assert kernel_layout == "HWOI" if (not need_auto_scheduler_layout) and (not need_meta_schedule_layout): logger.warning( "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm." ) strategy.add_implementation( - wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic", ) diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 48ffb8c6d9ff..7c446a23a813 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm +import numpy as np from tvm import te from .dilate import dilate @@ -211,7 +212,9 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No return Output -def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None): +def depthwise_conv2d_nhwc( + Input, Filter, stride, padding, dilation, kernel_layout="HWOI", out_dtype=None +): """Depthwise convolution nhwc forward operator. 
Parameters @@ -252,8 +255,14 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No dilation_h, dilation_w = dilation batch, in_height, in_width, in_channel = Input.shape + # shape of dilated kernel - filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape + if kernel_layout == "HWIO": + filter_height, filter_width, channel_multiplier, filter_channel = Filter.shape + kernel_permutation = [0, 1, 3, 2] + else: + filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape + kernel_permutation = [0, 1, 2, 3] dilated_kernel_h = (filter_height - 1) * dilation_h + 1 dilated_kernel_w = (filter_width - 1) * dilation_w + 1 @@ -285,7 +294,11 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No idxdiv(c, channel_multiplier), ].astype(out_dtype) * Filter[ - di, dj, idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier) + tuple( + np.array( + [di, dj, idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier)] + )[kernel_permutation] + ) ].astype(out_dtype) ), axis=[di, dj], From c48c063683166cfb599caba7264e0a37b9331a81 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 13 Dec 2022 09:37:03 -0800 Subject: [PATCH 055/286] [Relay][Frontend][Onnx] SequenceAt and SplitToSequence Operators (#13602) * Add support for SequenceAt and SplitToSequence to onnx importer * Formatting * Change keepdims comparison * Only unify non-tuples in If --- python/tvm/relay/frontend/onnx.py | 79 ++++++++++++++++++++++ python/tvm/relay/op/_transform.py | 2 - tests/python/frontend/onnx/test_forward.py | 31 +++++---- 3 files changed, 98 insertions(+), 14 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 62f0f4b2dd25..3470099100d4 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -4008,6 +4008,23 @@ def _impl_v1(cls, inputs, attr, params): for var in else_free_vars: graph_scope._nodes.update({var.name_hint: var}) + # Sometimes pytorch to onnx will insert silly if statements that produce dynamic ranks. + # Often these dont contribute anything. If we see a dynamic rank output, try to unify + # them so we can continue without breaking. + if not isinstance(then_expr, _expr.Tuple) and not isinstance(else_expr, _expr.Tuple): + then_shape = infer_shape(then_expr) + else_shape = infer_shape(else_expr) + if len(then_shape) != len(else_shape): + warning_msg = ( + "If statement produced outputs with different rank. " + "Attempting to unify ranks but this may produce incorrect results." + ) + warnings.warn(warning_msg) + if len(then_shape) < len(else_shape): + then_expr = _op.broadcast_to_like(then_expr, else_expr) + else: + else_expr = _op.broadcast_to_like(else_expr, then_expr) + # Now we can construct the relay if statement and return. ret = _expr.If(cond, then_expr, else_expr) if len(then_branch.output) > 1: @@ -5565,6 +5582,66 @@ def _impl_v11(cls, inputs, attr, params): return _op.concatenate(inputs[0], axis=axis) +class SplitToSequence(OnnxOpConverter): + """Operator converter for split to sequence op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + axis = attr.get("axis", 0) + keepdims = attr.get("keepdims", 1) + + input_tensor = inputs[0] + input_shape = infer_shape(input_tensor) + split = inputs[1] + + # If split is not provided, we split all values along axis. + if split is None: + output = _op.split(input_tensor, input_shape[axis], axis=axis) + # If keepdims is 0, then we need to squeeze off the axis. 
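+            # (e.g. torch.unbind is exported as SplitToSequence with
+            # keepdims=0 and relies on this squeeze.)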
+ if not keepdims: + output = [_op.squeeze(tensor_slice, axis=[axis]) for tensor_slice in output] + return _expr.Tuple(list(output)) + + # Otherwise, split based on provided split value. + else: + # For now we only support constant valued split. + assert isinstance( + split, _expr.Constant + ), "Only constant split supported for SplitToSequence" + split = split.data.numpy() + if len(split.shape) == 1 and split.shape[0] > 1: + # If split is a 1D tensor, it must be converted to indices for relay compatibility. + split = np.cumsum(split) + # Remove final invalid index. + split = split[:-1] + else: + # Otherwise get split as an integer. + split = int(split) + + output = _op.split(input_tensor, split, axis=axis) + + # If keepdims is set to 0 remove split axis. Note that this is + # an inconsistency with the onnx spec but is needed for pytorch compatibility. + if not keepdims: + output = [_op.squeeze(tensor_slice, axis=[axis]) for tensor_slice in output] + return _expr.Tuple(list(output)) + + +class SequenceAt(OnnxOpConverter): + """Operator converter for sequence at op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + input_sequence = inputs[0] + position = inputs[1] + assert isinstance( + position, _expr.Constant + ), "Only constant position supported for SequenceAt" + # Convert position to integer. + position = int(position.data.numpy()) + return input_sequence[position] + + # compatible operators that do NOT require any conversion. _identity_list = [] @@ -5793,6 +5870,8 @@ def _get_convert_map(opset): "SequenceConstruct": SequenceConstruct.get_converter(opset), "SequenceInsert": SequenceInsert.get_converter(opset), "ConcatFromSequence": ConcatFromSequence.get_converter(opset), + "SplitToSequence": SplitToSequence.get_converter(opset), + "SequenceAt": SequenceAt.get_converter(opset), } diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 5b7e342c4b4e..d4e4a527835a 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -432,8 +432,6 @@ def _concatenate_shape_func(inputs, axis): for i in const_range(ndim): if i != axis: out[i] = inputs[0][i] - for j in const_range(1, len(inputs)): - assert out[i] == inputs[j][i], "Dims mismatch in the inputs of concatenate." else: out[i] = int64(0) for j in const_range(len(inputs)): diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 211d7f798aba..dcd4f2defbe8 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -7043,7 +7043,7 @@ def verify_linear_regressor(a_shape, c_shape, i_shape, targets=1, batch=1): def test_sequence(target, dev): """test_sequence""" - def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=None, new_axis=None): + def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis=None): tensor_shape = list(tensor_shape) tensor_values = [] for i in range(num_tensors): @@ -7062,20 +7062,30 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=None, new_ax outputs=["sequence"], ) - insert_inputs = ["sequence", input_tensor_names[0]] - position_node = None - if position is not None: - insert_inputs.append("position") - position_node = make_constant_node("position", TensorProto.INT32, (), [position]) + position_node = make_constant_node("position", TensorProto.INT32, (), [position]) # Test sequence insertion. 
insert_node = helper.make_node( - "SequenceInsert", inputs=insert_inputs, outputs=["inserted_sequence"] + "SequenceInsert", + inputs=["sequence", input_tensor_names[0], "position"], + outputs=["inserted_sequence"], ) # Test sequence concatenation. concat_node = helper.make_node( - "ConcatFromSequence", inputs=["inserted_sequence"], outputs=["output"], axis=axis + "ConcatFromSequence", + inputs=["inserted_sequence"], + outputs=["concat_sequence"], + axis=axis, + ) + + # Test splitting a tensor into a sequence. + split_node = helper.make_node( + "SplitToSequence", inputs=["concat_sequence"], outputs=["split_sequence"], axis=axis + ) + + at_node = helper.make_node( + "SequenceAt", inputs=["split_sequence", "position"], outputs=["output"] ) if new_axis is not None: @@ -7097,10 +7107,7 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=None, new_ax output_shape[axis] = (num_tensors + 1) * output_shape[axis] graph_outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape)] - graph_nodes = [] - if position_node is not None: - graph_nodes.append(position_node) - graph_nodes += [construct_node, insert_node, concat_node] + graph_nodes = [position_node, construct_node, insert_node, concat_node, split_node, at_node] graph = helper.make_graph( graph_nodes, From e181045fd35cd58f73dfb56395f21ea9eacc5751 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 14 Dec 2022 16:48:05 +0900 Subject: [PATCH 056/286] [Relay][TIR] Add utility to lower Relay func to TIR prim func (#13606) * introduce LowerToPrimFunc to lower Relay func to TIR prim func * add doc * expose to python * adding test * another minor doc update * Verify that the input is a primitive function --- python/tvm/relay/backend/te_compiler.py | 23 ++++++++ src/relay/backend/task_extraction.cc | 6 +- src/relay/backend/te_compiler_cache.cc | 27 +++++++++ src/relay/backend/te_compiler_cache.h | 20 +++++-- ..._plan_update_buffer_allocation_location.py | 56 +++++++++++++++++++ 5 files changed, 124 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/backend/te_compiler.py b/python/tvm/relay/backend/te_compiler.py index 5594e36cb855..814e79329019 100644 --- a/python/tvm/relay/backend/te_compiler.py +++ b/python/tvm/relay/backend/te_compiler.py @@ -412,3 +412,26 @@ def get(): The TE Compiler. """ return _backend._TECompilerGlobal() + + +def lower_to_primfunc(relay_func, target): + """Lower Relay Function to TIR PrimFunc. + + Parameters + ---------- + relay_func: relay.Function + The source primitive function, created by FuseOps. + + target : Target + The compilation target. + + Returns + ------- + prim_func : tir.PrimFunc + The created prim func. + """ + f = tvm._ffi.get_global_func("relay.backend.LowerToPrimFunc") + assert f is not None, "relay.backend.LowerToPrimFunc does not exist. 
" + + with target: + return f(relay_func, target) diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc index e7e677938e1a..fc45311e085d 100644 --- a/src/relay/backend/task_extraction.cc +++ b/src/relay/backend/task_extraction.cc @@ -59,7 +59,6 @@ Array ExtractTask(IRModule mod, Target target, using meta_schedule::ExtractedTask; using meta_schedule::ModuleEqual; using meta_schedule::ModuleHash; - backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter(); backend::BindParamsInModule(mod, params); // is_vm=true for backward compatibility Array pass_seqs = relay::backend::GetPassPrefix(/*is_homogenous=*/true, /*is_vm=*/true); @@ -84,10 +83,9 @@ Array ExtractTask(IRModule mod, Target target, if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) { return; } - auto [inputs_outputs, constants, fused_name] = - tec::LowerTECompute(relay_func, target, constant_name_supply, /*return_inputs=*/true); - if (Optional f = tir_converter(inputs_outputs, constants)) { + auto [f, fused_name] = tec::LowerToPrimFunc(relay_func, target, constant_name_supply); + if (f) { IRModule tir_mod = PrimFuncToIRModule(f.value()); lower_results.push_back(std::make_tuple(fused_name, relay_func, tir_mod)); } diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 511f0a901d11..d71cbcfc667d 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -1088,6 +1088,33 @@ std::tuple, Array, std::string> LowerTECompu return std::make_tuple(tensor_outs, constants, lower_te_compute.candidate_name_); } +std::pair, std::string> LowerToPrimFunc(const Function& relay_func, + Target target, + NameSupply constant_name_supply) { + ICHECK(relay_func->HasNonzeroAttr(attr::kPrimitive)) + << "The input must be a Relay primitive function."; + + auto [inputs_outputs, constants, fused_name] = + tec::LowerTECompute(relay_func, target, constant_name_supply, /*return_inputs=*/true); + auto tir_converter = backend::GetTIRConverter(); + return std::make_pair(tir_converter(inputs_outputs, constants), fused_name); +} + +tir::PrimFunc LowerToPrimFunc(const Function& relay_func, Target target) { + auto [f_opt, _] = LowerToPrimFunc(relay_func, target, NameSupply("")); + (void)_; // to suppress -Werror=unused-variable warning + if (f_opt) { + return f_opt.value(); + } + LOG(FATAL) << "Failed to convert the Relay function: " << AsText(relay_func, false); + return PrimFunc(); +} + +TVM_REGISTER_GLOBAL("relay.backend.LowerToPrimFunc") + .set_body_typed([](Function relay_func, Target target) { + return LowerToPrimFunc(relay_func, target); + }); + TVM_REGISTER_GLOBAL("relay.backend.LowerToTE").set_body_typed([](Function prim_func) { auto tgt = tvm::Target("ext_dev"); LowerToTECompute lower_te_compute(tgt, NameSupply("")); diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h index fcbf10477fdf..76939a923cdf 100644 --- a/src/relay/backend/te_compiler_cache.h +++ b/src/relay/backend/te_compiler_cache.h @@ -212,10 +212,10 @@ class CCacheValue : public ObjectRef { Array GetShape(const Array& shape); /*! - * \brief Lowers Relay primitive Function to TE Compute + * \brief Lower Relay primitive Function to TE Compute * \param source_func The primitive function to be lowered. - * \param target The target we want to create schedule for. - * \param constant_name_supply A name supplier for constants. + * \param target The compilation target. 
+ * \param constant_name_supply A name supplier for constants * across different invocations of this function. * \param return_inputs If true, prepend input tensors to the output array of tensors. * \return Tuple of the lowered TE compute, constant raw data, and fused function name. @@ -224,10 +224,22 @@ std::tuple, Array, std::string> LowerTECompu const Function& source_func, Target target, NameSupply constant_name_supply, bool return_inputs = true); +/*! + * \brief Lower Relay Function to TIR PrimFunc, by composing LowerTECompute and CreatePrimFunc. + * \param relay_func The primitive function to be lowered. + * \param target The compilation target. + * \param constant_name_supply A name supplier for constants + * across different invocations of this function. + * \return A pair of the created prim func and the name of the fused function. + */ +std::pair, std::string> LowerToPrimFunc(const Function& relay_func, + Target target, + NameSupply constant_name_supply); + /*! * \brief Create schedule for target. * \param source_func The primitive function to be lowered. - * \param target The target we want to create schedule for. + * \param target The compilation target. * \param global_var_supply A name supplier for global variables. * \param constant_name_supply A name supplier for constants. * \return Pair of schedule and cache. diff --git a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py index 92e3cbd66e2f..0a8a0dd59fbf 100644 --- a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py +++ b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py @@ -14,10 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import numpy as np + import tvm import tvm.testing from tvm import te from tvm.script import tir as T +from tvm import relay, tir +from tvm.relay.backend.te_compiler import lower_to_primfunc +from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN def _check(original, transformed): @@ -360,5 +365,56 @@ def after(A: T.Buffer[(4, 16), "int32"], C: T.Buffer[(4, 8), "int32"]): _check(before, after) +def test_allocate_const_after_tensorize(): + i_size, o_size, h_size, w_size = 64, 64, 56, 56 + k_height_size = k_width_size = 3 + w_shape = (o_size, i_size, k_height_size, k_width_size) + + data = relay.var("data", shape=(1, i_size, h_size, w_size), dtype="uint8") + weight = relay.var("weight", shape=w_shape, dtype="uint8") + conv2d = relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=(k_height_size, k_width_size), + channels=o_size, + padding=(0, 0), + strides=(1, 1), + out_dtype="int32", + ) + mod = tvm.IRModule.from_expr(conv2d) + + executor = relay.backend.Executor("graph", {"link-params": True}) + mod = mod.with_attr("executor", executor) + + weight_np = np.random.uniform(1, 10, size=w_shape).astype("uint8") + + target = tvm.target.Target("hexagon") + + with tvm.transform.PassContext(opt_level=3): + opt_mod, _ = relay.optimize(mod, params={"weight": weight_np}, target=target) + + conv2d_func = opt_mod["main"].body.args[0].op + prim_func = lower_to_primfunc(conv2d_func, target) + + sch = tir.Schedule(prim_func) + block = sch.get_block("conv2d_NCHWc_int8") + loops = sch.get_loops(block) + + sch.reorder(loops[8], loops[4], loops[-1]) + sch.decompose_reduction(block, loops[1]) + sch.tensorize(loops[4], VRMPY_u8u8i32_INTRIN) + + seq = tvm.transform.Sequential( + [ + tvm.tir.transform.LowerInitBlock(), + tvm.tir.transform.PlanAndUpdateBufferAllocationLocation(), + ] + ) + + # The following error is emitted if AllocateConst nodes are not correctly handled: + # Check failed: (buffer_data_to_buffer_.count(source_var)) is false: + _ = seq(sch.mod) + + if __name__ == "__main__": tvm.testing.main() From 1a6798febfd1339739825f8a10149e90a752768d Mon Sep 17 00:00:00 2001 From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com> Date: Wed, 14 Dec 2022 13:15:47 +0400 Subject: [PATCH 057/286] [microNPU] Disable copying weights to SRAM for FullyConnected ops in CopyConstants scheduler (#13588) In Ethos-U, CopyConstants scheduler currently copies weights for all operators. But in Vela, there are a number of scenarios where the weights are not buffered in SRAM, and FullyConnected case is one of them. --- .../backend/contrib/ethosu/tir/scheduler.py | 10 +++++++++- .../python/contrib/test_ethosu/test_scheduler.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py index bcabe2b7c2fa..cee8f563ff7a 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py @@ -132,6 +132,12 @@ def copy_constants(): def _planner(cached_func, const_dict, sch): planned = set() # type: ignore + def _is_matmul(tensor): + if tensor.name not in ["ethosu_conv2d"]: + return False + a, b = tensor.op.input_tensors[0:2] + return a.shape[1:3] == [1, 1] and b.shape[1:3] == [1, 1] + def _visit(tensor, reader, lut): if tensor not in planned: planned.add(tensor) @@ -140,7 +146,9 @@ def _visit(tensor, reader, lut): # ambiguity when encountering a scalar. 
is_same = [var.same_as(tensor) for var in cached_func.inputs] index = is_same.index(True) - if index in const_dict: + # Along with constants, also skip for FullyConnected to correspond + # with Vela behavior + if index in const_dict and not _is_matmul(reader): sch.cache_read(tensor, "global", [reader]) elif isinstance(tensor.op, tvm.te.ComputeOp): diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index fd1e1afa60d9..695aed0d1919 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -217,5 +217,21 @@ def test_schedule_diamond_graph(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) +def test_copy_constants_fully_connected_weights(): + """Check that MatMul-like conv2d ops do not copy weights to SRAM.""" + ifm = relay.var("IFM", shape=(1, 1, 1, 32), dtype="int8") + conv = make_ethosu_conv2d(ifm, 32, 8, (1, 1), (0, 0), (1, 1), (1, 1)) + func = relay.Function(relay.analysis.free_vars(conv), conv) + func = run_opt_pass(func, relay.transform.InferType()) + + func, const_dict = extract_constants(func) + cached_func = lower_to_te(func) + + sch = te.create_schedule([cached_func.outputs[0].op]) + planner = copy_constants() + planner(cached_func, const_dict, sch) + assert True not in [".global" in s.op.name for s in sch.stages] + + if __name__ == "__main__": pytest.main([__file__]) From 2024e6359c40f38ebc42140025244c3ec54084b5 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 14 Dec 2022 09:25:53 -0800 Subject: [PATCH 058/286] [microTVM][Zephyr] Fix TVMC test on hardware (#13598) * fixed test * fix flag for arduino --- tests/micro/common/test_tvmc.py | 46 ++++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/micro/common/test_tvmc.py b/tests/micro/common/test_tvmc.py index b2321f7d86a3..3aa7fec2f299 100644 --- a/tests/micro/common/test_tvmc.py +++ b/tests/micro/common/test_tvmc.py @@ -44,6 +44,26 @@ def _run_tvmc(cmd_args: list, *args, **kwargs): return subprocess.check_call(cmd_args_list, *args, **kwargs) +def create_project_command(project_path: str, mlf_path: str, platform: str, board: str) -> list: + """Returns create project command with tvmc micro.""" + cmd = [ + "micro", + "create-project", + project_path, + mlf_path, + platform, + "--project-option", + "project_type=host_driven", + f"board={board}", + ] + + if platform == "zephyr": + # TODO: 4096 is driven by experiment on nucleo_l4r5zi. We should cleanup this after we have + # better memory management. 
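+        # Guarded by the platform check above: per the commit message, the
+        # option is only applicable to Zephyr projects, not Arduino ones.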
+ cmd.append("config_main_stack_size=4096") + return cmd + + @tvm.testing.requires_micro def test_tvmc_exist(platform, board): cmd_result = _run_tvmc(["micro", "-h"]) @@ -93,18 +113,7 @@ def test_tvmc_model_build_only(platform, board, output_dir): ) assert cmd_result == 0, "tvmc failed in step: compile" - create_project_cmd = [ - "micro", - "create-project", - project_dir, - tar_path, - platform, - "--project-option", - "project_type=host_driven", - f"board={board}", - ] - - cmd_result = _run_tvmc(create_project_cmd) + cmd_result = _run_tvmc(create_project_command(project_dir, tar_path, platform, board)) assert cmd_result == 0, "tvmc micro failed in step: create-project" build_cmd = ["micro", "build", project_dir, platform] @@ -157,18 +166,7 @@ def test_tvmc_model_run(platform, board, output_dir): ) assert cmd_result == 0, "tvmc failed in step: compile" - create_project_cmd = [ - "micro", - "create-project", - project_dir, - tar_path, - platform, - "--project-option", - "project_type=host_driven", - f"board={board}", - ] - - cmd_result = _run_tvmc(create_project_cmd) + cmd_result = _run_tvmc(create_project_command(project_dir, tar_path, platform, board)) assert cmd_result == 0, "tvmc micro failed in step: create-project" build_cmd = ["micro", "build", project_dir, platform] From f566e61815aebdc22dea63e537481edeb2d18dc3 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 14 Dec 2022 19:49:52 -0600 Subject: [PATCH 059/286] [LLVM] Use std::nullopt instead of llvm::None (#13617) Pass `std::nullopt` to initialization of `PassBuilder` for `PGOOptions`. LLVM is moving away from its own `Optional` type to `std::optional`. --- src/target/llvm/codegen_llvm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 7aae17788800..526bcf0fb26e 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -362,7 +362,7 @@ void CodeGenLLVM::Optimize() { llvm::PipelineTuningOptions pto = llvm::PipelineTuningOptions(); llvm::PassInstrumentationCallbacks pic; - llvm::PassBuilder builder(tm, pto, llvm::None, &pic); + llvm::PassBuilder builder(tm, pto, std::nullopt, &pic); llvm::LoopAnalysisManager lam; llvm::FunctionAnalysisManager fam; From 69e35095b0bef4f86f616020b57d64f6bed98704 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 14 Dec 2022 19:50:23 -0600 Subject: [PATCH 060/286] [Hexagon] Switch from default_rng to random in Hexagon tests (#13616) default_rng was introduced in numpy 1.19, which is not present even in Ubuntu 20.04 (it comes with 1.17.4). --- tests/python/contrib/test_hexagon/test_parallel_hvx.py | 6 ++---- .../contrib/test_hexagon/test_parallel_hvx_load_vtcm.py | 5 ++--- tests/python/contrib/test_hexagon/test_parallel_scalar.py | 8 ++++---- tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py | 4 +--- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py index 15273afdd41e..13ad36278bc6 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py @@ -19,7 +19,6 @@ Test parallelizing HVX workloads and compare them to single thread examples. 
""" import numpy as np -from numpy.random import default_rng import tvm from tvm.script import tir as T @@ -148,9 +147,8 @@ def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68")) module = hexagon_session.load_module(func_tir) - rng = default_rng() - a = rng.integers(0, 16, a_shape, dtype=a_dtype) - b = rng.integers(0, 16, b_shape, dtype=b_dtype) + a = np.random.randint(0, 16, a_shape, dtype=a_dtype) + b = np.random.randint(0, 16, b_shape, dtype=b_dtype) c = np.zeros(c_shape, dtype=c_dtype) a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index 6cca44388d09..ee7f789ed103 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -19,7 +19,6 @@ import numpy as np import tvm -from numpy.random import default_rng from tvm.script import tir as T from .infrastructure import get_hexagon_target @@ -395,11 +394,11 @@ class TestMatMulVec: @tvm.testing.fixture def input_a(self, operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + return np.random.randint(0, 16, (operations, 128), dtype="uint8") @tvm.testing.fixture def input_b(self, operations): - return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + return np.random.randint(0, 16, (operations, 128), dtype="uint8") @tvm.testing.fixture def input_c(self, operations): diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py index b96265d9df99..0ca8c6ba0c47 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_scalar.py +++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py @@ -18,7 +18,6 @@ """ Test parallelism for multiple different scalar workloads. """ import numpy as np -from numpy.random import default_rng import tvm from tvm.script import tir as T @@ -91,9 +90,10 @@ def evaluate(hexagon_session, operations, expected, sch): func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v68")) module = hexagon_session.load_module(func_tir) - rng = default_rng() - a = rng.random(shape, dtype=dtype) - b = rng.random(shape, dtype=dtype) + # np.random.random returns float64 by default, but make the cast explicit + # to make it easier to switch when necessary. 
+ a = np.random.random(shape).astype(dtype) + b = np.random.random(shape).astype(dtype) c = np.zeros(shape, dtype=dtype) a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 0b6b52335cb5..254eb00cb2ea 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -18,7 +18,6 @@ """Test theoretical bandwith for data transfers to VTCM for different strategies.""" import numpy as np -from numpy.random import default_rng import tvm from tvm.script import tir as T @@ -96,8 +95,7 @@ def evaluate(hexagon_session, sch, size): func_tir = tvm.build(sch.mod["main"], target=get_hexagon_target("v69")) module = hexagon_session.load_module(func_tir) - rng = default_rng() - a = rng.integers(-128, 127, a_shape, dtype="int8") + a = np.random.randint(-128, 127, a_shape, dtype="int8") a_vtcm = np.zeros(a_shape, dtype="int8") a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") From f04ec55ddca482acb26b8c20b1dda4fba2bd0a61 Mon Sep 17 00:00:00 2001 From: Alexey Voronov Date: Thu, 15 Dec 2022 04:51:54 +0300 Subject: [PATCH 061/286] [Metaschedule] Aligning get_top_k logic in MemoryDatabase and JSONDatabase (#13611) [Metaschedule] Align get_top_k logic in MemoryDatabase and JSONDatabase --- src/meta_schedule/database/json_database.cc | 10 ++++- src/meta_schedule/database/memory_database.cc | 12 +++++- .../unittest/test_meta_schedule_database.py | 39 +++++++++++++++++++ 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index bd5183f0cf60..22d6ec849c5f 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -126,16 +126,22 @@ class JSONDatabaseNode : public DatabaseNode { } Array results; results.reserve(top_k); - int counter = 0; for (const TuningRecord& record : this->tuning_records_) { + if (!record->run_secs.defined() || record->run_secs.value().empty()) { + continue; + } if (record->workload.same_as(workload) || WorkloadEqual(GetModuleEquality())(record->workload, workload)) { results.push_back(record); - if (++counter == top_k) { + if (results.size() == static_cast(top_k)) { break; } } } + if (results.size() < static_cast(top_k)) { + LOG(WARNING) << "The size of the GetTopK result is smaller than requested. 
There are not " + "enough valid records in the database for this workload."; + } return results; } diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc index 24fba6dfa105..19178a35f456 100644 --- a/src/meta_schedule/database/memory_database.cc +++ b/src/meta_schedule/database/memory_database.cc @@ -61,8 +61,12 @@ class MemoryDatabaseNode : public DatabaseNode { void CommitTuningRecord(const TuningRecord& record) final { records.push_back(record); } Array GetTopK(const Workload& workload, int top_k) final { + CHECK_GE(top_k, 0) << "ValueError: top_k must be non-negative"; + if (top_k == 0) { + return {}; + } std::vector> results; - results.reserve(this->records.size()); + results.reserve(records.size()); for (const TuningRecord& record : records) { if (!record->run_secs.defined()) { continue; @@ -83,7 +87,7 @@ class MemoryDatabaseNode : public DatabaseNode { std::sort(results.begin(), results.end()); auto begin = results.begin(); auto end = results.end(); - if (static_cast(results.size()) > top_k) { + if (results.size() > static_cast(top_k)) { end = begin + top_k; } Array ret; @@ -92,6 +96,10 @@ class MemoryDatabaseNode : public DatabaseNode { ret.push_back(begin->second); ++begin; } + if (ret.size() < static_cast(top_k)) { + LOG(WARNING) << "The size of the GetTopK result is smaller than requested. There are not " + "enough valid records in the database for this workload."; + } return ret; } diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py index 777c5589a141..4ec10b556c3b 100644 --- a/tests/python/unittest/test_meta_schedule_database.py +++ b/tests/python/unittest/test_meta_schedule_database.py @@ -18,6 +18,7 @@ """Test Meta Schedule Database""" import os.path as osp import tempfile +import pytest from typing import Callable, Optional, List import tvm @@ -536,5 +537,43 @@ def test_meta_schedule_pydatabase_current(): assert ms.database.Database.current() == db +def call_get_top_k(run_secs_list, database, k): + mod: IRModule = Matmul + workload = database.commit_workload(mod) + for run_secs in run_secs_list: + record = ms.database.TuningRecord( + _create_schedule(mod, _schedule_matmul).trace, + workload, + run_secs, + tvm.target.Target("llvm"), + ms.arg_info.ArgInfo.from_prim_func(func=mod["main"]), + ) + database.commit_tuning_record(record) + return [[v.value for v in record.run_secs] for record in database.get_top_k(workload, k)] + + +@pytest.mark.parametrize( + "k,expected", + [(0, []), (3, [[0.0, 2.0], [2.0], [1.5, 4.5]]), (5, [[0.0, 2.0], [2.0], [1.5, 4.5]])], +) +def test_memory_database_get_top_k(k, expected): + run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0]] + database = ms.database.MemoryDatabase() + result = call_get_top_k(run_secs_list, database, k) + assert result == expected + + +@pytest.mark.parametrize( + "k,expected", + [(0, []), (3, [[0.0, 2.0], [2.0], [1.5, 4.5]]), (5, [[0.0, 2.0], [2.0], [1.5, 4.5]])], +) +def test_json_database_get_top_k(k, expected): + run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0]] + with tempfile.TemporaryDirectory() as tmpdir: + database = _create_tmp_database(tmpdir) + result = call_get_top_k(run_secs_list, database, k) + assert result == expected + + if __name__ == "__main__": tvm.testing.main() From 1938273c4c668d46d5d6b4a72e19f38b2862379e Mon Sep 17 00:00:00 2001 From: masahi Date: Thu, 15 Dec 2022 11:15:52 +0900 Subject: [PATCH 062/286] [TOPI] Fix batch_matmul tensorcore legalize for transpose_b = 
False case (#13618) * fixed tensor core batch_matmul legalize for transpose_b = False case * add test * clean up --- python/tvm/topi/cuda/tensorcore_alter_op.py | 32 +++++++++++--- .../relay/test_pass_legalize_tensorcore.py | 43 ++++++++++++++++--- 2 files changed, 63 insertions(+), 12 deletions(-) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py index 0ba428014548..dbbf9e74903c 100644 --- a/python/tvm/topi/cuda/tensorcore_alter_op.py +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -48,14 +48,22 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): x_tensor, y_tensor = arg_types[0], arg_types[1] dtype = x_tensor.dtype + if attrs.transpose_a: + B, K, M = x_tensor.shape + else: + B, M, K = x_tensor.shape + + if attrs.transpose_b: + B, N, K = y_tensor.shape + else: + B, K, N = y_tensor.shape + # Collect the output tensor. output_tensor = arg_types[2] # Collect the input exprs. x, y = inputs - B, M, K = x_tensor.shape - B, N, K = y_tensor.shape if ( isinstance(B, tir.expr.Any) or isinstance(M, tir.expr.Any) @@ -96,9 +104,23 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): return None logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) - x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) if dm or dk else x - y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) if dn or dk else y - out_ = relay.nn.batch_matmul(x_, y_, attrs.out_dtype) + + if attrs.transpose_a: + pad_width = ((0, 0), (0, dk), (0, dm)) + else: + pad_width = ((0, 0), (0, dm), (0, dk)) + + x_ = relay.nn.pad(x, pad_width=pad_width) if dm or dk else x + + if attrs.transpose_b: + pad_width = ((0, 0), (0, dn), (0, dk)) + else: + pad_width = ((0, 0), (0, dk), (0, dn)) + + y_ = relay.nn.pad(y, pad_width=pad_width) if dn or dk else y + + out_ = relay.nn.batch_matmul(x_, y_, **attrs) + out = ( relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in output_tensor.shape]) if dm or dn diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py index 0e3c171d87da..c9782aec1b2c 100644 --- a/tests/python/relay/test_pass_legalize_tensorcore.py +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -277,17 +277,27 @@ def expected(): @tvm.testing.uses_gpu def test_legalize_batch_matmul(): - def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, dtype, do_pad=True): + def _test_legalize_batch_matmul( + data_shape, kernel_shape, pad_shape, dtype, do_pad=True, transpose_a=False, transpose_b=True + ): """test legalize dense to enable tensorcore""" - B, M, _ = data_shape - _, N, _ = kernel_shape + if transpose_a: + B, _, M = data_shape + else: + B, M, _ = data_shape + + if transpose_b: + _, N, _ = kernel_shape + else: + _, _, N = kernel_shape + out_shape = (B, M, N) dm, dk, dn = pad_shape def before(): x = relay.var("x", shape=data_shape, dtype=dtype) weight = relay.var("weight", shape=kernel_shape, dtype=dtype) - y = relay.nn.batch_matmul(x, weight) + y = relay.nn.batch_matmul(x, weight, transpose_a=transpose_a, transpose_b=transpose_b) y = relay.Function([x, weight], y) return y @@ -298,19 +308,31 @@ def legalize_batch_matmul(attrs, inputs, types): def expected(): if not do_pad: return before() + x = relay.var("x", shape=data_shape, dtype=dtype) + weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) + if dm or dk: - x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) + if transpose_a: + x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dk), 
(0, dm))) + else: + x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) else: x_pad = x - weight = relay.var("weight", shape=(kernel_shape), dtype=dtype) + if dn or dk: - weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk))) + if transpose_b: + weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk))) + else: + weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dk), (0, dn))) else: weight_pad = weight + y_pad = relay.nn.batch_matmul( x_pad, weight_pad, + transpose_a=transpose_a, + transpose_b=transpose_b, ) if dm or dn: y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape) @@ -343,6 +365,13 @@ def expected(): _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 16, 0), "int4") _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), "int4", False) + _test_legalize_batch_matmul( + (16, 8, 16), (16, 16, 32), (0, 0, 0), "float16", False, transpose_b=False + ) + _test_legalize_batch_matmul( + (16, 16, 8), (16, 32, 16), (0, 0, 0), "float16", False, transpose_a=True + ) + if __name__ == "__main__": test_legalize_conv2d_NHWC() From 0a3535e24679b2da83da9f2727fa708e990924a1 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Wed, 14 Dec 2022 18:51:08 -0800 Subject: [PATCH 063/286] [Relay] Remove overwriting of matmul shapes when they are static (#13615) In the Relay Matmul shape relation, we are a little over enthusiastic about unifying dynamic shapes. If one of the shapes is static, it does not need to be unified. This change only rewrites dynamic shapes to required static constraints. * Remove overwriting of matmul shapes when they are static * Simplify nesting * Add shape check to dense tests. --- src/relay/op/nn/nn.h | 33 ++++++++++++++++++---------- tests/python/relay/test_op_level1.py | 3 +++ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index f5497a4603bf..cf601ff5f11b 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -113,23 +113,32 @@ bool MatmulRel(const Array& types, int num_inputs, const Attrs& attrs, std::vector B_shape(tensor_b->shape.begin(), tensor_b->shape.end()); auto sa = A_shape.size(); auto sb = B_shape.size(); + size_t index_swap_A; + size_t index_swap_B; if (transpose_a && transpose_b) { - auto tmp = A_shape[sa - 2]; - A_shape[sa - 2] = B_shape[sb - 1]; - B_shape[sb - 1] = tmp; + index_swap_A = sa - 2; + index_swap_B = sb - 1; } else if (transpose_a) { - auto tmp = A_shape[sa - 2]; - A_shape[sa - 2] = B_shape[sb - 2]; - B_shape[sb - 2] = tmp; + index_swap_A = sa - 2; + index_swap_B = sb - 2; } else if (transpose_b) { - auto tmp = A_shape[sa - 1]; - A_shape[sa - 1] = B_shape[sb - 1]; - B_shape[sb - 1] = tmp; + index_swap_A = sa - 1; + index_swap_B = sb - 1; } else { - auto tmp = A_shape[sa - 1]; - A_shape[sa - 1] = B_shape[sb - 2]; - B_shape[sb - 2] = tmp; + index_swap_A = sa - 1; + index_swap_B = sb - 2; } + + // Rewrite dynamic axes to static where constraints allow. + auto tmp = A_shape[index_swap_A]; + if (A_shape[index_swap_A].as()) { + A_shape[index_swap_A] = B_shape[index_swap_B]; + } + if (B_shape[index_swap_B].as()) { + B_shape[index_swap_B] = tmp; + } + + // Update input types with new constrained shapes. 
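+  // Only axes that are still tir::Any get overwritten above; dimensions that
+  // were already static are deliberately left untouched.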
reporter->Assign(types[0], TensorType(A_shape, tensor_a->dtype)); reporter->Assign(types[1], TensorType(B_shape, tensor_b_dtype)); } diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 30d9d88ad7cb..bd4e1b72c3cd 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -25,6 +25,7 @@ import tvm.topi.testing from tvm.contrib.nvcc import have_fp16 import tvm.testing +from tvm.topi.utils import get_const_tuple executor_kind = tvm.testing.parameter("graph", "vm") @@ -695,6 +696,8 @@ def test_dense(executor_kind): w = relay.var("w", relay.TensorType((k, n), dtype)) y = relay.nn.dense(x, w) yy = run_infer_type(y) + # Confirm that input shape has not been rewritten to become dynamic. + assert get_const_tuple(yy.type_args[0].shape) == (4, 2) n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) From dfdaab7f85a465f6e8c8c4b872ba4a59292549b3 Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Thu, 15 Dec 2022 14:08:15 +0800 Subject: [PATCH 064/286] [Frontend] [ONNX] Support sequence_lens of GRU (#13587) [Frontend] [ONNX] Support sequence_lens of GRU. Support convert sequence_lens input of GRU. --- python/tvm/relay/frontend/common.py | 57 ++++++++++++++++++++-- python/tvm/relay/frontend/onnx.py | 18 ++++--- tests/python/frontend/onnx/test_forward.py | 40 ++++++++++++++- 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 5f961f1ae0e8..660426fb4ad5 100755 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -737,6 +737,7 @@ def gru_cell( n_act=_op.tanh, backwards=False, linear_before_reset=True, + sequence_lens=None, ): """ Common implementation of GRU cell for all frontends of TVM @@ -765,7 +766,12 @@ def gru_cell( activation function for new gate. it is tanh by default backwards : bool Flag for reverse pass of GRU - + linear_before_reset : bool + Flag for applying the linear transformation before multiplying by the output of the reset + gate. + sequence_lens : relay.op + Tensor specifying lengths of the sequences in a batch. 
+ Shape = (batch_size) Returns ------- result : List[relay.Expr], relay.Expr, relay.Expr @@ -773,7 +779,40 @@ def gru_cell( """ outputs_list = [] - for x_t in input_seqs if not backwards else reversed(input_seqs): + + seq_len = len(input_seqs) + input_dtype = infer_type(input_seqs[0]).checked_type.dtype + + if sequence_lens is not None: + shape = infer_shape(sequence_lens) + dtype = infer_type(sequence_lens).checked_type.dtype + + arange = _op.arange(_op.const(0), _op.const(seq_len), dtype=dtype) + arange = _op.expand_dims(arange, 1) + sequence_lens = _op.broadcast_to(sequence_lens, [seq_len, shape[0]]) + + # cast to data dtype + mask = _op.less(arange, sequence_lens) + mask = _op.cast(mask, dtype=input_dtype) + mask = _op.expand_dims(mask, 2) + mask_seqs = unbind(mask) + + res_mask = _op.greater_equal(arange, sequence_lens) + res_mask = _op.cast(res_mask, dtype=input_dtype) + res_mask = _op.expand_dims(res_mask, 2) + res_mask_seqs = unbind(res_mask) + + if backwards: + # need a mask to keep intial_h_B correct + initial_h = hidden_state + initial_h_mask = _op.equal(arange, sequence_lens) + initial_h_mask = _op.cast(initial_h_mask, dtype=input_dtype) + initial_h_mask = _op.expand_dims(initial_h_mask, 2) + initial_h_mask_seqs = unbind(initial_h_mask) + + output = _op.zeros(infer_shape(hidden_state), input_dtype) + for i in range(seq_len) if not backwards else reversed(range(seq_len)): + x_t = input_seqs[i] xwt = _op.nn.dense(x_t, w_inp) if linear_before_reset: hwt = _op.nn.dense(hidden_state, w_hid) @@ -806,9 +845,21 @@ def gru_cell( hidden_state = (hidden_state - n_gate) * z_gate + n_gate + if sequence_lens is not None: + hidden_state = hidden_state * mask_seqs[i] + outputs_list.append(hidden_state) # [seq_num, (batch, hidden_size)] - return outputs_list, hidden_state + if sequence_lens is not None: + output = output * res_mask_seqs[i] + hidden_state + else: + output = hidden_state + + # make sure initial_h_B correct + if backwards and sequence_lens is not None: + hidden_state = hidden_state + initial_h * initial_h_mask_seqs[i] + + return outputs_list, output def lstm_cell( diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 3470099100d4..a8ab62602573 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -3126,8 +3126,7 @@ def _inputs_helper(cls, inputs, layout): Wp = inputs[1] Rp = inputs[2] Bp = inputs[3] - # Sequence length currently unused as it can be inferred from shapes. 
- # sequence_lens = inputs['sequence_lens'] + sequence_lens = inputs[4] Hp_0 = inputs[5] num_directions = infer_shape(Wp)[0] @@ -3158,11 +3157,11 @@ def _inputs_helper(cls, inputs, layout): Bs = None if Bp is not None: Bs = _op.split(Bp, num_directions) - return X_steps, H_ts, Ws, Rs, Bs, num_directions + return X_steps, H_ts, Ws, Rs, Bs, num_directions, sequence_lens @classmethod def _impl_common(cls, inputs, attr, layout): - X_steps, H_ts, Ws, Rs, Bs, num_directions = cls._inputs_helper(inputs, layout) + X_steps, H_ts, Ws, Rs, Bs, num_directions, _ = cls._inputs_helper(inputs, layout) acts = cls._get_activations(attr, 1, num_directions, "RNN") weights_dicts = [] @@ -3261,7 +3260,7 @@ def _default_activations(cls, num_directions): @classmethod def _impl_common(cls, inputs, attr, layout): - X_steps, H_ts, Ws, Rs, Bs, num_directions = cls._inputs_helper(inputs, layout) + X_steps, H_ts, Ws, Rs, Bs, num_directions, _ = cls._inputs_helper(inputs, layout) acts = cls._get_activations(attr, 3, num_directions, "LSTM") # cell state @@ -3346,6 +3345,7 @@ def bidir_gru_cell( input_seqs, weight_dicts, acts, + sequence_lens=None, ): """ Bidirectional GRU cell @@ -3356,6 +3356,7 @@ def bidir_gru_cell( **weight_dicts[0], rz_act=acts[0], n_act=acts[1], + sequence_lens=sequence_lens, ) reverse_outputs, rev_H_t = gru_cell( @@ -3364,6 +3365,7 @@ def bidir_gru_cell( rz_act=acts[2], n_act=acts[3], backwards=True, + sequence_lens=sequence_lens, ) final_outputs = [] @@ -3383,7 +3385,9 @@ def _default_activations(cls, num_directions): @classmethod def _impl_common(cls, inputs, attr, layout): - X_steps, H_ts, Ws, Rs, Bs, num_directions = cls._inputs_helper(inputs, layout) + X_steps, H_ts, Ws, Rs, Bs, num_directions, sequence_lens = cls._inputs_helper( + inputs, layout + ) acts = cls._get_activations(attr, 2, num_directions, "GRU") linear_before_reset = attr.get("linear_before_reset", 0) @@ -3412,6 +3416,7 @@ def _impl_common(cls, inputs, attr, layout): input_seqs=X_steps, weight_dicts=weights_dicts, acts=acts, + sequence_lens=sequence_lens, ) else: # outputs shape = [seqs_num, (batch_size, hidden_size)] @@ -3420,6 +3425,7 @@ def _impl_common(cls, inputs, attr, layout): **weights_dicts[0], rz_act=acts[0], n_act=acts[1], + sequence_lens=sequence_lens, ) # output shape = (seqs_num, num_directions, batch_size, hidden_size) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index dcd4f2defbe8..92a87ff6a72c 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -3897,6 +3897,7 @@ def verify_rnn( atol=1e-5, target=None, dev=None, + use_sequence_lens=False, ): """verify_rnn""" if rnn_type == "RNN": @@ -3954,10 +3955,16 @@ def register(np_arr, name, shape=None): ) register(b_np, "B") + if use_sequence_lens: + sequence_np = np.random.uniform(0, seq_length, size=(batch_size)).astype("int32") + register(sequence_np, "sequence_lens") + if use_initial_state: assert use_bias is True, "Initial states must have bias specified." 
- sequence_np = np.repeat(seq_length, batch_size).astype("int32") - register(sequence_np, "sequence_lens") + + if not use_sequence_lens: + sequence_np = np.repeat(seq_length, batch_size).astype("int32") + register(sequence_np, "sequence_lens") if layout == 1: initial_h_np = np.random.uniform(size=(batch_size, directions, hidden_size)).astype( @@ -4211,6 +4218,35 @@ def verify_rnn_helper(target, dev, rnn_type): # dev=dev, # ) + # Testing with initial state + if rnn_type == "GRU": + verify_rnn( + seq_length=2, + batch_size=1, + input_size=16, + hidden_size=32, + use_bias=True, + use_initial_state=True, + rnn_type=rnn_type, + directions=directions, + target=target, + dev=dev, + use_sequence_lens=True, + ) + verify_rnn( + seq_length=8, + batch_size=8, + input_size=16, + hidden_size=32, + use_bias=True, + use_initial_state=True, + rnn_type=rnn_type, + directions=directions, + target=target, + dev=dev, + use_sequence_lens=True, + ) + # Testing with peepholes if rnn_type == "LSTM": verify_rnn( From 662ccfb76bd1cb7b6550293417951a280a71ea8f Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 15 Dec 2022 11:10:17 +0000 Subject: [PATCH 065/286] [ETHOSN] Add support for experimental compiler option (#13410) * [ETHOSN] Add support for experimental compiler option The support library currently supports enabling the experimental cascading compiler option via an environment variable `FORCE_EXPERIMENTAL_COMPILER`. This commit exposes the ability to enable this option through TVMC. --- src/relay/backend/contrib/ethosn/codegen.cc | 10 +++- .../backend/contrib/ethosn/codegen_ethosn.h | 4 ++ .../contrib/test_ethosn/test_codegen.py | 54 +++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index edf7caca820d..0fed73d2a35e 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -713,9 +713,17 @@ runtime::ethosn::OrderedCompiledNetwork EthosnCompiler::CompileEthosnFunc(const auto network_with_ids = ConstructNetwork(mod, gvar, func); // Now set the required build flags sl::CompilationOptions options = CreateOptions(); - // Finally compile the network + // Set the experimental compiler if enabled, for now this is not part of the + // support library compilation options. 
+ bool experimental_compiler = GetCompilerAttrs()->experimental_compiler; + if (experimental_compiler) { + setenv("FORCE_EXPERIMENTAL_COMPILER", "1", 1); + } std::vector> compiled_networks = sl::Compile(*network_with_ids.network, options); + if (experimental_compiler) { + unsetenv("FORCE_EXPERIMENTAL_COMPILER"); + } ICHECK_GE(compiled_networks.size(), 1) << "Ethos-N compiler failed to compile network"; auto compiled_network = std::move(compiled_networks[0]); // Determine the order that the inputs/outputs are in and how that corresponds to the diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index 7c52da713c5c..118292b45f84 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -252,6 +252,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode Date: Fri, 16 Dec 2022 02:57:35 +0800 Subject: [PATCH 066/286] [TVMScript] Fix print round-tripable multi thread env binding (#13622) * Fix print round-tripable multi thread env binding * add unittest --- src/printer/tvmscript_printer.cc | 1 - tests/python/unittest/test_tvmscript_roundtrip.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 7fb1129d274e..274b9542cc92 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -1045,7 +1045,6 @@ Doc TVMScriptPrinter::VisitStmt_(const AttrStmtNode* op) { << ")"; doc << Doc::NewLine() << PrintBody(op->body); } - TryDeallocVar(iter_var->var); return doc; } } diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 0ead66bd609f..c0174a0671c0 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3537,6 +3537,19 @@ def func(A: T.Buffer[1, "bool"], i: T.bool, j: T.bool, k: T.bool): yield generator +def multi_env_threads(): + @T.prim_func + def func(A: T.Buffer[128, "float32"], C: T.Buffer[128, "float32"]): + B = T.alloc_buffer([128], dtype="float32") + for i in T.thread_binding(128, thread="threadIdx.x"): + B[i] = A[i] + 1.0 + for i in T.thread_binding(128, thread="threadIdx.x"): + C[i] = B[i] + 2.0 + + mod = tvm.tir.transform.LowerOpaqueBlock()(tvm.IRModule.from_expr(func)) + return mod["main"] + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3593,6 +3606,7 @@ def func(A: T.Buffer[1, "bool"], i: T.bool, j: T.bool, k: T.bool): elif_chain_without_else, elif_chain_with_else, *nested_boolean_expressions(), + multi_env_threads, ) From 6f8f450f30982481fa60977dbddcfc829f6d1d03 Mon Sep 17 00:00:00 2001 From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com> Date: Thu, 15 Dec 2022 17:30:18 -0600 Subject: [PATCH 067/286] [TOPI][Hexagon] Implement global_avg_pool2d for hexagon (#13614) * [TOPI][Hexagon] Implement global_avg_pool2d for hexagon * Fix name * Fix lint issues * Use get_hexagon_target() --- python/tvm/topi/hexagon/qnn/__init__.py | 1 + .../tvm/topi/hexagon/qnn/global_avg_pool2d.py | 95 ++++++++++ python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + .../hexagon/slice_ops/global_avg_pool2d.py | 52 ++++++ python/tvm/topi/hexagon/utils.py | 12 ++ .../contrib/test_hexagon/infrastructure.py | 13 ++ .../topi/slice_op/test_global_avg_pool2d.py | 167 ++++++++++++++++++ 7 files changed, 341 insertions(+) create mode 100755 python/tvm/topi/hexagon/qnn/global_avg_pool2d.py 
create mode 100755 python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py
 create mode 100755 tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index d63b69b2e259..d41d8854d7d1 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -28,3 +28,4 @@
 from .nn import *
 from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule
 from .adaptive_avg_pool1d import *
+from .global_avg_pool2d import *
diff --git a/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py
new file mode 100755
index 000000000000..1c171be8976e
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/global_avg_pool2d.py
@@ -0,0 +1,95 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Assumptions:
+1) The input is in NCHW layout. Squeezenet is the only model that calls
+   nn.global_avg_pool2d and the only layout it uses is 'NCHW'.
+2) Both input and output dtypes are uint8 and
+   quantization parameters are provided to the op.
+3) Input is assumed to always be multiple of fixed chunk 32c8h8w.
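+Note: averaging is done in fixed point. The combined scale
+   input_scale / (output_scale * pool_area) is converted by
+   get_fixed_point_value into an integer multiplier plus a right shift.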
+""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value, saturate + + +def global_avg_pool2d_u8( + data: te.Tensor, + odtype: str, + input_zero_point: int, + input_scale: float, + output_zero_point: int, + output_scale: float, +): + """global_avg_pool2d""" + input_b, input_c, input_h, input_w = data.shape + oshape = (input_b, input_c) + (1, 1) + + if input_h * input_w < 256: + bits = "16" + else: + bits = "32" + + if odtype == "uint8": + temp_dtype = "uint" + bits + elif odtype == "int8": + temp_dtype = "int" + bits + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + + pool_area = input_h * input_w + rh_r = te.reduce_axis((0, input_h), name="rh_r") + rw_r = te.reduce_axis((0, input_w), name="rw_r") + + scale_with_area = input_scale / (output_scale * int(pool_area)) + scale_fixed_point, rsh = get_fixed_point_value(scale_with_area, "int16") + corr = (output_zero_point << rsh) - input_zero_point * pool_area * scale_fixed_point + + sum_compute = te.compute( + oshape, + lambda n, c, h, w: te.sum( + data[n, c, h + rh_r, w + rw_r].astype(temp_dtype), axis=[rh_r, rw_r] + ), + name="sum", + ) + + avg_compute = te.compute( + oshape, + lambda n, c, h, w: saturate( + ((sum_compute[n, c, h, w] * scale_fixed_point) + corr) >> rsh, odtype + ).astype(odtype), + name="global_avg_pool2d", + ) + + return avg_compute + + +def stir_global_avg_pool2d_u8_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): + """Schedule""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + sum_block = s.get_block("sum") + + # Input is multiple of fixed chunk but output is NxCx1x1 + # Hence transform_layout is only applied on input + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) + + return s diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 5f86e706af50..6b17b64489a9 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -36,3 +36,4 @@ from .tanh import tanh_te_compute, tanhf16_schedule from .dwconv2d import * from .depth_to_space import d2s_compute, d2s_schedule +from .global_avg_pool2d import * diff --git a/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py new file mode 100755 index 000000000000..30222c11bb54 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/global_avg_pool2d.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Assumptions: +1) The input is in NCHW layout. Squeezenet is the only model that calls + nn.global_avg_pool2d and the only layout it uses is 'NCHW'. 
+2) The op takes input data as an argument. +3) Both input and output dtype is float32 and +4) Input is assumed to always be multiple of fixed chunk 32c8h4w. +""" + +from tvm import te +from tvm import tir +from tvm import topi +from ..utils import get_layout_transform_fn + + +def global_avg_pool2d( + data: te.Tensor, +): + """global_avg_pool2d""" + return topi.nn.global_pool(data, "avg", "NCHW") + + +def stir_global_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, input_layout: str): + """Schedule""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + + sum_block = s.get_block("adaptive_pool_sum") + + # Input is multiple of fixed chunk but output is NxCx1x1 + # Hence transform_layout is only applied on input + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(sum_block, buffer=("read", 0), index_map=input_transformed_layout) + + return s diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 5aeed9aa4fde..78ed21e8a13b 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -136,6 +136,14 @@ def ncw_32c64w_2d(n, c, w): return [n, c // 32, w // 64, te.AXIS_SEPARATOR, c % 32, w % 64] +def nchw_32c8h8w_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 8, te.AXIS_SEPARATOR, c % 32, h % 8, w % 8] + + +def nchw_32c8h4w_2d(n, c, h, w): + return [n, c // 32, h // 8, w // 4, te.AXIS_SEPARATOR, c % 32, h % 8, w % 4] + + def get_layout_transform_fn(layout): """Return index map function as per the layout string""" if layout == "nhwc-8h2w32c2w-2d": @@ -180,6 +188,10 @@ def get_layout_transform_fn(layout): return ohwi32o_1d if layout == "ncw-32c64w-2d": return ncw_32c64w_2d + if layout == "nchw-32c8h8w-2d": + return nchw_32c8h8w_2d + if layout == "nchw-32c8h4w-2d": + return nchw_32c8h4w_2d raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index fcb811fce742..e81c24694ef9 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -277,6 +277,19 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): raise RuntimeError(f"Unexpected new_layout '{new_layout}'") + if current_layout == "nchw": + if new_layout in ["nchw-32c8h8w-2d", "nchw-32c8h8w-1d"]: + n, c, h, w = arr_np.shape + return arr_np.reshape([n, c // 32, 32, h // 8, 8, w // 8, 8]).transpose( + 0, 1, 3, 5, 2, 4, 6 + ) + if new_layout in ["nchw-32c8h4w-2d", "nchw-32c8h4w-1d"]: + n, c, h, w = arr_np.shape + return arr_np.reshape([n, c // 32, 32, h // 8, 8, w // 4, 4]).transpose( + 0, 1, 3, 5, 2, 4, 6 + ) + raise RuntimeError(f"Unexpected new_layout '{new_layout}'") + raise RuntimeError(f"Unexpected current_layout '{current_layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py new file mode 100755 index 000000000000..3f7e999c7bca --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_global_avg_pool2d.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Test code for float16 and uint8 global_avg_pool2d.""" + +import numpy as np + +import tvm +from tvm import te +from tvm.topi.testing import adaptive_pool +import tvm.topi.hexagon.qnn as qn +import tvm.topi.hexagon.slice_ops as sl +from tvm.contrib.hexagon import allocate_hexagon_array +from ...infrastructure import transform_numpy, quantize_np, get_hexagon_target + + +SCALE_M_VAL = None +ZERO_POINT_M_VAL = None +SCALE_VAL = None +ZERO_POINT_VAL = None + + +class TestGlobalPool2D: + (input_shape,) = tvm.testing.parameters( + ([1, 32, 8, 8],), + ([1, 1056, 16, 16],), + ) + + # Fixed chunk layout is set as nchw-32c8h8w-2d for uint8 and nchw-32c8h4w-2d for float16. + # For optimization, it might get changed later. + # Since output shape will be NxCx1x1 which is not a + # multiple of fixed-chunk, output_layout is NCHW. + input_layout, output_layout, pool_type, layout, dtype = tvm.testing.parameters( + ("nchw-32c8h8w-2d", "nchw", "avg", "NCHW", "uint8"), + ("nchw-32c8h4w-2d", "nchw", "avg", "NCHW", "float16"), + ) + + @tvm.testing.fixture + def expected_output_np( + self, + input_np, + pool_type, + layout, + ): + """Generate expected output.""" + ref_np = tvm.topi.testing.adaptive_pool( + input_np, + (1, 1), + pool_type, + layout, + ) + return ref_np + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + if dtype in ("uint8", "int8"): + dtype = "float32" + return np.random.random(input_shape).astype(dtype) + + @tvm.testing.fixture + def quantize_input_np(self, input_np, dtype): + if dtype in ("uint8", "int8"): + global ZERO_POINT_VAL, SCALE_VAL + input_np_quantized, SCALE_VAL, ZERO_POINT_VAL = quantize_np(input_np, dtype) + return input_np_quantized + + @tvm.testing.fixture + def transformed_input_np(self, input_np, quantize_input_np, input_layout, layout, dtype): + if dtype == "float16": + return transform_numpy(input_np, layout.lower(), input_layout) + if dtype in ("uint8", "int8"): + return transform_numpy(quantize_input_np, layout.lower(), input_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + @tvm.testing.fixture + def quantize_expected_output_np(self, expected_output_np, dtype): + if dtype in ("uint8", "int8"): + global ZERO_POINT_M_VAL, SCALE_M_VAL + out_ref_quantized, SCALE_M_VAL, ZERO_POINT_M_VAL = quantize_np( + expected_output_np, dtype + ) + + # Since output_layout is nchw, no transformation is needed. 
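+            # (The reference output is quantized with its own derived
+            # scale/zero-point, which are also passed to the kernel via
+            # SCALE_M_VAL and ZERO_POINT_M_VAL.)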
+ return out_ref_quantized + + @tvm.testing.requires_hexagon + def test_global_pool2d( + self, + dtype, + input_shape, + input_layout, + transformed_input_np, + expected_output_np, + quantize_expected_output_np, + hexagon_session, + ): + a_tensor = te.placeholder(input_shape, name="a_tensor", dtype=dtype) + + if dtype == "float16": + m_tensor = sl.global_avg_pool2d(a_tensor) + tir_schedule = sl.stir_global_avg_pool2d_schedule(m_tensor, a_tensor, input_layout) + elif dtype in ["uint8", "int8"]: + m_tensor = qn.global_avg_pool2d_u8( + a_tensor, + dtype, + ZERO_POINT_VAL, + SCALE_VAL, + ZERO_POINT_M_VAL, + SCALE_M_VAL, + ) + tir_schedule = qn.stir_global_avg_pool2d_u8_schedule(m_tensor, a_tensor, input_layout) + + sch = tir_schedule.mod + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [a_tensor, m_tensor], + get_hexagon_target("v69"), + name="global_pool2d", + ) + + input_axis_separator = [4] + + a_data_nd = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + dtype=dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + m_data_nd = allocate_hexagon_array( + hexagon_session.device, + expected_output_np.shape, + dtype=dtype, + ) + + mod = hexagon_session.load_module(func) + mod(a_data_nd, m_data_nd) + + # Convert nd to np + m_data_np = m_data_nd.numpy() + + if dtype == "float16": + np.testing.assert_allclose(expected_output_np, m_data_np, rtol=1e-3, atol=1e-3) + elif dtype in ["int8", "uint8"]: + np.testing.assert_allclose(quantize_expected_output_np, m_data_np, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() From 9e162f4ce4940a2df2c4b434696c3aa079784fc5 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Thu, 15 Dec 2022 17:24:55 -0800 Subject: [PATCH 068/286] =?UTF-8?q?Add=20check=20for=20non-contiguous=20me?= =?UTF-8?q?mory=20access=20when=20lowering=20to=20async=20dma=E2=80=A6=20(?= =?UTF-8?q?#13613)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add check for non-contiguous memory access when lowering to async dma copies. * lint * lint and nits * lint --- src/tir/transforms/lower_async_dma.cc | 28 +++ .../test_hexagon/test_async_dma_pipeline.py | 206 ++++++++++++++++++ 2 files changed, 234 insertions(+) diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 9a950c10c776..94769dae0899 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -22,6 +22,7 @@ */ #include +#include #include #include @@ -34,6 +35,12 @@ class AsyncDMALowerer : public StmtExprMutator { public: explicit AsyncDMALowerer(bool dma_bypass_cache) : dma_bypass_cache_(dma_bypass_cache) {} + // Create member statement to track a mapping from iter var to iter range + Stmt VisitStmt_(const ForNode* op) final { + input_iters.Set(op->loop_var, Range(op->min, op->extent)); + return StmtExprMutator::VisitStmt_(op); + } + Stmt VisitStmt_(const AttrStmtNode* op) final { // Convert this, for example: // attr [0] "async_wait_queue_scope" = 0; @@ -146,6 +153,17 @@ class AsyncDMALowerer : public StmtExprMutator { // map loop variable to zero for the store index & simplify Array store_index = bufferstorenode->indices; + + // Use DetectIterMap to detect whether store index is non-contiguous. 
+ arith::Analyzer analyzer; + auto store_iter_map = DetectIterMap(store_index, input_iters, 1, arith::IterMapLevel::NoCheck, + &analyzer, false); + if (!store_iter_map->errors.empty()) { + LOG(FATAL) + << "Unable to lower async dma for non contiguous memory access with store index: " + << store_index; + } + store_index.MutateByApply([&](PrimExpr expr) { arith::Analyzer analyzer; return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap)); @@ -153,6 +171,15 @@ class AsyncDMALowerer : public StmtExprMutator { // map loop variable to zero for the load index & simplify Array load_index = bufferloadnode->indices; + + // Use DetectIterMap to detect whether load index is non-contiguous. + auto load_iter_map = + DetectIterMap(load_index, input_iters, 1, arith::IterMapLevel::NoCheck, &analyzer, false); + if (!load_iter_map->errors.empty()) { + LOG(FATAL) << "Unable to lower async dma for non contiguous memory access with load index: " + << load_index; + } + load_index.MutateByApply([&](PrimExpr expr) { arith::Analyzer analyzer; return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap)); @@ -176,6 +203,7 @@ class AsyncDMALowerer : public StmtExprMutator { private: std::set queue_ids_; bool dma_bypass_cache_; + Map input_iters = Map(); }; namespace transform { diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 51427f18f6f4..914a26c51180 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -25,6 +25,193 @@ VRMPY_SIZE_B = 128 VRMPY_SIZE_INT32 = 32 +# pylint: disable=invalid-name +@T.prim_func +def conv2d_async_non_contig( + p0: T.Buffer[(T.int64(1), T.int64(1), T.int64(56), T.int64(56), T.int64(4)), "uint8"], + fused_constant_1: T.Buffer[ + (T.int64(1), T.int64(1), T.int64(3), T.int64(3), T.int64(1), T.int64(32), T.int64(4)), + "uint8", + ], + conv2d_NCHWc_int8: T.Buffer[ + (T.int64(1), T.int64(1), T.int64(54), T.int64(54), T.int64(32)), "int32" + ], +): + """Non contiguous memory access is used in this conv2d taken from MS.""" + # pylint: disable=no-self-argument + # function attr dict + T.func_attr({"tir.noalias": True, "global_symbol": "main"}) + # body + # with T.block("root") + p0_global_vtcm = T.alloc_buffer( + [T.int64(1), T.int64(1), T.int64(56), T.int64(56), T.int64(4)], + dtype="uint8", + scope="global.vtcm", + ) + fused_constant_global_vtcm = T.alloc_buffer( + [T.int64(1), T.int64(1), T.int64(3), T.int64(3), T.int64(1), T.int64(32), T.int64(4)], + dtype="uint8", + scope="global.vtcm", + ) + for oh_0 in T.serial(T.int64(3)): + for ow_0 in T.serial( + T.int64(3), + annotations={ + "software_pipeline_async_stages": [0], + "software_pipeline_order": [0, 1, 2], + "software_pipeline_stage": [0, 0, 1], + }, + ): + for ax0_ax1_ax2_ax3_ax4_fused in T.serial(T.int64(1600)): + with T.block("p0_global.vtcm"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + v1 = T.axis.spatial(T.int64(1), T.int64(0)) + v2 = T.axis.spatial( + T.int64(56), oh_0 * T.int64(18) + ax0_ax1_ax2_ax3_ax4_fused // T.int64(80) + ) + v3 = T.axis.spatial( + T.int64(56), + ow_0 * T.int64(18) + ax0_ax1_ax2_ax3_ax4_fused % T.int64(80) // T.int64(4), + ) + v4 = T.axis.spatial(T.int64(4), ax0_ax1_ax2_ax3_ax4_fused % T.int64(4)) + T.reads(p0[v0, v1, v2, v3, v4]) + T.writes(p0_global_vtcm[v0, v1, v2, v3, v4]) + p0_global_vtcm[v0, v1, v2, v3, v4] = p0[v0, v1, v2, v3, v4] + for ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused in T.serial(T.int64(1152)): + 
with T.block("fused_constant_global.vtcm"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + v1 = T.axis.spatial(T.int64(1), T.int64(0)) + v2 = T.axis.spatial( + T.int64(3), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused // T.int64(384) + ) + v3 = T.axis.spatial( + T.int64(3), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(384) // T.int64(128) + ) + v4 = T.axis.spatial(T.int64(1), T.int64(0)) + v5 = T.axis.spatial( + T.int64(32), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(128) // T.int64(4) + ) + v6 = T.axis.spatial(T.int64(4), ax0_ax1_ax2_ax3_ax4_ax5_ax6_fused % T.int64(4)) + T.reads(fused_constant_1[v0, v1, v2, v3, v4, v5, v6]) + T.writes(fused_constant_global_vtcm[v0, v1, v2, v3, v4, v5, v6]) + fused_constant_global_vtcm[v0, v1, v2, v3, v4, v5, v6] = fused_constant_1[ + v0, v1, v2, v3, v4, v5, v6 + ] + for oh_1, ow_1 in T.grid(T.int64(3), T.int64(6)): + for oh_2_init, ow_2_init in T.grid(T.int64(6), T.int64(3)): + with T.block("conv2d_NCHWc_int8_o_init"): + v_n = T.axis.spatial(T.int64(1), T.int64(0)) + v_oc_chunk = T.axis.spatial(T.int64(1), T.int64(0)) + v_oh = T.axis.spatial( + T.int64(54), oh_0 * T.int64(18) + oh_1 * T.int64(6) + oh_2_init + ) + v_ow = T.axis.spatial( + T.int64(54), ow_0 * T.int64(18) + ow_1 * T.int64(3) + ow_2_init + ) + T.reads() + T.writes( + conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32)] + ) + for oc_block_1 in T.vectorized(T.int64(32)): + with T.block("conv2d_NCHWc_int8_init"): + v_oc_block_i_init = T.axis.spatial(T.int64(32), oc_block_1) + T.reads() + T.writes( + conv2d_NCHWc_int8[ + v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i_init + ] + ) + conv2d_NCHWc_int8[ + v_n, v_oc_chunk, v_oh, v_ow, v_oc_block_i_init + ] = 0 + for kh_1, kw_1, oh_2, ow_2 in T.grid( + T.int64(3), T.int64(3), T.int64(6), T.int64(3) + ): + with T.block("conv2d_NCHWc_int8_o_update"): + v_n = T.axis.spatial(T.int64(1), T.int64(0)) + v_oc_chunk = T.axis.spatial(T.int64(1), T.int64(0)) + v_oh = T.axis.spatial( + T.int64(54), oh_0 * T.int64(18) + oh_1 * T.int64(6) + oh_2 + ) + v_ow = T.axis.spatial( + T.int64(54), ow_0 * T.int64(18) + ow_1 * T.int64(3) + ow_2 + ) + v_kh, v_kw = T.axis.remap("RR", [kh_1, kw_1]) + v_ic_outer = T.axis.reduce(T.int64(1), T.int64(0)) + v_ic_f_inner = T.axis.reduce(T.int64(1), T.int64(0)) + T.reads( + conv2d_NCHWc_int8[ + v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32) + ], + p0_global_vtcm[ + v_n, + v_ic_outer, + v_oh + v_kh, + v_ow + v_kw, + v_ic_f_inner * T.int64(4) : v_ic_f_inner * T.int64(4) + T.int64(4), + ], + fused_constant_global_vtcm[ + v_oc_chunk, + v_ic_outer, + v_kh, + v_kw, + v_ic_f_inner, + T.int64(0) : T.int64(32), + T.int64(0) : T.int64(4), + ], + ) + T.writes( + conv2d_NCHWc_int8[v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32)] + ) + A = T.match_buffer( + p0_global_vtcm[ + v_n, + v_ic_outer, + v_oh + v_kh, + v_ow + v_kw, + v_ic_f_inner * T.int64(4) : v_ic_f_inner * T.int64(4) + T.int64(4), + ], + [T.int64(4)], + dtype="uint8", + scope="global.vtcm", + offset_factor=1, + ) + B = T.match_buffer( + fused_constant_global_vtcm[ + v_oc_chunk, + v_ic_outer, + v_kh, + v_kw, + v_ic_f_inner, + T.int64(0) : T.int64(32), + T.int64(0) : T.int64(4), + ], + [T.int64(32), T.int64(4)], + dtype="uint8", + scope="global.vtcm", + offset_factor=1, + ) + C = T.match_buffer( + conv2d_NCHWc_int8[ + v_n, v_oc_chunk, v_oh, v_ow, T.int64(0) : T.int64(32) + ], + [T.int64(32)], + dtype="int32", + offset_factor=1, + ) + A_u8x4: T.uint8x4 = A[T.int64(0) : T.int64(4)] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x128 = B[T.int64(0), T.int64(0) 
: T.int64(128)] + B_i32x32: T.int32x32 = T.reinterpret(B_i8x128, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.acc.128B"), + T.uint32(3), + C[0:32], + B_i32x32, + A_i32, + dtype="int32x32", + ) + def conv_approximation(size_a, size_w): """Conv approximation.""" @@ -695,5 +882,24 @@ def test_meta(hexagon_session): ) +def test_non_contiguous(): + """Test Non Contiguous memory lowering.""" + sch = tvm.tir.Schedule(conv2d_async_non_contig) + target_hexagon = tvm.target.hexagon("v68", link_params=True) + err_rgx = r"Unable to lower async dma for non contiguous memory access with load index: " + # Currently we do not support non contiguous memory access being lowered to + # async dma so we throw an error. + with pytest.raises(tvm.TVMError, match=err_rgx): + with tvm.transform.PassContext( + config={ + "tir.use_async_copy": 1, + "tir.merge_async_commit_queue_scope": 0, + } + ): + tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + + if __name__ == "__main__": tvm.testing.main() From ac92ad98f53f71aa156893903965594a2571d848 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 16 Dec 2022 11:08:43 +0900 Subject: [PATCH 069/286] [MetaSchedule] Fix tensorcore winograd task extraction (#13625) * [MetaSchedule] Fix tensorcore winograd task extraction * add test * fixed target --- python/tvm/relay/op/strategy/cuda.py | 2 ++ .../test_meta_schedule_relay_integration.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 312ec0fe2f97..cc438092666a 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -261,6 +261,8 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target): ) if ( target.kind.name == "cuda" + and not is_auto_scheduler_enabled() + and not is_meta_schedule_enabled() and nvcc.have_tensorcore(target=target) and ( (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index 062da0b00ca3..604f337099b0 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -108,6 +108,24 @@ def test_meta_schedule_integration_extract_from_resnet(): assert t.task_name in expected_task_names, t.task_name +@requires_torch +def test_task_extraction_winograd_tensorcore(): + mod, params, _ = get_network(name="resnet_50", input_shape=[16, 3, 224, 224]) + seq = tvm.transform.Sequential( + [ + relay.transform.ToMixedPrecision("float16"), + relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "HWIO"]}), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + + target = tvm.target.Target("nvidia/geforce-rtx-3070") + extracted_tasks = ms.relay_integration.extract_tasks(mod, target=target, params=params) + + assert len([t for t in extracted_tasks if "winograd" in t.task_name]) == 4 + + @requires_torch def test_task_extraction_anchor_block(): mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) From 80c02336a2261d1d046165921fc37e489a431482 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 15 Dec 2022 23:29:17 -0800 Subject: [PATCH 070/286] [COMMUNITY] Min Chen -> Reviewer (#13628) Please join us to welcome @multiverstack-intellif as a new reviewer to TVM. 
Min contributed key features in TIR scheduling, specifically, cache_read/write that are aware of cache location. These are huge improvements that are technically profound and helpful to the overall TVM stack. Therefore, it would be good to get more opportunities for him to participate more deeply in the community. - [Commits History](https://github.com/apache/tvm/commits?author=multiverstack-intellif) - [Code Review](https://github.com/apache/tvm/pulls?q=reviewed-by:multiverstack-intellif) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9fc6423d1e76..84615e9fc60b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -202,6 +202,7 @@ We do encourage everyone to work anything they are interested in. - [Hao Yu](https://github.com/comaniac): @comaniac - [Joshua Z. Zhang](https://github.com/zhreshold): @zhreshold - [Lianmin Zheng](https://github.com/merrymercy): @merrymercy +- [Min Chen](https://github.com/multiverstack-intellif): @multiverstack-intellif - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh ## List of Contributors From 54358e097775a9fe0b967e11143b5ad75751d073 Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Fri, 16 Dec 2022 15:23:26 +0200 Subject: [PATCH 071/286] [BugFix][UMA] Protect target registration (#13624) This PR address fixes for UMA target registration. * Fix the doc issue #13304 * Continues stalled PR #12731 Changes: * Incorporates all proposed fixes from mentioned [PR #12731](https://github.com/apache/tvm/pull/12731) * Address test case concerns and discussions from [PR #12731](https://github.com/apache/tvm/pull/12731) * **NEW:** Already exiting target cannot be created, explicit error on this. * **NEW:** Attributes having special/reserved scope cannot be created explicitly. It also address proper test cases for all the above. --- gallery/tutorial/uma.py | 2 +- .../tvm/relay/backend/contrib/uma/backend.py | 7 +++--- src/relay/backend/contrib/uma/targets.cc | 24 +++++++++++------- tests/python/contrib/test_uma/test_target.py | 25 ++++++++++++++++--- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/gallery/tutorial/uma.py b/gallery/tutorial/uma.py index ed4fc4cf805c..ea38813a7ace 100644 --- a/gallery/tutorial/uma.py +++ b/gallery/tutorial/uma.py @@ -57,7 +57,7 @@ # ###################################################################### -# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/uma_vanilla_block_diagram.png +# .. 
image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/tutorial/uma_vanilla_block_diagram.png # :width: 100% # :alt: A block diagram of Vanilla # diff --git a/python/tvm/relay/backend/contrib/uma/backend.py b/python/tvm/relay/backend/contrib/uma/backend.py index 40ec06e45367..550109f1700d 100644 --- a/python/tvm/relay/backend/contrib/uma/backend.py +++ b/python/tvm/relay/backend/contrib/uma/backend.py @@ -278,11 +278,12 @@ def register(self) -> None: """ registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget") - for name, attr in self._target_attrs: + for name, attr in self._target_attrs.items(): if attr is None: raise ValueError("Target attribute None is not supported.") - - if registration_func(self.target_name, self._target_attrs): + # skip if target is already registered + if self.target_name not in tvm.target.Target.list_kinds(): + registration_func(self.target_name, self._target_attrs) self._relay_to_relay.register() self._relay_to_tir.register() self._tir_to_runtime.register() diff --git a/src/relay/backend/contrib/uma/targets.cc b/src/relay/backend/contrib/uma/targets.cc index ed2cc047cf2f..e2fe644cb9bf 100644 --- a/src/relay/backend/contrib/uma/targets.cc +++ b/src/relay/backend/contrib/uma/targets.cc @@ -31,7 +31,7 @@ namespace tvm { namespace relay { namespace contrib { namespace uma { -tvm::transform::Pass RelayToTIR(String target_name); +transform::Pass RelayToTIR(String target_name); runtime::Module TIRToRuntime(IRModule mod, Target target); } // namespace uma } // namespace contrib @@ -39,16 +39,15 @@ runtime::Module TIRToRuntime(IRModule mod, Target target); TVM_REGISTER_GLOBAL("relay.backend.contrib.uma.RegisterTarget") .set_body_typed([](String target_name, Map attr_options) -> bool { - // @todo(cgerum): We probably should get rid of target.register rather sooner than later - // And use a proper registry for uma backends - for (const String registered_target_name : ::tvm::TargetKindRegEntry::ListTargetKinds()) { + // create only new target and init only once + for (const String registered_target_name : TargetKindRegEntry::ListTargetKinds()) { if (registered_target_name == target_name) { - return false; + LOG(FATAL) << "TVM UMA Error: Target is already registered: " << target_name; } } auto target_kind = - ::tvm::TargetKindRegEntry::RegisterOrGet(target_name) + TargetKindRegEntry::RegisterOrGet(target_name) .set_name() .set_default_device_type(kDLCPU) .add_attr_option>("keys") @@ -58,20 +57,27 @@ TVM_REGISTER_GLOBAL("relay.backend.contrib.uma.RegisterTarget") .add_attr_option>("libs") .add_attr_option("host") .add_attr_option("from_device") - .set_attr(tvm::attr::kRelayToTIR, + .set_attr(attr::kRelayToTIR, relay::contrib::uma::RelayToTIR(target_name)) .set_attr("TIRToRuntime", relay::contrib::uma::TIRToRuntime); + // target kind attrs inventory + auto kind = TargetKind::Get(target_name).value(); + auto list_attrs = TargetKindRegEntry::ListTargetKindOptions(kind); + for (auto& attr_option : attr_options) { auto option_name = attr_option.first; auto default_value = attr_option.second; + if (list_attrs.find(option_name) != list_attrs.end()) { + LOG(FATAL) << "TVM UMA Error: Attribute is already registered: " << option_name; + } if (default_value->IsInstance()) { target_kind.add_attr_option(option_name, Downcast(default_value)); } else if (default_value->IsInstance()) { target_kind.add_attr_option(option_name, Downcast(default_value)); } else { - LOG(FATAL) << "Only String, Integer, or Bool are supported. 
Given attribute option type: " - << attr_option.second->GetTypeKey(); + LOG(FATAL) << "TypeError: Only String, Integer, or Bool are supported. " + << "Given attribute option type: " << attr_option.second->GetTypeKey(); } } return true; diff --git a/tests/python/contrib/test_uma/test_target.py b/tests/python/contrib/test_uma/test_target.py index 558c4e518230..1662becf088d 100644 --- a/tests/python/contrib/test_uma/test_target.py +++ b/tests/python/contrib/test_uma/test_target.py @@ -63,23 +63,42 @@ def test_uma_target(target_name, target_attrs, target_args): [ ("float_attr", 3.14), ("none_attr", None), + ("model", "my_model"), ], ) def test_invalid_attr_option(attr_name: str, target_attr: Union[str, int, bool, float, None]): + registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget") if target_attr is None: # None cannot be caught as TVMError, as it causes a SIGKILL, therefore it must be prevented to be # entered into relay.backend.contrib.uma.RegisterTarget at Python level. - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"Target attribute None is not supported."): uma_backend = VanillaAcceleratorBackend() uma_backend._target_attrs = {attr_name: target_attr} uma_backend.register() + elif "model" in attr_name: + target_name = f"{attr_name}_{target_attr}" + target_attr = {attr_name: target_attr} + with pytest.raises(tvm.TVMError, match=r"Attribute is already registered: .*"): + registration_func(target_name, target_attr) else: - registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget") target_name = f"{attr_name}_{target_attr}" target_attr = {attr_name: target_attr} - with pytest.raises(tvm.TVMError, match=r"Only String, Integer, or Bool are supported. .*"): + with pytest.raises(TypeError, match=r"Only String, Integer, or Bool are supported. 
.*"): registration_func(target_name, target_attr) +@pytest.mark.parametrize( + "target_name", + [ + "llvm", + "c", + ], +) +def test_target_duplication(target_name: str): + with pytest.raises(tvm.TVMError, match=r"TVM UMA Error: Target is already registered: .*"): + registration_func = tvm.get_global_func("relay.backend.contrib.uma.RegisterTarget") + registration_func(target_name, {}) + + if __name__ == "__main__": tvm.testing.main() From cc8ad6956fe45690b568613471befba167b0f3c8 Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 17 Dec 2022 03:25:47 +0900 Subject: [PATCH 072/286] [Arith] Allow const folding on fp16 involving one and zero (#13631) --- src/arith/const_fold.h | 8 ----- .../unittest/test_arith_canonical_simplify.py | 29 ++++++++++++------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h index 606bc28ddd22..22a91b91b946 100644 --- a/src/arith/const_fold.h +++ b/src/arith/const_fold.h @@ -142,8 +142,6 @@ inline Optional TryConstFold(PrimExpr a, PrimExpr b) { static_cast(fb->value))); } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value + fb->value); - } else { - return NullOpt; } } if (fa && fa->value == 0) return b; @@ -171,8 +169,6 @@ inline Optional TryConstFold(PrimExpr a, PrimExpr b) { static_cast(fb->value))); } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value - fb->value); - } else { - return NullOpt; } } if (fb && fb->value == 0) return a; @@ -202,8 +198,6 @@ inline Optional TryConstFold(PrimExpr a, PrimExpr b) { static_cast(fb->value))); } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value * fb->value); - } else { - return NullOpt; } } if (fa) { @@ -243,8 +237,6 @@ inline Optional TryConstFold(PrimExpr a, PrimExpr b) { static_cast(fb->value))); } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value / fb->value); - } else { - return NullOpt; } } if (fa && fa->value == 0) return a; diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py index 9f187685991e..9db3035fd944 100644 --- a/tests/python/unittest/test_arith_canonical_simplify.py +++ b/tests/python/unittest/test_arith_canonical_simplify.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +import tvm.testing from tvm import te @@ -124,6 +125,22 @@ def test_div_simplify(): ck.verify(fld(17 + 47 * x, 16), fld(x * 47 + 17, 16)) +def test_fp16_const_fold(): + ck = CanonicalChecker() + zero = tvm.tir.const(0, "float16") + one = tvm.tir.const(1, "float16") + half = tvm.tir.const(0.5, "float16") + + ck.verify(zero + half, half) + ck.verify(half - zero, half) + + ck.verify(zero * half, zero) + ck.verify(half * one, half) + + ck.verify(half / one, half) + ck.verify(zero / half, zero) + + def test_floormod_simplify(): ck = CanonicalChecker() flm = tvm.te.floormod @@ -356,14 +373,4 @@ def test_simplify_cast(): if __name__ == "__main__": - test_floormod_simplify() - test_mul_sum_simplify() - test_simplify_if_then_else() - test_div_simplify() - test_reduce_simplify() - test_reduce_combiner_simplify() - - test_split_index_simplify() - test_canonical_mixed() - test_complex_cases() - test_simplify_cast() + tvm.testing.main() From 23c509af02ec2f806c98c354886c05e7f4348c08 Mon Sep 17 00:00:00 2001 From: Janet Schneider <21978033+janetsc@users.noreply.github.com> Date: Fri, 16 Dec 2022 16:40:14 -0800 Subject: [PATCH 073/286] [Hexagon][runtime] Make HexagonThreadManager::CheckSemaphore thread safe (#13609) Protect CheckSemaphore with mutex. Ensure that only one thread can add a semaphore if it doesn't already exist. --- src/runtime/hexagon/hexagon_thread_manager.cc | 10 ++++++++-- src/runtime/hexagon/hexagon_thread_manager.h | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/runtime/hexagon/hexagon_thread_manager.cc b/src/runtime/hexagon/hexagon_thread_manager.cc index 3658611cf00d..4f8ddd156b9f 100644 --- a/src/runtime/hexagon/hexagon_thread_manager.cc +++ b/src/runtime/hexagon/hexagon_thread_manager.cc @@ -265,9 +265,15 @@ void HexagonThreadManager::WaitOnThreads() { } void HexagonThreadManager::CheckSemaphore(unsigned syncID) { + // We want the success case to be fast, so do not lock the mutex if (semaphores_.find(syncID) == semaphores_.end()) { - semaphores_[syncID] = reinterpret_cast(malloc(sizeof(qurt_sem_t))); - qurt_sem_init_val(semaphores_[syncID], 0); + // If we don't find it, lock the mutex, make sure it hasn't + // been added by another thread before creating it. + std::lock_guard lock(semaphores_mutex_); + if (semaphores_.find(syncID) == semaphores_.end()) { + semaphores_[syncID] = reinterpret_cast(malloc(sizeof(qurt_sem_t))); + qurt_sem_init_val(semaphores_[syncID], 0); + } } } diff --git a/src/runtime/hexagon/hexagon_thread_manager.h b/src/runtime/hexagon/hexagon_thread_manager.h index c911d1326a39..9bf6bb6efe64 100644 --- a/src/runtime/hexagon/hexagon_thread_manager.h +++ b/src/runtime/hexagon/hexagon_thread_manager.h @@ -213,6 +213,9 @@ class HexagonThreadManager { //! \brief Semaphores used by `Signal` and `Wait` mapped by ID. std::unordered_map semaphores_; + //! \brief Protects updates to semaphores_ + std::mutex semaphores_mutex_; + //! \brief Start semaphore created at time of construction; signled by `Start`. qurt_sem_t start_semaphore_; From 005a46313fcc85abc8e6d636a890cdb6e30e2a4d Mon Sep 17 00:00:00 2001 From: "Steven S. Lyubomirsky" Date: Fri, 16 Dec 2022 21:03:55 -0500 Subject: [PATCH 074/286] [Relay][Testing][Bugfix] `py_converter` should use correct AST for versions above 3.8 too (#13635) Currently, `relay.testing.py_converter` is checking for using _exactly_ Python 3.8 in order to use certain updated signatures in the `ast` library. However, those signatures are also correct for versions _above_ 3.8. 
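As a rough sketch of the version split involved (plain `ast` usage, no TVM
APIs; `ast.Module` only accepts the `type_ignores` field from Python 3.8
onward and keeps it in later versions):

```python
import ast
import sys

major, minor = sys.version_info.major, sys.version_info.minor
if major == 3 and minor >= 8:  # previously `minor == 8`, which broke on 3.9+
    module = ast.Module(body=[], type_ignores=[])
else:
    module = ast.Module(body=[])
```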
This PR changes the bounds checks so that the converter will work above 3.8. --- python/tvm/relay/testing/py_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index 50f473aea1f2..1ec85faea619 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -88,7 +88,7 @@ def convert(self, prog: Expr): body.append(Assign([Name(OUTPUT_VAR_NAME, Store())], prog_body)) global __MAJOR__, __MINOR__ - if __MAJOR__ == 3 and __MINOR__ == 8: + if __MAJOR__ == 3 and __MINOR__ >= 8: return ast.fix_missing_locations(ast.Module(body=body, type_ignores=[])) else: return ast.fix_missing_locations(ast.Module(body=body)) @@ -224,7 +224,7 @@ def create_def(self, func_name: str, arguments: [str], body): inner_args = [ast.arg(argument, None) for argument in arguments] global __MAJOR__, __MINOR__ - if __MAJOR__ == 3 and __MINOR__ == 8: + if __MAJOR__ == 3 and __MINOR__ >= 8: arguments = ast.arguments([], inner_args, None, [], [], None, []) else: arguments = ast.arguments(inner_args, None, [], [], None, []) From 2799a3e95ea810a35bbd29a487ecefed406ef0fb Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Fri, 16 Dec 2022 21:26:01 -0800 Subject: [PATCH 075/286] [Relay][Runtime] Add `set_input/output_zero_copy` in python (#13623) * add set_output and test for set_output_zero_copy in python * clean up * clean up test * test finished * remove set output * remove setoutput from header * use zero copy for params * fix typo * address comments * address comments * add second test for set_input params * add requires_torch * add requires torch * remove pytest * add error handling for c graph executor * better handling --- python/tvm/contrib/graph_executor.py | 58 ++++++++++++++++++- .../test_meta_schedule_relay_integration.py | 20 ++----- .../test_runtime_module_based_interface.py | 39 +++++++++++++ 3 files changed, 101 insertions(+), 16 deletions(-) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index 08dae307a89e..161ca5ffd08c 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -153,6 +153,21 @@ class GraphModule(object): def __init__(self, module): self.module = module self._set_input = module["set_input"] + + # TODO(shingjan): The graph_executor in C doesn't have + # set_input/output_zero_copy implemented. + try: + self._set_input_zero_copy = module["set_input_zero_copy"] + except AttributeError: + self._set_input_zero_copy = lambda *_: (_ for _ in ()).throw( + Exception("set_input_zero_copy is not implemented for C graph executor") + ) + try: + self._set_output_zero_copy = module["set_output_zero_copy"] + except AttributeError: + self._set_output_zero_copy = lambda *_: (_ for _ in ()).throw( + Exception("set_output_zero_copy is not implemented for C graph executor") + ) self._run = module["run"] self._get_output = module["get_output"] self._get_input = module["get_input"] @@ -172,7 +187,7 @@ def set_input(self, key=None, value=None, **params): The input key value : the input value. 
- The input key + The input value params : dict of str to NDArray Additional arguments @@ -195,6 +210,47 @@ def set_input(self, key=None, value=None, **params): if val: self._get_input(k).copyfrom(params[k]) + def set_input_zero_copy(self, key=None, value=None, **params): + """Set inputs to the module via kwargs with zero memory copy + + Parameters + ---------- + key : int or str + The input key + + value : the input value in DLPack + The input value + + params : dict of str to NDArray + Additional arguments + """ + if key is not None: + self._set_input_zero_copy(key, value) + + if params: + keys = list(params.keys()) + + for k in keys: + # TODO(zhiics) Skip the weights for submodule in a better way. + # We should use ConstLoaderModule for initialization and remove + # params from set_input + val = self._get_input(k) + if val: + self._set_input_zero_copy(k, params[k]) + + def set_output_zero_copy(self, key, value): + """Set outputs to the module with zero memory copy + + Parameters + ---------- + key : int or str + The output key + + value : the output value in DLPack + The output value + """ + self._set_output_zero_copy(key, value) + def run(self, **input_dict): """Run forward execution of the graph diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index 604f337099b0..76d6323f309a 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -54,16 +54,6 @@ def main(a: T.handle, b: T.handle) -> None: # type: ignore # pylint: enable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument -def _has_torch(): - import importlib.util # pylint: disable=unused-import,import-outside-toplevel - - spec = importlib.util.find_spec("torch") - return spec is not None - - -requires_torch = pytest.mark.skipif(not _has_torch(), reason="torch is not installed") - - def test_meta_schedule_dynamic_loop_extent(): a = relay.var("a", shape=(1, 8, 8, 512), dtype="float32") b = relay.nn.adaptive_avg_pool2d(a, (7, 7), "NHWC") @@ -72,7 +62,7 @@ def test_meta_schedule_dynamic_loop_extent(): assert not extracted_tasks -@requires_torch +@tvm.testing.requires_package("torch") def test_meta_schedule_integration_extract_from_resnet(): mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) extracted_tasks = ms.relay_integration.extract_tasks(mod, target="llvm", params=params) @@ -108,7 +98,7 @@ def test_meta_schedule_integration_extract_from_resnet(): assert t.task_name in expected_task_names, t.task_name -@requires_torch +@tvm.testing.requires_package("torch") def test_task_extraction_winograd_tensorcore(): mod, params, _ = get_network(name="resnet_50", input_shape=[16, 3, 224, 224]) seq = tvm.transform.Sequential( @@ -126,7 +116,7 @@ def test_task_extraction_winograd_tensorcore(): assert len([t for t in extracted_tasks if "winograd" in t.task_name]) == 4 -@requires_torch +@tvm.testing.requires_package("torch") def test_task_extraction_anchor_block(): mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) extracted_tasks = ms.relay_integration.extract_tasks( @@ -161,7 +151,7 @@ def test_task_extraction_anchor_block(): assert t.task_name in expected_task_names, t.task_name -@requires_torch +@tvm.testing.requires_package("torch") def test_meta_schedule_integration_extract_from_bert_base(): pytest.importorskip( "transformers", reason="transformers package is required to import 
bert_base"
     )
@@ -259,7 +249,7 @@ def test_meta_schedule_integration_extract_from_bert_base():
         assert expected_shape == shape, t.task_name
 
 
-@requires_torch
+@tvm.testing.requires_package("torch")
 def test_meta_schedule_integration_extract_from_resnet_with_filter_func():
     @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True)
     def filter_func(args, _) -> bool:
diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py
index c7ce5abfbd92..0ed097ddf563 100644
--- a/tests/python/unittest/test_runtime_module_based_interface.py
+++ b/tests/python/unittest/test_runtime_module_based_interface.py
@@ -688,6 +688,44 @@ def test_num_threads():
     assert reported == hardware_threads or reported == hardware_threads // 2
 
 
+@tvm.testing.requires_llvm
+@tvm.testing.requires_package("torch")
+def test_graph_module_zero_copy():
+    mod = tvm.IRModule()
+    params = {}
+    dev = tvm.cpu()
+    x = relay.var("x", shape=(1, 10))
+    y = relay.var("y", shape=(1, 10))
+    z = relay.add(x, y)
+    mod["main"] = relay.Function([x, y], z)
+
+    # need torch to do the from_dlpack trick
+    import torch
+
+    compiled_graph_lib = relay.build(mod, target="llvm", params=params)
+    gm = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
+    x_data = torch.rand((1, 10))
+    y_data = torch.rand((1, 10))
+    z_data = torch.rand((1, 10))
+    z_torch = x_data + y_data
+
+    # zero copy run
+    assert not np.allclose(z_data.numpy(), z_torch.numpy())
+    gm.set_input_zero_copy("x", tvm.nd.from_dlpack(x_data))
+    gm.set_input_zero_copy("y", tvm.nd.from_dlpack(y_data))
+    gm.set_output_zero_copy(0, tvm.nd.from_dlpack(z_data))
+    gm.run()
+
+    tvm.testing.assert_allclose(z_data.numpy(), z_torch.numpy())
+
+    # zero input copy with params
+    gm = graph_executor.GraphModule(compiled_graph_lib["default"](dev))
+    gm.set_input_zero_copy(x=tvm.nd.from_dlpack(x_data), y=tvm.nd.from_dlpack(y_data))
+    gm.run()
+
+    tvm.testing.assert_allclose(gm.get_output(0).numpy(), z_torch.numpy())
+
+
 if __name__ == "__main__":
     test_legacy_compatibility()
     test_cpu()
@@ -699,3 +737,4 @@ def test_num_threads():
     test_cpu_get_graph_json()
     test_cpu_get_graph_params_run()
     test_cpu_get_graph_params_compare()
+    test_graph_module_zero_copy()

From 8427852b9aa92cae98e0207d55ab46476c5e8f27 Mon Sep 17 00:00:00 2001
From: lightzhan <1126207509@qq.com>
Date: Sun, 18 Dec 2022 09:44:49 +0800
Subject: [PATCH 076/286] [BugFix][TVMScript] Parser crash (#13630)

This PR tries to fix a crash of the parser when the old value of a var is an
array but the new value is not. For example:

```python
from tvm.script import tir as T

def func_wrapper(shape, dtype):
    @T.prim_func
    def test_case():
        a = T.alloc_buffer(shape, dtype=dtype)

    return test_case

if __name__ == "__main__":
    a = np.zeros((10, 10), dtype="int8")
    print(func_wrapper((256, 256), dtype="int8").script())
```

In the above code, there are two assignments to the var 'a'. In the global
scope, its value is a numpy array, but it is a Buffer inside the prim
function. There is a table named 'name2value' that tracks the value of vars
like 'a'. When the parser wants to update a var's value, it compares the new
value against the old assignment, and this is where the problem arises: using
'==' to compare an array with a value yields an array as well, which cannot
be used directly as the condition of an if stmt. So the code above emits an
error:

```shell
error: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Use a.any() or a.all() --> /workspace/code_newest/tvm/private_test/test_meta_programming.py:16:9 | 16 | a = T.alloc_buffer(shape, dtype=dtype) | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``` This PR fixes this by change "==" to "is". Co-authored-by: lightzhan-intellif --- python/tvm/script/parser/core/parser.py | 8 ++++++-- .../python/unittest/test_tvmscript_regression.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/tvm/script/parser/core/parser.py b/python/tvm/script/parser/core/parser.py index c6d43f11cbf5..7c699c42aecb 100644 --- a/python/tvm/script/parser/core/parser.py +++ b/python/tvm/script/parser/core/parser.py @@ -19,6 +19,7 @@ from collections import defaultdict from contextlib import contextmanager from typing import Any, Callable, Dict, List, Optional, Set, Union +import numpy as np from tvm._ffi.base import TVMError from tvm.error import DiagnosticError @@ -150,8 +151,11 @@ def add(self, var: str, value: Any, allow_shadowing: bool = False): The options of whether variable shadowing allwed for this variable. """ # Skip if the key and value are equal to those in the var_table - if self.name2value[var] and self.name2value[var][-1] == value: - return + if self.name2value[var] and isinstance(self.name2value[var][-1], type(value)): + if isinstance(value, np.ndarray) and (self.name2value[var][-1] == value).all(): + return + elif self.name2value[var][-1] == value: + return if allow_shadowing and var in self.frames[-1].vars: # Shadowing self.name2value[var][-1] = value diff --git a/tests/python/unittest/test_tvmscript_regression.py b/tests/python/unittest/test_tvmscript_regression.py index 3ad8090893eb..05c1665ea2a1 100644 --- a/tests/python/unittest/test_tvmscript_regression.py +++ b/tests/python/unittest/test_tvmscript_regression.py @@ -45,5 +45,20 @@ def test_multi_element_array_in_outmost_namespace(): tvm.ir.assert_structural_equal(func, rt_func) +def test_different_dtype_assignment_to_var(): + @T.prim_func + def test_case(): + a = T.alloc_buffer((10, 10), dtype="int8") + + @T.prim_func + def func_ref(): + a = T.alloc_buffer([10, 10], dtype="int8") + T.evaluate(0) + + tvm.ir.assert_structural_equal(test_case, func_ref) + + if __name__ == "__main__": + a = numpy.zeros((10, 10), dtype="int8") test_multi_element_array_in_outmost_namespace() + test_different_dtype_assignment_to_var() From c798ed16950322b9790b70cd250db591fd525980 Mon Sep 17 00:00:00 2001 From: Siva Date: Mon, 19 Dec 2022 10:57:43 +0530 Subject: [PATCH 077/286] [TRANSFORM] Fix virtual device annotation issue with BYOC subgraphs (#13325) * [TRANSFORM] Fix virtual device anaotation issue with BYOC subgraphs Heterogeneous module partitioned by BYOC has functions nodes without any VirtualDevice definition (having FullyUnconstrained device). Ignoring the device here causes expr_virtual_devices_ being empty when PopVirtualDevice is called assuming above PushVirtualDevice is succeeded. PushVirtualDevice and PopVirtualDevice occurs as pairs across function body, hence it's better to insert the The Virtual Device for Uncontrained and Pop it subsequently. 
* Test case

Co-authored-by: Siva Rama Krishna Reddy B
---
 src/relay/transforms/device_aware_visitors.cc |  3 --
 .../python/contrib/test_clml/test_compiler.py | 42 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 tests/python/contrib/test_clml/test_compiler.py

diff --git a/src/relay/transforms/device_aware_visitors.cc b/src/relay/transforms/device_aware_visitors.cc
index e433e9a3cc88..f3ca1bfa3a9e 100644
--- a/src/relay/transforms/device_aware_visitors.cc
+++ b/src/relay/transforms/device_aware_visitors.cc
@@ -94,9 +94,6 @@ void LexicalOnDeviceMixin::ExitFunctionBody() {
 }
 
 void LexicalOnDeviceMixin::PushVirtualDevice(const VirtualDevice& virtual_device) {
-  if (virtual_device->IsFullyUnconstrained()) {
-    return;
-  }
   expr_virtual_devices_.emplace_back(virtual_device);
 }
 
diff --git a/tests/python/contrib/test_clml/test_compiler.py b/tests/python/contrib/test_clml/test_compiler.py
new file mode 100644
index 000000000000..973fbbd345f0
--- /dev/null
+++ b/tests/python/contrib/test_clml/test_compiler.py
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML compiler tests."""
+
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.relay.op.contrib import clml
+import pytest
+
+
+@tvm.testing.requires_openclml
+def test_device_annotation():
+    mod, params = relay.testing.mobilenet.get_workload(batch_size=1)
+    mod = clml.partition_for_clml(mod, params)
+    with tvm.transform.PassContext(opt_level=3):
+        relay.backend.te_compiler.get().clear()
+        lib = relay.build(
+            mod,
+            target="opencl -device=adreno",
+            target_host="llvm -mtriple=aarch64-linux-gnu",
+            params=params,
+        )
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 8798c9383895ec91ac95d6f75c0f2fd782acdcf4 Mon Sep 17 00:00:00 2001
From: lightzhan <1126207509@qq.com>
Date: Mon, 19 Dec 2022 13:31:04 +0800
Subject: [PATCH 078/286] [BugFix][TVMScript] Fix var capturing order error
 (#13640)

This PR tries to fix the following bug:

```python
def test_var_capturing_order():
    b = 2

    @T.prim_func
    def test_case():
        k: T.int32 = b

if __name__ == "__main__":
    b = 1
```

In the prim func `test_case`, the value of b should be 2, rather than 1. The
parser wrongly uses global vars to shadow the value of nonlocal vars, which
should be reversed. 
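As a plain-Python illustration of the intended precedence (a sketch of the
idea, not parser code): merging the captured scopes as
`{**globals, **nonlocals}` applies the nonlocals last, so they shadow the
globals, matching how Python itself resolves the closure:

```python
b = 1  # global binding


def make_closure():
    b = 2  # nonlocal binding, closer to the decorated function

    def captured():
        return b  # Python resolves this to 2, not the global 1

    return captured


globals_map = {"b": 1}
nonlocals_map = {"b": 2}
merged = {**globals_map, **nonlocals_map}  # nonlocals applied last, so they win
assert merged["b"] == make_closure()() == 2
```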
Co-authored-by: lightzhan-intellif --- python/tvm/script/parser/core/utils.py | 2 +- .../unittest/test_tvmscript_regression.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/python/tvm/script/parser/core/utils.py b/python/tvm/script/parser/core/utils.py index a304afddbe55..453ac18b382b 100644 --- a/python/tvm/script/parser/core/utils.py +++ b/python/tvm/script/parser/core/utils.py @@ -37,8 +37,8 @@ def inspect_function_capture(func: Callable) -> Dict[str, Any]: The function variables map with non-local or global variables. """ captured = { - **inspect.getclosurevars(func).nonlocals, **func.__globals__, # type: ignore + **inspect.getclosurevars(func).nonlocals, } return captured diff --git a/tests/python/unittest/test_tvmscript_regression.py b/tests/python/unittest/test_tvmscript_regression.py index 05c1665ea2a1..d063c0fcab7f 100644 --- a/tests/python/unittest/test_tvmscript_regression.py +++ b/tests/python/unittest/test_tvmscript_regression.py @@ -58,7 +58,24 @@ def func_ref(): tvm.ir.assert_structural_equal(test_case, func_ref) +def test_var_capturing_order(): + b = 2 + + @T.prim_func + def test_case(): + k: T.int32 = b + + @T.prim_func + def func_ref(): + k: T.int32 = 2 + T.evaluate(0) + + tvm.ir.assert_structural_equal(test_case, func_ref) + + if __name__ == "__main__": a = numpy.zeros((10, 10), dtype="int8") test_multi_element_array_in_outmost_namespace() test_different_dtype_assignment_to_var() + b = 1 + test_var_capturing_order() From 16677d4d4dbb3235a6c83b1a15aa590753b9d043 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 19 Dec 2022 06:09:49 -0800 Subject: [PATCH 079/286] [TVMScript] Remove obsolete modules (#13638) Removing some minor code path that is not used any longer. --- apps/microtvm/cmsisnn/requirements.txt | 3 - apps/microtvm/ethosu/requirements.txt | 3 - .../install/ubuntu_install_python_package.sh | 1 - docs/README.md | 2 +- docs/contribute/pull_request.rst | 2 +- .../how_to/work_with_microtvm/micro_ethosu.py | 1 - python/gen_requirements.py | 2 - python/tvm/script/parser_v1/__init__.py | 21 - python/tvm/script/parser_v1/_ffi_api.py | 20 - .../script/parser_v1/context_maintainer.py | 248 --- python/tvm/script/parser_v1/diagnostics.py | 55 - python/tvm/script/parser_v1/meta_unparser.py | 45 - python/tvm/script/parser_v1/parser.py | 1391 ----------------- python/tvm/script/parser_v1/registry.py | 62 - python/tvm/script/parser_v1/tir/__init__.py | 33 - python/tvm/script/parser_v1/tir/__init__.pyi | 475 ------ python/tvm/script/parser_v1/tir/intrin.py | 307 ---- python/tvm/script/parser_v1/tir/node.py | 218 --- python/tvm/script/parser_v1/tir/prim_func.py | 45 - .../tvm/script/parser_v1/tir/scope_handler.py | 793 ---------- .../tvm/script/parser_v1/tir/special_stmt.py | 927 ----------- python/tvm/script/parser_v1/tir/ty.py | 226 --- python/tvm/script/parser_v1/utils.py | 105 -- src/tir/schedule/error.h | 2 +- tests/python/unittest/test_tvmscript_spans.py | 73 - tests/scripts/ci.py | 1 - 26 files changed, 3 insertions(+), 5058 deletions(-) delete mode 100644 python/tvm/script/parser_v1/__init__.py delete mode 100644 python/tvm/script/parser_v1/_ffi_api.py delete mode 100644 python/tvm/script/parser_v1/context_maintainer.py delete mode 100644 python/tvm/script/parser_v1/diagnostics.py delete mode 100644 python/tvm/script/parser_v1/meta_unparser.py delete mode 100644 python/tvm/script/parser_v1/parser.py delete mode 100644 python/tvm/script/parser_v1/registry.py delete mode 100644 python/tvm/script/parser_v1/tir/__init__.py delete mode 
100644 python/tvm/script/parser_v1/tir/__init__.pyi delete mode 100644 python/tvm/script/parser_v1/tir/intrin.py delete mode 100644 python/tvm/script/parser_v1/tir/node.py delete mode 100644 python/tvm/script/parser_v1/tir/prim_func.py delete mode 100644 python/tvm/script/parser_v1/tir/scope_handler.py delete mode 100644 python/tvm/script/parser_v1/tir/special_stmt.py delete mode 100644 python/tvm/script/parser_v1/tir/ty.py delete mode 100644 python/tvm/script/parser_v1/utils.py delete mode 100644 tests/python/unittest/test_tvmscript_spans.py diff --git a/apps/microtvm/cmsisnn/requirements.txt b/apps/microtvm/cmsisnn/requirements.txt index 72ae166963ee..1c99bd49a92e 100644 --- a/apps/microtvm/cmsisnn/requirements.txt +++ b/apps/microtvm/cmsisnn/requirements.txt @@ -216,9 +216,6 @@ scipy==1.5.4 \ --hash=sha256:ed572470af2438b526ea574ff8f05e7f39b44ac37f712105e57fc4d53a6fb660 \ --hash=sha256:f87b39f4d69cf7d7529d7b1098cb712033b17ea7714aed831b95628f483fd012 \ --hash=sha256:fa789583fc94a7689b45834453fec095245c7e69c58561dc159b5d5277057e4c -synr==0.6.0 \ - --hash=sha256:0b4e16b10c3988e1981e3372153a31956f74d86752eaaa55e8c4e7b7fe591e4e \ - --hash=sha256:9399b27d9f21c5d439eae92e0159d6f521cc396d27149ac45473012a205a3c30 tflite==2.10.0 \ --hash=sha256:6818a5d7776958b803944ba0a1f4c4395559606d9e795d67ac467a8a3904757d \ --hash=sha256:89cb9f57df0f5345f8fad1381e0fae6180ded687113eb552cfbb60a05edc002c diff --git a/apps/microtvm/ethosu/requirements.txt b/apps/microtvm/ethosu/requirements.txt index d9593a8184e9..d8a7fa7bd901 100644 --- a/apps/microtvm/ethosu/requirements.txt +++ b/apps/microtvm/ethosu/requirements.txt @@ -216,9 +216,6 @@ scipy==1.5.4 \ --hash=sha256:ed572470af2438b526ea574ff8f05e7f39b44ac37f712105e57fc4d53a6fb660 \ --hash=sha256:f87b39f4d69cf7d7529d7b1098cb712033b17ea7714aed831b95628f483fd012 \ --hash=sha256:fa789583fc94a7689b45834453fec095245c7e69c58561dc159b5d5277057e4c -synr==0.6.0 \ - --hash=sha256:0b4e16b10c3988e1981e3372153a31956f74d86752eaaa55e8c4e7b7fe591e4e \ - --hash=sha256:9399b27d9f21c5d439eae92e0159d6f521cc396d27149ac45473012a205a3c30 tflite==2.4.0 \ --hash=sha256:0510db1b48a3eec86bf9bb8d2749cd9d6d26d6a4fb329fd141bde5b4404932d1 \ --hash=sha256:0796f6ce6eb2aef4a318f5509e5fb0ce808e29cd3094801b4abbb1d8575a28cd diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 757ad0228c5d..93abac52beaa 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -41,7 +41,6 @@ pip3 install --upgrade \ requests \ scipy \ Jinja2 \ - synr==0.6.0 \ junitparser==2.4.2 \ six \ tornado \ diff --git a/docs/README.md b/docs/README.md index b6ca8e06f3f2..6c32d2d6bfed 100644 --- a/docs/README.md +++ b/docs/README.md @@ -48,7 +48,7 @@ This folder contains the source of TVM's documentation, hosted at https://tvm.ap ```bash # Pillow on Ubuntu may require libjpeg-dev from apt ./docker/bash.sh ci_gpu -c \ - 'python3 -m pip install --quiet tlcpack-sphinx-addon==0.2.1 synr==0.5.0 && python3 -m pip freeze' > frozen-requirements.txt + 'python3 -m pip install --quiet tlcpack-sphinx-addon==0.2.1 && python3 -m pip freeze' > frozen-requirements.txt pip install -r frozen-requirements.txt ``` diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 7b5509be0aa9..60faff307457 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -254,7 +254,7 @@ Necessary dependencies: .. 
code:: bash - pip install --user pytest Cython synr + pip install --user pytest Cython If you want to run all tests: diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py index 386c658ea818..e80860dc0ce6 100644 --- a/gallery/how_to/work_with_microtvm/micro_ethosu.py +++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py @@ -95,7 +95,6 @@ # Pillow==8.3.2 # psutil==5.8.0 # scipy==1.5.4 -# synr==0.6 # tflite==2.4.0 # tornado==6.1 # diff --git a/python/gen_requirements.py b/python/gen_requirements.py index 9778937ae80b..b8c72a8f2744 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -70,7 +70,6 @@ "numpy", "psutil", "scipy", - "synr", "tornado", ], ), @@ -270,7 +269,6 @@ ("sphinx_autodoc_annotation", None), ("sphinx_gallery", None), ("sphinx_rtd_theme", None), - ("synr", "==0.6.0"), ("tensorflow", None), ("tensorflow-estimator", None), ("tflite", None), diff --git a/python/tvm/script/parser_v1/__init__.py b/python/tvm/script/parser_v1/__init__.py deleted file mode 100644 index 555659d0c55e..000000000000 --- a/python/tvm/script/parser_v1/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script APIs of TVM Python Package, aimed to support TIR""" - -from . import tir - -from .parser import ir_module, from_source diff --git a/python/tvm/script/parser_v1/_ffi_api.py b/python/tvm/script/parser_v1/_ffi_api.py deleted file mode 100644 index 926d17b1667e..000000000000 --- a/python/tvm/script/parser_v1/_ffi_api.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""FFI APIs for tvm.script""" -import tvm._ffi - -tvm._ffi._init_api("script", __name__) diff --git a/python/tvm/script/parser_v1/context_maintainer.py b/python/tvm/script/parser_v1/context_maintainer.py deleted file mode 100644 index b84b7d398084..000000000000 --- a/python/tvm/script/parser_v1/context_maintainer.py +++ /dev/null @@ -1,248 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Context Maintainer for TIR""" - -from typing import List, Mapping, Union, Optional, Dict, Callable -import synr - - -import tvm -from tvm.ir import Span -from tvm.ir.expr import Range -from tvm.tir import Var, Buffer, PrimExpr, Stmt, MatchBufferRegion -from tvm.runtime import Object -from tvm.tir.expr import IterVar -from .tir.node import BufferSlice - - -class BlockInfo: - """Information for block and block_realize signature - - Examples - ---------- - .. code-block:: python - - @T.prim_func - def example_func(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), "float32") - B = T.match_buffer(b, (16, 16), "float32") - C = T.match_buffer(a, (16, 16), "float32") - - for i, j, k in T.grid(16, 16, 16): - with T.block("matmul"): - vi = T.axis.S(16, i) - vj = T.axis.S(16, j) - vk = T.axis.R(16, k) # iter_bindings = {vj: i, vj: j, vk: k} - - T.where(True) # predicate of the block_realize - - T.reads(A[0:16, 0:16], B[0: 16, 0: 16]) # reads region of the block - T.writes(C[0: 16, 0: 16]) # writes region of the block - T.block_attr({"attr_key": "attr_value"}) # block annotations - - # alloc_buffers inside the block - CC = T.alloc_buffer((1, 1), dtype="float32") - - # match_buffers of the block, - # which bind a sub-region of source buffer into a new buffer - D = T.match_buffer(C[vi, vj], ()) - - # init part of the block, executed when all reduce axes are the beginning value - with T.init(): - C[vi, vj] = T.float32(0) - - # block body - CC[0, 0] = A[vi, vk] * B[vj, vk] - D[()] += CC[0, 0] # The same as C[vi, vj] += CC[0, 0] - """ - - alloc_buffers: List[Buffer] = [] - """List[Buffer]: list of T.alloc_buffer statements in the block signature""" - match_buffers: List[MatchBufferRegion] = [] - """List[MatchBufferRegion]: list of T.match_buffer statements in the block signature""" - iter_values: List[PrimExpr] = [] - """List[PrimExpr]: list of binding values for iter vars""" - iter_vars: List[IterVar] = [] - """List[PrimExpr]: list of iter vars in the block""" - reads: Optional[List[BufferSlice]] = None - """Optional[List[BufferSlice]]: - list of T.reads statements in the block signature, None for not-visited""" - writes: Optional[List[BufferSlice]] = None - """Optional[List[BufferSlice]]: - list of T.writes statements in the block signature, None for not-visited""" - annotations: Optional[Mapping[str, Object]] = None - 
"""Optional[Mapping[str, Object]]: - list of T.block_attr statements in the block signature, None for not-visited""" - predicate: Optional[PrimExpr] = None - """Optional[PrimExpr]: block realize predicate, None for not-visited""" - init: Optional[Stmt] = None - """Optional[Stmt]: init part of the block, None for not-visited""" - - def __init__(self): - self.alloc_buffers = [] - self.match_buffers = [] - self.iter_values = [] - self.iter_vars = [] - self.reads = None - self.writes = None - self.annotations = None - self.predicate = None - self.init = None - - -class ContextMaintainer: - """Maintain all the necessary context info - Parameters - ---------- - _report_error : Callable[[str, Union[Span, synr.ast.Span]], None] - The report error function handle - """ - - # scope context - node_stack: List[List[synr.ast.Node]] = [] - """List[List[synr.ast.Node]]: The ast nodes insides the current scope""" - block_info_stack: List[BlockInfo] = [] - """List[BlockInfo]: The block info for the current block scope""" - loop_stack: Dict[Var, Range] = {} - """Dict[Var, Range]: The dict from loop var to its domain outside the block""" - symbols: List[Dict[str, Union[Var, Buffer]]] = [] - """List[Dict[str, Union[Var, Buffer]]]: Symbol map from name to object for the current scope""" - closure_vars: Dict[str, Object] = {} - """ClosureVars: The closure vars defined in Python interpreter""" - - # function context - func_params: List[Var] = [] - """List[Var]: The function parameters""" - func_buffer_map: Mapping[Var, Buffer] = {} - """Mapping[Var, Buffer]: The function buffer map""" - func_dict_attr: Mapping[str, Object] = {} - """Mapping[str, Object]: The function attrs""" - func_var_env_dict: Mapping[Var, str] = {} - """Mapping[Var, str]: The map from var to env thread""" - - # parser and analyzer - analyzer: tvm.arith.Analyzer = tvm.arith.Analyzer() - """tvm.arith.Analyzer: The analyzer for simplifying""" - _report_error: Callable[[str, Union[Span, synr.ast.Span]], None] - """Callable[[str, Union[Span, synr.ast.Span]], None]: The report error function handle""" - - # root alloc_buffer - root_alloc_buffers: List[Buffer] = [] - """List[Buffer]: The buffers allocated under root block""" - - def __init__( - self, - _report_error: Callable[[str, Union[Span, synr.ast.Span]], None], - closure_vars: Dict[str, Object], - ): - # scope context - self.node_stack = [] - self.block_info_stack = [] - self.loop_stack = {} - self.symbols = [] - self.closure_vars = closure_vars - # function context - self.func_params = [] - self.func_buffer_map = {} - self.func_dict_attr = {} - self.func_var_env_dict = {} - # parser and analyzer - self._report_error = _report_error - self.analyzer = tvm.arith.Analyzer() - # root alloc_buffer - self.root_alloc_buffers = [] - - def enter_scope(self, nodes: Optional[List[synr.ast.Node]] = None): - """Creates a new scope - - Note - ---- - This function is used for normal scopes that do not involve - a `with block` scope. Use `enter_block_scope` - for block scope cases. - - Parameters - ---------- - nodes : Optional[List[synr.ast.Node]] - The synr AST nodes in new scope - """ - if nodes is None: - nodes = [] - self.node_stack.append(list(reversed(nodes))) - self.symbols.append(dict()) - - def enter_block_scope(self, nodes: Optional[List[synr.ast.Node]] = None): - """Creates a new block scope, the function will call `enter_scope` implicitly - Besides the behaviors of `enter_scope`, it will update loop_stack and block_info_stack - to maintain block info. 
- - Note - ---- - This function should be used to handle a block scope, - aka the blocks that involve a `with block` scope. - - Parameters - ---------- - nodes : Optional[List[synr.ast.Node]] - The synr AST nodes in new scope - """ - self.enter_scope(nodes) - # Create a new BlockInfo for the new block - self.block_info_stack.append(BlockInfo()) - - def exit_scope(self): - """Pop the inner most scope""" - self.symbols.pop() - self.node_stack.pop() - - def exit_block_scope(self): - """Pop the inner most block scope, the function will call `exit_scope` implicitly""" - self.exit_scope() - # Pop block_info - self.block_info_stack.pop() - - def update_symbol(self, name: str, symbol: Union[Buffer, Var], node: synr.ast.Node): - """Append a symbol into current scope""" - if isinstance(symbol, Buffer): - if name in self.symbols[0]: - self.report_error("Duplicate Buffer name: " + symbol.name, node.span) - self.symbols[0][name] = symbol - else: - self.symbols[-1][name] = symbol - - def remove_symbol(self, name: str): - """Remove a symbol""" - for symbols in reversed(self.symbols): - if name in symbols: - symbols.pop(name) - return - raise RuntimeError("Internal error of tvm script parser: no symbol named " + name) - - def lookup_symbol(self, name: str) -> Optional[Union[Buffer, Var]]: - """Look up symbol by name""" - for symbols in reversed(self.symbols): - if name in symbols: - return symbols[name] - return self.closure_vars.get(name) - - def report_error(self, message: str, span: Union[Span, synr.ast.Span]): - self._report_error(message, span) - - def current_block_scope(self) -> BlockInfo: - if self.block_info_stack: - return self.block_info_stack[-1] - return None diff --git a/python/tvm/script/parser_v1/diagnostics.py b/python/tvm/script/parser_v1/diagnostics.py deleted file mode 100644 index e676461ab39e..000000000000 --- a/python/tvm/script/parser_v1/diagnostics.py +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
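Editor's note: at its core, the ContextMaintainer removed here is a stack of symbol tables with innermost-first lookup. A self-contained sketch of that pattern (names hypothetical):

.. code-block:: python

    from typing import Any, Dict, List, Optional

    class ScopeStack:
        """Innermost-first symbol resolution, as in lookup_symbol above."""

        def __init__(self) -> None:
            self.symbols: List[Dict[str, Any]] = []

        def enter_scope(self) -> None:
            self.symbols.append({})            # innermost scope is last

        def exit_scope(self) -> None:
            self.symbols.pop()

        def update(self, name: str, value: Any) -> None:
            self.symbols[-1][name] = value     # bind in the current scope

        def lookup(self, name: str) -> Optional[Any]:
            for scope in reversed(self.symbols):
                if name in scope:
                    return scope[name]
            return None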
-"""Bridge from synr's (the library used for parsing the python AST) - DiagnosticContext to TVM's diagnostics -""" -from synr import DiagnosticContext, ast - -import tvm -from tvm.ir.diagnostics import DiagnosticContext as TVMCtx -from tvm.ir.diagnostics import get_renderer, DiagnosticLevel, Diagnostic - - -class TVMDiagnosticCtx(DiagnosticContext): - """TVM diagnostics for synr""" - - diag_ctx: TVMCtx - - def __init__(self) -> None: - self.diag_ctx = TVMCtx(tvm.IRModule(), get_renderer()) - self.source_name = None - - def to_tvm_span(self, src_name, ast_span: ast.Span) -> tvm.ir.Span: - return tvm.ir.Span( - src_name, - ast_span.start_line, - ast_span.end_line, - ast_span.start_column, - ast_span.end_column, - ) - - def add_source(self, name: str, source: str) -> None: - src_name = self.diag_ctx.module.source_map.add(name, source) - self.source_name = src_name - - def emit(self, _level, message, span): - span = self.to_tvm_span(self.source_name, span) - self.diag_ctx.emit(Diagnostic(DiagnosticLevel.ERROR, span, message)) - self.diag_ctx.render() # Raise exception on the first error we hit. TODO remove - - def render(self): - self.diag_ctx.render() diff --git a/python/tvm/script/parser_v1/meta_unparser.py b/python/tvm/script/parser_v1/meta_unparser.py deleted file mode 100644 index b1472ccdc758..000000000000 --- a/python/tvm/script/parser_v1/meta_unparser.py +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Unparse meta AST node into a dict""" -# pylint: disable=invalid-name - -from synr import Transformer - - -class MetaUnparser(Transformer): - """Python AST Visitor to unparse meta AST node into a dict""" - - def transform(self, node): - method = "transform_" + node.__class__.__name__ - visitor = getattr(self, method, None) - if visitor is None: - self.error(f"Unexpected node type {type(node)} when parsing __tvm_meta__", node.span) - return visitor(node) - - def transform_DictLiteral(self, node): - keys = [self.visit(key) for key in node.keys] - values = [self.visit(value) for value in node.values] - return dict(zip(keys, values)) - - def transform_Tuple(self, node): - return tuple(self.visit(element) for element in node.elts) - - def transform_ArrayLiteral(self, node): - return [self.visit(element) for element in node.elts] - - def transform_Constant(self, node): - return node.value diff --git a/python/tvm/script/parser_v1/parser.py b/python/tvm/script/parser_v1/parser.py deleted file mode 100644 index ce8c1fe161a3..000000000000 --- a/python/tvm/script/parser_v1/parser.py +++ /dev/null @@ -1,1391 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Parser For TIR - -We use [synr](https://synr.readthedocs.io) to get an AST that is stable over -different python versions. Synr also provides an error handling context that we -use for error reporting. -""" -# pylint: disable=invalid-name, inconsistent-return-statements, no-else-return, broad-except -import types -import json -import operator -import inspect -from typing import Any, Callable, Dict, List, Optional, Union -from synr import ast, Transformer, to_ast - -import tvm -from tvm import IRModule -from tvm._ffi.base import TVMError -from tvm.ir import GlobalVar -from tvm.ir.function import BaseFunc -from tvm.tir import buffer -from tvm.tir.function import PrimFunc -from . import _ffi_api -from . import tir - -from .context_maintainer import ContextMaintainer -from .meta_unparser import MetaUnparser -from .registry import Registry -from .diagnostics import TVMDiagnosticCtx -from .utils import tvm_span_from_synr, synr_span_from_tvm, call_with_error_reporting - -from .tir.intrin import Intrin -from .tir.node import Slice, BufferSlice -from .tir.scope_handler import ScopeHandler, WithScopeHandler, ForScopeHandler -from .tir.special_stmt import SpecialStmt -from .tir import ty - - -class CallArgumentReader(object): - """Helper class to read required arguments from passed arguments. - - When parsing a function call, we need to match the arguments provided in - the AST to the required arguments of the function. This class makes sure - all the positional arguments are filled and also fill keyword arguments - with thier default value if a different value was not provided. - """ - - def __init__(self, func_name, args, kwargs, parser, node): - self.func_name = func_name - self.args = args - self.kwargs = kwargs - self.parser = parser - self.node = node - - def get_pos_only_arg(self, pos, name): - """Get corresponding position only function argument from argument list""" - if len(self.args) >= pos: - arg = self.args[pos - 1] - elif name not in self.kwargs: - # If no positional argument was found in the AST, we see if it was - # defined by name instead. - # TODO(tkonolige): this error message is not quite correct. The - # number of required arguments is >= pos - self.parser.report_error( - f"{self.func_name} requires {pos} arguments, but only {len(self.args)} were given.", - self.node.span, - ) - else: - arg = self.kwargs[name] - - return arg - - def get_kwarg(self, pos, name, default): - """Get corresponding keyword function argument from argument list. - - If the user hasn't provided the argument, set it to the default value. 
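Editor's note: CallArgumentReader's matching rule, in brief: a positional slot wins, then the keyword, then the declared default. A condensed standalone sketch (hypothetical helper, for illustration only):

.. code-block:: python

    def read_arg(args, kwargs, pos, name, default=None, required=False):
        """1-based positional slot wins; fall back to keyword, then default."""
        if len(args) >= pos:
            return args[pos - 1]
        if name in kwargs:
            return kwargs[name]
        if required:
            raise TypeError(f"missing required argument {name!r} (pos {pos})")
        return default

    assert read_arg((1, 2), {}, 2, "y") == 2
    assert read_arg((1,), {"y": 5}, 2, "y") == 5
    assert read_arg((1,), {}, 2, "y", default=0) == 0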
- """ - if len(self.args) >= pos: - arg = self.args[pos - 1] - elif name in self.kwargs: - arg = self.kwargs[name] - else: - return default - - return arg - - def get_varargs(self, pos): - """Get corresponding variable argument from argument list""" - if len(self.args) >= pos and len(self.kwargs) == 0: - return self.args[pos - 1 :] - return [] - - -class TVMScriptParser(Transformer): - """Synr AST visitor pass which finally lowers to TIR. - - Notes for Extension - ------------------- - 1. To support a new type of AST node, add a function transform_xxx(). - 2. To support new functions, add the function to the appropriate registry: - We divide allowed function calls in TVM script into 3 categories, - intrin, scope_handler and special_stmt. - 1. intrin functions are low level functions like mod, load, and - constants. They correspond to a tir `IRNode`. They must have a - return value. The user can register intrin functions for the parser to - use. - 2. scope_handler functions have no return value. They take two - arguments: the parser and the AST node. scope_handler functions are - used in with and for statements. - 3. special_stmt functions handle cases that do not have a corresponding - tir `IRNode`. These functions take the parser and the AST node as - arguments and may return a value. - When visiting a Call node, we check the special_stmt registry first. If - no registered function is found, we then check the intrin registry. - When visiting With node, we check the with_scope registry. - When visiting For node, we check the for_scope registry. - """ - - _binop_maker = { - ast.BuiltinOp.Add: tvm.tir.Add, - ast.BuiltinOp.Sub: tvm.tir.Sub, - ast.BuiltinOp.Mul: tvm.tir.Mul, - ast.BuiltinOp.Div: tvm.tir.Div, - ast.BuiltinOp.FloorDiv: tvm.tir.FloorDiv, - ast.BuiltinOp.Mod: tvm.tir.FloorMod, - ast.BuiltinOp.BitOr: lambda lhs, rhs, span: operator.or_(lhs, rhs), - ast.BuiltinOp.BitAnd: lambda lhs, rhs, span: operator.and_(lhs, rhs), - ast.BuiltinOp.BitXor: lambda lhs, rhs, span: operator.xor(lhs, rhs), - ast.BuiltinOp.GT: tvm.tir.GT, - ast.BuiltinOp.GE: tvm.tir.GE, - ast.BuiltinOp.LT: tvm.tir.LT, - ast.BuiltinOp.LE: tvm.tir.LE, - ast.BuiltinOp.Eq: tvm.tir.EQ, - ast.BuiltinOp.NotEq: tvm.tir.NE, - ast.BuiltinOp.And: tvm.tir.And, - ast.BuiltinOp.Or: tvm.tir.Or, - } - - _unaryop_maker = { - ast.BuiltinOp.USub: lambda rhs, span: operator.neg(rhs), - ast.BuiltinOp.Invert: lambda rhs, span: operator.invert(rhs), - ast.BuiltinOp.Not: tvm.tir.Not, - } - - # pylint gets confused here with synr.Transformer which doesn't have a - # custom init, so just disable it - def __init__( - self, base_lineno, tir_namespace, closure_vars - ): # pylint: disable=super-init-not-called - self.context = None - - self.base_lineno = base_lineno - self.current_lineno = 0 - self.current_col_offset = 0 - self.tir_namespace = tir_namespace - self.closure_vars = closure_vars - self.meta = None - self._inside_buffer_sugar = False - - def init_function_parsing_env(self): - """Initialize function parsing environment""" - self.context = ContextMaintainer(self.report_error, self.closure_vars) # scope emitter - - def init_meta(self, meta_dict): - if meta_dict is not None: - self.meta = tvm.ir.load_json(json.dumps(meta_dict)) - - def transform(self, node): - """Generic transformation for visiting the AST. 
Dispatches to - `transform_ClassName` for the appropriate ClassName.""" - old_lineno, old_col_offset = self.current_lineno, self.current_col_offset - - if hasattr(node, "lineno"): - self.current_lineno = self.base_lineno + node.lineno - 1 - if hasattr(node, "col_offset"): - self.current_col_offset = node.col_offset - - method = "transform_" + node.__class__.__name__ - visitor = getattr(self, method, self.generic_visit) - transform_res = visitor(node) - - self.current_lineno, self.current_col_offset = old_lineno, old_col_offset - - return transform_res - - def match_tir_namespace(self, identifier: str) -> bool: - """Check if the namespace is equal to tvm.script.tir""" - return identifier in self.tir_namespace - - def report_error(self, message: str, span: Union[ast.Span, tvm.ir.Span]): - """Report an error occuring at a location. - - This just dispatches to synr's DiagnosticContext. - - Parameters - ---------- - message : str - Error message - span : Union[synr.ast.Span, tvm.ir.Span] - Location of the error - """ - if isinstance(span, tvm.ir.Span): - span = synr_span_from_tvm(span) - self.error(message, span) - - def parse_body(self, parent): - """Parse remaining statements in this scope. - - Parameters - ---------- - parent : synr.ast.Node - Parent node of this scope. Errors will be reported here. - """ - body = [] - spans = [] - stmt = parent - while len(self.context.node_stack[-1]) > 0: - stmt = self.context.node_stack[-1].pop() - spans.append(stmt.span) - res = self.transform(stmt) - if res is not None: - body.append(res) - if len(body) == 0: - self.report_error( - "Expected another statement at the end of this block. Perhaps you " - "used a concise statement and forgot to include a body afterwards.", - stmt.span, - ) - else: - return ( - tvm.tir.SeqStmt(body, tvm_span_from_synr(ast.Span.union(spans))) - if len(body) > 1 - else body[0] - ) - - def parse_arg_list(self, func, node_call): - """Match the arguments of a function call in the AST to the required - arguments of the function. This handles positional arguments, - positional arguments specified by name, keyword arguments, and varargs. - - Parameters - ---------- - func : Function - The function that provides the signature - - node_call: Union[ast.Call, ast.TypeApply, ast.TypeCall] - The AST call node that calls into the function. - - Returns - ------- - arg_list : list - The parsed positional argument. - """ - assert isinstance(node_call, (ast.Call, ast.TypeApply, ast.TypeCall)) - # collect arguments - args = [self.transform(arg) for arg in node_call.params] - if isinstance(node_call, ast.TypeApply): - kw_args = {} # TypeApply (e.g. 
foo[bar]) doesn't have kwargs defined in synr - else: - kw_args = { - self.transform(k): self.transform(v) for k, v in node_call.keyword_params.items() - } - # get the name and parameter list of func - if isinstance(func, (Intrin, ScopeHandler, SpecialStmt)): - func_name, param_list = func.signature() - else: - self.report_error( - "Internal Error: function must be of type Intrin, ScopeHandler or SpecialStmt, " - f"but it is {type(func).__name__}", - node_call.span, - ) - # check arguments and parameter list and get a list of arguments - reader = CallArgumentReader(func_name, args, kw_args, self, node_call) - pos_only, kwargs, varargs = param_list - internal_args = list() - - for i, arg_name in enumerate(pos_only): - internal_args.append(reader.get_pos_only_arg(i + 1, arg_name)) - for i, arg_info in enumerate(kwargs): - arg_name, default = arg_info - internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default)) - if varargs is not None: - internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) - elif len(args) + len(kw_args) > len(pos_only) + len(kwargs): - self.report_error( - "Arguments mismatched. " - + f"Expected {len(pos_only) + len(kwargs)} args but got " - + f"{len(args) + len(kw_args)}", - node_call.span, - ) - return internal_args - - def parse_type(self, type_node, parent): - """Parse a type annotation. - - We require the parent object to the type so that we have a place to - report the error message if the type does not exist. - """ - if type_node is None: - self.report_error("A type annotation is required", parent.span) - res_type = self.transform(type_node) - return tvm.ir.TupleType([]) if res_type is None else res_type.evaluate() - - def generic_visit(self, node): - """Fallback visitor if node type is not handled. Reports an error.""" - - self.report_error(type(node).__name__ + " AST node is not supported", node.span) - - def transform_Module(self, node): - """Module visitor - - Right now, we only support two formats for TVM Script. - - Example - ------- - 1. Generate a PrimFunc (If the code is printed, then it may also contain metadata) - .. code-block:: python - - import tvm - - @tvm.script - def A(...): - ... - - # returns a PrimFunc - func = A - - 2. Generate an IRModule - .. code-block:: python - - import tvm - - @tvm.script.ir_module - class MyMod(): - @T.prim_func - def A(...): - ... - @T.prim_func - def B(...): - ... - - __tvm_meta__ = ... - - # returns an IRModule - mod = MyMod - """ - if len(node.funcs) == 1: - return self.transform(next(iter(node.funcs.values()))) - elif len(node.funcs) == 0: - self.report_error( - "You must supply at least one class or function definition", node.span - ) - else: - self.report_error( - "Only one-function, one-class or function-with-meta source code is allowed", - ast.Span.union([x.span for x in list(node.funcs.values())[1:]]), - ) - - def transform_Class(self, node): - """Class definition visitor. - - A class can have multiple function definitions and a single - :code:`__tvm_meta__` statement. Each class corresponds to a single - :code:`IRModule`. - - Example - ------- - .. 
code-block:: python - - @tvm.script.ir_module - class MyClass: - __tvm_meta__ = {} - def A(): - T.evaluate(0) - """ - if len(node.assignments) == 1: - if not ( - len(node.assignments[0].lhs) == 1 - and isinstance(node.assignments[0].lhs[0], ast.Var) - and node.assignments[0].lhs[0].id.name == "__tvm_meta__" - ): - self.report_error( - "The only top level assignments allowed are `__tvm_meta__ = ...`", - node.assignments[0].span, - ) - self.init_meta( - MetaUnparser().do_transform(node.assignments[0].rhs, self._diagnostic_context) - ) - elif len(node.assignments) > 1: - self.report_error( - "Only a single top level `__tvm_meta__` is allowed", - ast.Span.union([x.span for x in node.assignments[1:]]), - ) - - return IRModule( - {GlobalVar(name): self.transform(func) for name, func in node.funcs.items()} - ) - - def transform_Function(self, node): - """Function definition visitor. - - Each function definition is translated to a single :code:`PrimFunc`. - - There are a couple restrictions on TVM Script functions: - 1. Function arguments must have their types specified. - 2. The body of the function can contain :code:`func_attr` to specify - attributes of the function (like it's name). - 3. The body of the function can also contain multiple :code:`buffer_bind`s, - which give shape and dtype information to arguments. - 4. Return statements are implicit. - - Example - ------- - .. code-block:: python - - @T.prim_func - def my_function(x: T.handle): # 1. Argument types - T.func_attr({"global_symbol": "mmult"}) # 2. Function attributes - X_1 = tir.buffer_bind(x, [1024, 1024]) # 3. Buffer binding - T.evaluate(0) # 4. This function returns 0 - """ - - def check_as_torch_decorator(decorator: Union[ast.Call, ast.Var]): - if isinstance(decorator, ast.Call): - if len(decorator.params) != 1: - return False - func_name = decorator.func_name - else: - func_name = decorator - if isinstance(func_name, ast.Var): - return func_name.id.name == "as_torch" - - def check_decorator(decorators: List[ast.Expr]) -> bool: - """Check the decorator is `T.prim_func""" - if len(decorators) > 2 or len(decorators) == 0: - return False - if len(decorators) == 2 and not check_as_torch_decorator(decorators[0]): - return False - d: ast.Expr = decorators[-1] - return ( - isinstance(d, ast.Attr) - and isinstance(d.object, ast.Var) - and self.match_tir_namespace(d.object.id.name) - and d.field.name == "prim_func" - ) - - self.init_function_parsing_env() - self.context.enter_scope(nodes=node.body.stmts) - - # add parameters of function - for arg in node.params: - # Note that this case is for T.match_buffer syntax sugar - if isinstance(arg.ty, (ast.TypeCall, ast.TypeApply)) and isinstance( - self.transform(arg.ty.func_name), ty.GenericBufferType - ): - result = self.handle_match_buffer_type(arg.ty, arg.name) - if not isinstance(result, buffer.Buffer): - self.report_error( - "The result type of evaluating TypeCall and TypeApply stmt" - f" is wrong: {type(result)}. 
It should be a Buffer", - node.span, - ) - arg_name_with_handle = arg.name + "_handle" - arg_var = tvm.te.var(arg_name_with_handle, tvm.ir.PrimType("handle")) - self.context.func_buffer_map[arg_var] = result - self.context.update_symbol(arg.name, result, node) - else: - arg_var = tvm.te.var(arg.name, self.parse_type(arg.ty, arg)) - self.context.update_symbol(arg.name, arg_var, node) - self.context.func_params.append(arg_var) - - if not check_decorator(node.decorators): - self.report_error( - "All functions should be decorated by `T.prim_func`", - node.span, - ) - - # fetch the body of root block - body = self.parse_body(node.body) - - # return a tir.PrimFunc - dict_attr = self.context.func_dict_attr - ret_type = self.parse_type(node.ret_type, node) if node.ret_type is not None else None - func = tvm.tir.PrimFunc( - self.context.func_params, - body, - ret_type, - buffer_map=self.context.func_buffer_map, - attrs=tvm.ir.make_node("DictAttrs", **dict_attr) if dict_attr else None, - span=tvm_span_from_synr(node.span), - ) - - # New Scope : Implicit root block - # Each function contains an implicit root block in TensorIR, - # so here we need a block scope for it. - # If the PrimFunc is not a TensorIR func (e.g. TE scheduled func or low-level func), - # the root block will not be added. The logic to add root block is in `_ffi_api.Complete` - - # Fix the PrimFunc - # 1. generate root block if necessary - # 2. generate surrounding loops for blocks if necessary - - func = call_with_error_reporting( - self.report_error, - node.span, - _ffi_api.Complete, - func, - self.context.root_alloc_buffers, - ) - - self.context.exit_scope() - return func - - def transform_Lambda(self, node): - """Lambda visitor - - Return an array of input parameters and the transformed lambda body. - """ - - self.context.enter_scope(nodes=[node.body]) - - # add parameters of the lambda - arg_vars = [] - for arg in node.params: - # Use "void" for dtype here. The actual type is not yet known and will be - # determined later. Using void type will allow IRSubstitute to do the - # replacement without flagging a type-mismatch error. - arg_var = tvm.te.var(arg.name, dtype="") - arg_vars.append(arg_var) - self.context.update_symbol(arg.name, arg_var, node) - - # the body of a lambda must be an expr - if not isinstance(node.body, ast.Expr): - self.report_error("The body of a lambda must be an expression", node.span) - - # transform the body of the lambda - body = self.transform(node.body) - - self.context.exit_scope() - return arg_vars, body - - def transform_Assign(self, node): - """Assign visitor - AST abstract grammar: - Assign(expr* targets, expr value, string? type_comment) - - By now 5 patterns of Assign is supported: - 1. special stmts with return value - 1.1 Buffer = T.match_buffer()/T.buffer_decl() - 1.2 Var = T.var() - 1.3 Var = T.env_thread() - 2. (BufferStore) Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr - 3. (Store) Var[PrimExpr] = PrimExpr - 4. with scope handlers with concise scoping and var def - 4.1 var = T.allocate() - 5. A call to a pure python function, consuming and producing TVMScript values. - The outputs are inlined into the following body (no variable is created). - x, y = f(...) 
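Editor's note: among the assignment patterns enumerated above, a plain let-binding (a call or constant on the right-hand side) lowers to `tir.LetStmt`. The same node built directly through the public API (a sketch):

.. code-block:: python

    import tvm
    from tvm import tir

    x = tir.Var("x", "int32")
    stmt = tir.LetStmt(x, tvm.tir.const(41, "int32"), tir.Evaluate(x + 1))
    print(stmt)   # let x = 41 in evaluate(x + 1)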
- """ - - if isinstance(node.rhs, ast.Call): - # Pattern 1 & Pattern 4 - if isinstance(node.rhs.func_name, ast.Op): - func = None - else: - func = self.transform(node.rhs.func_name) - - if isinstance(func, WithScopeHandler): - if not func.concise_scope or not func.def_symbol: - self.report_error( - "with scope handler " + func.signature()[0] + " is not suitable here", - node.rhs.span, - ) - # Pattern 4 - arg_list = self.parse_arg_list(func, node.rhs) - func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) - func.body = self.parse_body(node) - return func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) - elif isinstance(func, SpecialStmt): - # Pattern 1 - arg_list = self.parse_arg_list(func, node.rhs) - func.handle(node, self.context, arg_list, node.rhs.func_name.span) - return self.parse_body(node) - elif isinstance(func, types.FunctionType): - # Pattern 5 - args = [self.transform(arg) for arg in node.rhs.params] - try: - out = func(*args) - except Exception as e: - self.report_error( - "Error occurred when invoking the function " - + func.__name__ - + ": \n" - + str(e), - node.rhs.span, - ) - - if len(node.lhs) == 1 and not isinstance(out, list): - out = [out] - - assert len(out) == len(node.lhs) - - for var, value in zip(node.lhs, out): - self.context.update_symbol(var.id.name, value, node) - - body = self.parse_body(node) - - for var, value in zip(node.lhs, out): - self.context.remove_symbol(var.id.name) - - return body - - if isinstance(node.rhs, (ast.Call, ast.Constant)): - # Pattern 4 of let binding - value = self.transform(node.rhs) - if len(node.lhs) == 1 and not isinstance(node.lhs[0], ast.Var): - # This is a little confusing because it only is true when - # we have taken this branch. We might need to clarify what - # exectly is allowed in Assignments in tvmscript. - self.report_error( - "Left hand side of assignment must be an unqualified variable", - node.span, - ) - ast_var = node.lhs[0] - - if node.ty is None and hasattr(value, "dtype"): - var_ty = value.dtype - else: - var_ty = self.parse_type(node.ty, ast_var) - - var = tvm.te.var( - ast_var.id.name, - var_ty, - span=tvm_span_from_synr(ast_var.span), - ) - self.context.update_symbol(var.name, var, node) - body = self.parse_body(node) - self.context.remove_symbol(var.name) - return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span)) - - self.report_error( - """Assignments should be one of: - 1. A "special statement" with return value - 1.1 Buffer = T.match_buffer()/T.buffer_decl() - 1.2 Var = T.var() - 1.3 Var = T.env_thread() - 2. A store into a buffer: Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr - 3. A store into a variable: Var[PrimExpr] = PrimExpr - 4. A with scope handler with concise scoping and var def - 4.1 var = T.allocate() - 5. The right-hand side being a call to a pure python function, consuming and - producing TVMScript values. 
- x, y = f(...)""", - node.span, - ) - - def transform_SubscriptAssign(self, node): - """Visitor for statements of the form :code:`x[1] = 2`.""" - symbol = self.transform(node.params[0]) - indexes = self.transform(node.params[1]) - rhs = self.transform(node.params[2]) - rhs_span = tvm_span_from_synr(node.params[2].span) - if isinstance(symbol, tvm.tir.Buffer): - if len(indexes) != len(symbol.shape): - self.report_error( - f"Buffer {symbol.name} is {len(symbol.shape)}-dimensional, " - f"cannot be indexed by {len(indexes)}-dimensional indices.", - node.params[1].span, - ) - - def __convert_index(x): - if isinstance(x, Slice): - return x.as_index_expr(self.report_error) - return x - - # BufferStore - indexes = [__convert_index(x) for x in indexes] - return tvm.tir.BufferStore( - symbol, - tvm.runtime.convert(rhs, span=rhs_span), - indexes, - span=tvm_span_from_synr(node.span), - ) - else: - if symbol.dtype == "handle" and len(indexes) != 1: - self.report_error( - "Handles only support one-dimensional indexing. Use `T.match_buffer` to " - "construct a multidimensional buffer from a handle.", - node.params[0].span, - ) - if len(indexes) != 1: - self.report_error( - f"Store is only allowed with one index, but {len(indexes)} were provided.", - node.params[1].span, - ) - self.report_error( - "Use of tir.Store has been deprecated in favor of tir.BufferStore.", node.span - ) - - def transform_AttrAssign(self, node): - """Visitor for statements of the form :code:`x.y = 2`.""" - obj = self.transform(node.params[0]) - field = node.params[1] - value = self.transform(node.params[2]) - - if not hasattr(obj, field.name): - self.error(f"Field {field.name} does not exist", field.span) - - var = getattr(obj, field.name) - - if not isinstance(var, tvm.tir.Var): - self.error( - f"Can only assign to tir.Var attributes, not {type(var).__name__}", node.span - ) - - body = self.parse_body(node) - return tvm.tir.LetStmt(var, value, body, span=tvm_span_from_synr(node.span)) - - def transform_Assert(self, node): - """Assert visitor - - Pattern corresponds to concise mode of :code:`with T.Assert()`. - """ - - condition = self.transform(node.condition) - if node.msg is None: - self.report_error("Assert statements must have an error message.", node.span) - message = self.transform(node.msg) - body = self.parse_body(node) - return tvm.tir.AssertStmt( - condition, tvm.runtime.convert(message), body, span=tvm_span_from_synr(node.span) - ) - - def transform_For(self, node): - """For visitor - AST abstract grammar: - For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) - By now 1 pattern of For is supported: - 1. 
for scope handler - for name in T.serial()/T.parallel()/T.vectorized()/T.unroll()/range()/ - T.grid()/T.thread_binding() - """ - - if not isinstance(node.rhs, ast.Call): - self.report_error("The loop iterator should be a function call.", node.rhs.span) - func = self.transform(node.rhs.func_name) - if not isinstance(func, ForScopeHandler): - self.report_error( - "Only For scope handlers can be used in a for statement.", node.rhs.func_name.span - ) - # prepare for new for scope - old_lineno, old_col_offset = self.current_lineno, self.current_col_offset - self.current_lineno = node.span.start_line - self.current_col_offset = node.span.start_column - self.context.enter_scope(nodes=node.body.stmts) - # for scope handler process the scope - arg_list = [ - tvm.runtime.convert(arg, span=tvm_span_from_synr(node.rhs.span)) - for arg in self.parse_arg_list(func, node.rhs) - ] - func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) - func.body = self.parse_body(node) - res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) - # exit the scope - self.context.exit_scope() - self.current_lineno, self.current_col_offset = old_lineno, old_col_offset - return res - - def transform_While(self, node): - """While visitor - AST abstract grammar: - While(expr condition, stmt* body) - """ - condition = self.transform(node.condition) - # body - self.context.enter_scope(nodes=node.body.stmts) - body = self.parse_body(node) - self.context.exit_scope() - - return tvm.tir.While(condition, body, span=tvm_span_from_synr(node.span)) - - def transform_With(self, node): - """With visitor - AST abstract grammar: - With(withitem* items, stmt* body, string? type_comment) - withitem = (expr context_expr, expr? optional_vars) - By now 2 patterns of With is supported: - 1. with scope handler with symbol def - with T.allocate() as targets: - 2. 
with scope handler without symbol def - with T.block(*axes)/T.let()/T.Assert()/T.attr()/T.realize() - """ - - if not isinstance(node.rhs, ast.Call): - self.report_error( - "The context expression of a `with` statement should be a function call.", - node.rhs.span, - ) - - func = self.transform(node.rhs.func_name) - - if not isinstance(func, WithScopeHandler): - self.report_error( - f"Function {func} cannot be used in a `with` statement.", node.rhs.func_name.span - ) - # prepare for new block scope - old_lineno, old_col_offset = self.current_lineno, self.current_col_offset - self.current_lineno = node.body.span.start_line - self.current_col_offset = node.body.span.start_column - self.context.enter_block_scope(nodes=node.body.stmts) - # with scope handler process the scope - arg_list = self.parse_arg_list(func, node.rhs) - func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) - func.body = self.parse_body(node) - res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) - # exit the scope - self.context.exit_block_scope() - self.current_lineno, self.current_col_offset = old_lineno, old_col_offset - return res - - def transform_If(self, node): - """If visitor - AST abstract grammar: - If(expr test, stmt* body, stmt* orelse) - """ - - condition = self.transform(node.condition) - # then body - self.context.enter_scope(nodes=node.true.stmts) - then_body = self.parse_body(node) - self.context.exit_scope() - - # else body - if len(node.false.stmts) > 0: - self.context.enter_scope(nodes=node.false.stmts) - else_body = self.parse_body(node) - self.context.exit_scope() - else: - else_body = None - - return tvm.tir.IfThenElse( - condition, then_body, else_body, span=tvm_span_from_synr(node.span) - ) - - def transform_Call(self, node): - """Call visitor - - 3 different Call patterns are allowed: - 1. Intrin representing a PrimExpr/IterVar - 1.1 tir.int/uint/float8/16/32/64/floormod/floordiv/load/cast/ramp/broadcast/max - 1.2 tir.range/reduce_axis/scan_axis/opaque_axis - 2. tir.Op(dtype, ...) - 3. other callable functions - """ - - if isinstance(node.func_name, ast.Op): - if node.func_name.name == ast.BuiltinOp.Subscript: - return self.transform_Subscript(node) - if node.func_name.name in self._binop_maker: - lhs = self.transform(node.params[0]) - # There is no supertype for everything that can appear in - # an expression, so we manually add what we might get here. - if not isinstance(lhs, (tvm.tir.PrimExpr, BufferSlice)): - # We would really like to report a more specific - # error here, but this parser contains no distinction - # between parsing statements and parsing expressions. All - # rules just call `transform`. 
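Editor's note: pattern 2 in transform_Call's docstring, a `tir.Op` call, requires an explicit dtype keyword. The equivalent node via the public API (illustrative):

.. code-block:: python

    import tvm
    from tvm import tir

    x = tir.Var("x", "float32")
    call = tir.Call("float32", tvm.ir.Op.get("tir.exp"), [x])
    # i.e. what `T.exp(x, dtype="float32")` parses to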
- self.report_error( - f"Left hand side of binary op must be a PrimExpr, " - "but it is a {type(lhs).__name__}", - node.params[0].span, - ) - rhs = self.transform(node.params[1]) - if not isinstance(rhs, (tvm.tir.PrimExpr, BufferSlice)): - self.report_error( - f"Right hand side of binary op must be a PrimExpr, " - "but it is a {type(rhs).__name__}", - node.params[1].span, - ) - return call_with_error_reporting( - self.report_error, - node.span, - lambda node, lhs, rhs, span: self._binop_maker[node.func_name.name]( - lhs, rhs, span=span - ), - node, - lhs, - rhs, - tvm_span_from_synr(node.span), - ) - if node.func_name.name in self._unaryop_maker: - rhs = self.transform(node.params[0]) - if node.func_name.name == ast.BuiltinOp.USub and isinstance( - node.params[0], ast.Constant - ): - # '-literal' should be parsed together for proper literal type inference - if not isinstance(rhs, (tvm.tir.IntImm, tvm.tir.FloatImm)): - self.report_error("The literal is illegal after -", node.params[0].span) - return tvm.tir.const(-rhs.value) - return self._unaryop_maker[node.func_name.name]( - rhs, span=tvm_span_from_synr(node.span) - ) - self.report_error(f"Unsupported operator {node.func_name.name}.", node.func_name.span) - else: - func = self.transform(node.func_name) - if isinstance(func, Intrin) and not func.stmt: - # pattern 1 - arg_list = self.parse_arg_list(func, node) - return call_with_error_reporting( - self.report_error, - node.func_name.span, - func.handle, - arg_list, - node.func_name.span, - ) - else: - args = [self.transform(arg) for arg in node.params] - kw_args = { - self.transform(k): self.transform(v) for k, v in node.keyword_params.items() - } - if isinstance(func, tvm.tir.op.Op): - if not "dtype" in kw_args.keys(): - self.report_error(f"{func} requires a dtype keyword argument.", node.span) - # pattern 2 - return tvm.tir.Call( - kw_args["dtype"], func, args, span=tvm_span_from_synr(node.span) - ) - elif callable(func): - # pattern 3 - return func(*args, **kw_args) - else: - self.report_error( - f"Function is neither callable nor a tvm.tir.op.Op (it is a {type(func)}).", - node.func_name.span, - ) - - def transform_UnassignedCall(self, node): - """Visitor for statements that are function calls. - - This handles function calls that appear on thier own line like `tir.realize`. - - Examples - -------- - .. code-block:: python - - @T.prim_func - def f(): - A = T.buffer_decl([10, 10]) - T.realize(A[1:2, 1:2], "") # This is an UnassignedCall - A[1, 1] = 2 # This is also an UnassignedCall - """ - # Only allowed builtin operator that can be a statement is x[1] = 3 i.e. subscript assign. - if isinstance(node.call.func_name, ast.Op): - if node.call.func_name.name == ast.BuiltinOp.SubscriptAssign: - return self.transform_SubscriptAssign(node.call) - - if node.call.func_name.name == ast.BuiltinOp.AttrAssign: - return self.transform_AttrAssign(node.call) - - self.report_error( - "Binary and unary operators are not allowed as a statement", node.span - ) - - # handle a regular function call - func = self.transform(node.call.func_name) - arg_list = self.parse_arg_list(func, node.call) - - if isinstance(func, tir.scope_handler.AssertHandler): - self.report_error( - "A standalone `T.Assert` is not allowed. 
Use `assert condition, message` " - "instead.", - node.call.func_name.span, - ) - - if isinstance(func, Intrin): - if func.stmt: - return call_with_error_reporting( - self.report_error, - node.call.func_name.span, - func.handle, - arg_list, - node.call.func_name.span, - ) - else: - self.report_error(f"This intrinsic cannot be used as a statement.", node.call.span) - elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol: - func.enter_scope(node, self.context, arg_list, node.call.func_name.span) - func.body = self.parse_body(node) - return func.exit_scope(node, self.context, arg_list, node.call.func_name.span) - elif isinstance(func, SpecialStmt) and not func.def_symbol: - func.handle(node, self.context, arg_list, node.call.func_name.span) - return - - self.report_error( - "Unexpected statement. Expected an assert, an intrinsic, a with statement, or a " - f"special statement, but got {type(func).__name__}.", - node.call.func_name.span, - ) - - def transform_Slice(self, node): - """Index slice visitor.""" - start = self.transform(node.start) - end = self.transform(node.end) - if not ( - isinstance(node.step, ast.Constant) - and isinstance(node.step.value, int) - and node.step.value > 0 - ): - self.report_error( - "Only positive integer step size is supported for slices.", node.step.span - ) - return Slice(start, end, node.step.value, tvm_span_from_synr(node.span)) - - def transform_Subscript(self, node): - """Array access visitor. - - By now only 3 types of Subscript are supported: - 1. Buffer[index, index, ...], Buffer element access(BufferLoad & BufferStore) - Var[index] Buffer element access() - 2. Buffer[start: stop, start: stop, ...], BufferRealize(realize(buffer[...])) - 3. Array[index], Buffer element access - """ - - symbol = self.transform(node.params[0]) - if symbol is None: - self.report_error( - f"Variable {node.params[0].id.name} is not defined.", node.params[0].span - ) - - indexes = [self.transform(x) for x in node.params[1].values] - if isinstance(symbol, tvm.tir.expr.Var): - if symbol.dtype == "handle": - self.report_error( - "Cannot read directly from a handle, use `T.match_buffer` " - "to create a buffer to read from.", - node.params[0].span, - ) - if len(indexes) > 1: - self.report_error( - "Only a single index can be provided when indexing into a `var`.", - node.params[1].span, - ) - index = indexes[0] - if not isinstance(index, (tvm.tir.PrimExpr, int)): - self.report_error( - "Var load index should be an int or PrimExpr, but it is a" + type(index), - node.span, - ) - - self.report_error( - "Use of tir.Load has been deprecated in favor of tir.BufferLoad", node.span - ) - elif isinstance(symbol, tvm.tir.Buffer): - return BufferSlice( - symbol, indexes, self.report_error, span=tvm_span_from_synr(node.span) - ) - elif isinstance(symbol, tvm.container.Array): - if len(indexes) > 1: - self.report_error( - "Array access should be one-dimension access, but the indices are " - + str(indexes), - node.span, - ) - index = indexes[0] - if not isinstance(index, (int, tvm.tir.expr.IntImm)): - self.report_error( - "Array access index expected int or IntImm, but got " + type(index), - node.span, - ) - if int(index) >= len(symbol): - self.report_error( - f"Array access out of bound, size: {len(symbol)}, got index {index}.", - node.span, - ) - return symbol[int(index)] - else: - self.report_error( - f"Cannot subscript from a {type(symbol).__name__}. 
Only variables and " - "buffers are supported.", - node.params[0].span, - ) - - def transform_Attr(self, node): - """Visitor for field access of the form `x.y`. - - This visitor is used to lookup function and symbol names. We have two - cases to handle here: - 1. If we have a statement of the form `tir.something`, then we lookup - `tir.something` in the `Registry`. If the function is not in the - registry, then we try to find a `tvm.ir.op.Op` with the same name. - 2. All other names `tvm.something` are lookup up in this current python - namespace. - """ - - def get_full_attr_name(node: ast.Attr) -> str: - reverse_field_names = [node.field.name] - while isinstance(node.object, ast.Attr): - node = node.object - reverse_field_names.append(node.field.name) - if isinstance(node.object, ast.Var): - reverse_field_names.append(node.object.id.name) - return ".".join(reversed(reverse_field_names)) - - if isinstance(node.object, (ast.Var, ast.Attr)): - full_attr_name = get_full_attr_name(node) - attr_object, fields = full_attr_name.split(".", maxsplit=1) - if self.match_tir_namespace(attr_object): - func_name = "tir." + fields - res = Registry.lookup(func_name) - if res is not None: - return res - try: - return tvm.ir.op.Op.get(func_name) - except TVMError as e: - # Check if we got an attribute error - if e.args[0].find("AttributeError"): - self.report_error(f"Unregistered function `tir.{fields}`.", node.span) - else: - raise e - - symbol = self.transform(node.object) - if symbol is None: - self.report_error("Unsupported Attribute expression.", node.object.span) - if not hasattr(symbol, node.field.name): - self.report_error( - f"Type {type(symbol)} does not have a field called `{node.field.name}`.", node.span - ) - res = getattr(symbol, node.field.name) - return res - - def transform_TypeAttr(self, node): - """Visitor for field access of the form `x.y` for types. - - We have two cases here: - 1. If the type is of the form `T.something`, we look up the type in - the `tir` namespace in this module. - 2. If the type is of the form `tvm.x.something` then we look up - `tvm.x.something` in this modules namespace. - """ - if isinstance(node.object, ast.TypeVar): - if self.match_tir_namespace(node.object.id.name): - if not hasattr(tir, node.field.name): - self.report_error( - f"Invalid type annotation `tir.{node.field.name}`.", node.span - ) - return getattr(tir, node.field.name) - - symbol = self.transform(node.object) - if symbol is None: - self.report_error("Unsupported Attribute expression", node.object.span) - if not hasattr(symbol, node.field): - self.report_error( - f"Type {type(symbol)} does not have a field called `{node.field}`.", node.span - ) - res = getattr(symbol, node.field) - return res - - def transform_DictLiteral(self, node): - """Dictionary literal visitor. - - Handles dictionary literals of the form `{x:y, z:2}`. - """ - - keys = [self.transform(key) for key in node.keys] - values = [self.transform(value) for value in node.values] - - return dict(zip(keys, values)) - - def transform_Tuple(self, node): - """Tuple visitor. - - Handles tuples of the form `(x, y, 2)`. - """ - - return tuple(self.transform(element) for element in node.values) - - def transform_ArrayLiteral(self, node): - """List literal visitor. - - Handles lists of the form `[x, 2, 3]`. - """ - - return [self.transform(element) for element in node.values] - - def transform_Var(self, node): - """Variable visitor - - Handles variables like `x` in `x = 2`. 
- """ - - name = node.id.name - if name == "meta": - return self.meta - symbol = Registry.lookup(name) - if symbol is not None: - return symbol - symbol = self.context.lookup_symbol(name) - if symbol is not None: - return symbol - self.report_error(f"Unknown identifier {name}.", node.span) - - def transform_TypeVar(self, node): - """Type variable visitor. - - Equivalent to `transform_Var` but for types. - """ - name = node.id.name - symbol = Registry.lookup(name) or self.context.lookup_symbol(name) - if symbol is not None: - return symbol - self.report_error(f"Unknown identifier {name}.", node.span) - - def transform_Constant(self, node): - """Constant value visitor. - - Constant values include `None`, `"strings"`, `2` (integers), `4.2` - (floats), and `true` (booleans). - """ - return tvm.runtime.convert(node.value, span=tvm_span_from_synr(node.span)) - - def transform_TypeConstant(self, node): - """Constant value visitor for types. - - See `transform_Constant`. - """ - if self._inside_buffer_sugar: - return self.transform_Constant(node) - - return node.value - - def transform_TypeTuple(self, node): - """Tuple value visitor for types. - - Mostly used in `transform_TypeCall` and `transform_TypeApply`. - """ - return [self.transform(value) for value in node.values] - - def transform_TypeCall(self, node): - """TypeCall visitor - - This occurs when an expression is used inside a T.Buffer - parameter annotation. - """ - - # ast.Call has the BuiltinOp as node.func_name.name, where - # ast.TypeCall has the BuiltinOp as node.func_name. So we can - # delegate to self.transform_Call, but the error messages for - # unsupported operations will highlight the entire expression - # and not just the function itself. - op = ast.Op(node.span, node.func_name) - call = ast.Call(node.span, op, node.params, node.keyword_params) - return self.transform_Call(call) - - def transform_TypeApply(self, node): - """Visitor for Type[Type] expressions. - - Mostly used for ``T.Ptr`` expressions. - """ - func = self.transform(node.func_name) - - if not isinstance(func, ty.TypeGeneric) or not hasattr(func, "__getitem__"): - self.report_error( - f"Use of type arguments requires a type that accepts type arguments (e.g. T.Ptr), " - f"but found {type(func).__name__} instead.", - node.span, - ) - - param_types = [] - for idx, param in enumerate(node.params): - param_type = self.transform(param) - if not isinstance(param_type, ty.TypeGeneric) and func.require_type_generic_at(idx): - self.report_error( - f"Expected a type but found {type(param).__name__} " - f"at {idx}th type argument", - param.span, - ) - - param_types.append(param_type) - - if len(param_types) == 1: - return func[param_types[0]] - else: - return func[param_types] - - def handle_match_buffer_type(self, node, buffer_name): - """special function to handle syntax sugar for match buffer. - - This method is for buffer declarations in the function parameters. - """ - func = self.transform(node.func_name) - assert isinstance(func, SpecialStmt) - - # parse args and kwargs for TypeCall and TypeApply - self._inside_buffer_sugar = True - try: - arg_list = self.parse_arg_list(func, node) - finally: - self._inside_buffer_sugar = False - - # Note that the third element in arg_list would always be the 'name' - # TODO: This index is hardcoded as a workaround. 
Better to make it programmatic - if arg_list[2] is None: - arg_list[2] = buffer_name - buf = func.handle(node, self.context, arg_list, node.func_name.span) - return buf - - def transform_Return(self, node): - self.report_error( - "TVM script does not support return statements. Instead the last statement in any " - "block is implicitly returned.", - node.span, - ) - - -def get_tir_namespace(script: Union[Callable, type]) -> List[str]: - assert inspect.isfunction(script) or inspect.isclass(script) - env: Dict[str, Any] = script.__globals__ - return [key for key in env.keys() if env[key] == tir] - - -def from_source( - input_func: Union[str, Callable], tir_prefix: Optional[List[str]] = None -) -> Union[PrimFunc, IRModule]: - """Parse function or string into PrimFunc or IRModule. - - If possible, pass the TVM script in as a function so that line numbers and - filename will be accurate. - - Parameters - ---------- - input_module : Union[str, Callable] - The python function to be parsed. - - tir_prefix : Optional[List[str]] - The tir prefix list. Only works for str input, default by "tir" and "T". - - Returns - ------- - output : Union[Function, Module] - The Function or Module in IR. - """ - if isinstance(input_func, str): - tir_prefix = ["T", "tir"] if tir_prefix is None else tir_prefix - return to_ast(input_func, TVMDiagnosticCtx(), TVMScriptParser(0, tir_prefix, {})) - elif inspect.isfunction(input_func): - _, start_line = inspect.getsourcelines(input_func) - env: Dict[str, Any] = input_func.__globals__ - namespace = [key for key in env.keys() if env[key] is tir] - _closure_vars = inspect.getclosurevars(input_func) - closure_vars = {**_closure_vars.nonlocals, **_closure_vars.globals} - parser = TVMScriptParser(start_line, namespace, closure_vars) - result = to_ast(input_func, TVMDiagnosticCtx(), parser) - return result - else: - raise TypeError("Only function definitions are supported.") - - -def ir_module(input_module: type) -> IRModule: - """Decorate a python class as tvm IRModule. - - Parameters - ---------- - input_module : type - The python class to be parsed. - - Returns - ------- - output : IRModule - The result IRModule. - """ - if inspect.isclass(input_module): - func_dict = { - name: f for name, f in input_module.__dict__.items() if isinstance(f, BaseFunc) - } - return IRModule(func_dict) - raise TypeError("Only class definitions are supported.") diff --git a/python/tvm/script/parser_v1/registry.py b/python/tvm/script/parser_v1/registry.py deleted file mode 100644 index e7d90dd51517..000000000000 --- a/python/tvm/script/parser_v1/registry.py +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
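Editor's note: `from_source` above accepts either a source string or a decorated function, with the default "T"/"tir" prefixes for string input. Usage looks like this (a sketch, assuming the entry point keeps this signature in the replacement parser):

.. code-block:: python

    import textwrap
    import tvm.script

    src = textwrap.dedent(
        """
        @T.prim_func
        def f(a: T.handle) -> None:
            A = T.match_buffer(a, (4,), "float32")
            T.evaluate(0)
        """
    )
    func = tvm.script.from_source(src)   # default tir prefixes: ["T", "tir"]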
-"""TVM Script Parser Function Registry """ -# pylint: disable=inconsistent-return-statements, relative-beyond-top-level, import-outside-toplevel -import types -from typing import Union, Callable, Dict, Optional, Any - - -class Registry(object): - """Registration map - All these maps are static - """ - - registrations: Dict[str, type] = dict() - - @staticmethod - def lookup(name: str) -> Optional[Any]: - if name in Registry.registrations: - # every time we create a new handler - # since we may want to keep some local info inside it - return Registry.registrations[name]() - return None - - -def register(inputs: Union[Callable, type]) -> type: - """Register Intrin/ScopeHandler/SpecialStmt""" - registration: type - if isinstance(inputs, types.FunctionType): - # is function - from .tir.intrin import Intrin - - def create_new_intrin(func) -> type: - class NewIntrin(Intrin): - def __init__(self): - super().__init__(func) - - return NewIntrin - - registration = create_new_intrin(inputs) - elif isinstance(inputs, type): - # is class - registration = inputs - else: - raise ValueError() - - key: str = registration().signature()[0] - Registry.registrations[key] = registration - return registration diff --git a/python/tvm/script/parser_v1/tir/__init__.py b/python/tvm/script/parser_v1/tir/__init__.py deleted file mode 100644 index 662dd10ec068..000000000000 --- a/python/tvm/script/parser_v1/tir/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVMScript for TIR""" - -# Type system -from .ty import void, boolean, handle, Ptr, Tuple, Buffer -from .ty import bool # pylint: disable=redefined-builtin - -from .prim_func import prim_func - -# add all floating point and integer datatypes to the module -for _dtype in ["float", "uint", "int"]: - for _size in ["8", "16", "32", "64"]: - for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]: - from . import ty - - _name = _dtype + _size + _lanes - if hasattr(ty, _name): - globals()[_name] = getattr(ty, _name) diff --git a/python/tvm/script/parser_v1/tir/__init__.pyi b/python/tvm/script/parser_v1/tir/__init__.pyi deleted file mode 100644 index beefaf4c75d7..000000000000 --- a/python/tvm/script/parser_v1/tir/__init__.pyi +++ /dev/null @@ -1,475 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=redefined-builtin -from typing import ( - Any, - Callable, - ContextManager, - Dict, - Iterable, - Optional, - Tuple, - Union, - Sequence, - List, - Mapping, - overload, -) -from numbers import Number -import builtins - -from tvm.tir.function import PrimFunc -from tvm.tir import Range -from tvm.runtime import Object -from tvm.target import Target -from .node import BufferSlice - -""" -redefine types -""" - -class PrimExpr: - def __init__(self: PrimExpr) -> None: ... - @overload - def __add__(self: PrimExpr, other: PrimExpr) -> PrimExpr: ... - @overload - def __add__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - @overload - def __sub__(self: PrimExpr, other: PrimExpr) -> PrimExpr: ... - @overload - def __sub__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - @overload - def __mul__(self: PrimExpr, other: PrimExpr) -> PrimExpr: ... - @overload - def __mul__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - @overload - def __div__(self: PrimExpr, other: PrimExpr) -> PrimExpr: ... - @overload - def __div__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - def __mod__(self: PrimExpr, other: Union[int, float, PrimExpr]) -> PrimExpr: ... - def __radd__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - def __rsub__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - def __rmul__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - def __rdiv__(self: PrimExpr, other: Union[int, float]) -> PrimExpr: ... - def __floordiv__(self: PrimExpr, other: Union[int, float, PrimExpr]) -> PrimExpr: ... - def __index__(self: PrimExpr) -> int: ... # so range doesn't complain - -class Var(PrimExpr): ... -class IterVar(Var): ... - -class Buffer: - @overload - def __getitem__(self: Buffer, pos: Sequence[Union[PrimExpr, int, slice]]) -> PrimExpr: ... - @overload - def __getitem__(self: Buffer, pos: Union[PrimExpr, int, slice]) -> PrimExpr: ... - @overload - def __setitem__( - self: Buffer, pos: Sequence[Union[PrimExpr, int, slice]], value: PrimExpr - ) -> None: ... - @overload - def __setitem__(self: Buffer, pos: Union[PrimExpr, int, slice], value: PrimExpr) -> None: ... - @property - def data(self: Buffer) -> Ptr: ... - -""" -Intrinsic -""" - -def min_value(dtype: str) -> PrimExpr: ... -def max_value(dtype: str) -> PrimExpr: ... -def floordiv(x: PrimExpr, y: PrimExpr) -> PrimExpr: ... -def floormod(x: PrimExpr, y: PrimExpr) -> PrimExpr: ... -def ceildiv(x: PrimExpr, y: PrimExpr) -> PrimExpr: ... -def truncmod(x: PrimExpr, y: PrimExpr) -> PrimExpr: ... -def truncdiv(x: PrimExpr, y: PrimExpr) -> PrimExpr: ... -def abs(x: PrimExpr) -> PrimExpr: ... -def load( - dtype: str, var: Var, index: PrimExpr, predicate: Union[PrimExpr, builtins.bool] = None -) -> PrimExpr: ... -def cast(value: PrimExpr, dtype: str) -> PrimExpr: ... -def ramp(base: PrimExpr, stride: Any, lanes: int) -> PrimExpr: ... -def broadcast(value: PrimExpr, lanes: int) -> PrimExpr: ... -def iter_var(var: Union[Var, str], dom: Range, iter_type: int, thread_tag: str) -> IterVar: ... -def max(a: PrimExpr, b: PrimExpr) -> PrimExpr: ... 
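For context, the stubs above are type-only declarations: at parse time each name dispatches to a registered Intrin handler, which this patch also deletes. A minimal sketch (names like `relu` are illustrative, not from the patch) of a script that exercises the arithmetic overloads and intrinsic stubs, assuming a TVM build that still ships the v1 parser:

.. code-block:: python

    from tvm.script import tir as T

    @T.prim_func
    def relu(a: T.handle, b: T.handle) -> None:
        # A and B bind the DLTensor arguments declared as T.handle above
        A = T.match_buffer(a, (16,), dtype="float32")
        B = T.match_buffer(b, (16,), dtype="float32")
        for i in T.serial(0, 16):
            # indexing and T.max(...) resolve through the PrimExpr/Buffer stubs
            B[i] = T.max(A[i], T.float32(0))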
-def min(a: PrimExpr, b: PrimExpr) -> PrimExpr: ... -def Select(cond: PrimExpr, if_body: PrimExpr, else_body: PrimExpr) -> PrimExpr: ... -def if_then_else(cond: PrimExpr, t: PrimExpr, f: PrimExpr, dtype: str) -> PrimExpr: ... -def evaluate(value: PrimExpr) -> None: ... -def reinterpret(value: PrimExpr, dtype: str) -> PrimExpr: ... -def vectorlow(value: PrimExpr, dtype: str) -> PrimExpr: ... -def vectorhigh(value: PrimExpr, dtype: str) -> PrimExpr: ... -def store( - var: Var, index: PrimExpr, value: PrimExpr, predicate: Union[PrimExpr, builtins.bool] = True -) -> None: ... -def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ... -def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ... - -""" -Intrinsics - tvm builtin -""" - -def tvm_thread_allreduce( - *freduceargs: Union[PrimExpr, builtins.bool, Ptr], dtype: str -) -> PrimExpr: ... - -""" -Unary operator -Note that any intrinsics not registered in script.tir.intrin -should add "dtype" as an argument. This is different from their -definition but intentional. -""" - -def exp(x: PrimExpr, dtype: str) -> PrimExpr: ... -def exp2(x: PrimExpr, dtype: str) -> PrimExpr: ... -def exp10(x: PrimExpr, dtype: str) -> PrimExpr: ... -def erf(x: PrimExpr, dtype: str) -> PrimExpr: ... -def tanh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def sigmoid(x: PrimExpr, dtype: str) -> PrimExpr: ... -def log(x: PrimExpr, dtype: str) -> PrimExpr: ... -def log2(x: PrimExpr, dtype: str) -> PrimExpr: ... -def log10(x: PrimExpr, dtype: str) -> PrimExpr: ... -def log1p(x: PrimExpr, dtype: str) -> PrimExpr: ... -def tan(x: PrimExpr, dtype: str) -> PrimExpr: ... -def cos(x: PrimExpr, dtype: str) -> PrimExpr: ... -def cosh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def acos(x: PrimExpr, dtype: str) -> PrimExpr: ... -def acosh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def sin(x: PrimExpr, dtype: str) -> PrimExpr: ... -def sinh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def asin(x: PrimExpr, dtype: str) -> PrimExpr: ... -def asinh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def atan(x: PrimExpr, dtype: str) -> PrimExpr: ... -def atanh(x: PrimExpr, dtype: str) -> PrimExpr: ... -def atan2(x: PrimExpr, dtype: str) -> PrimExpr: ... -def sqrt(x: PrimExpr, dtype: str) -> PrimExpr: ... -def rsqrt(x: PrimExpr, dtype: str) -> PrimExpr: ... - -""" -special_stmt - Buffers -""" - -def match_buffer( - param: Union[Var, BufferSlice], - shape: Sequence[Union[PrimExpr, int]], - dtype: str = "float32", - data: Var = None, - strides: Optional[Sequence[int]] = None, - elem_offset: Optional[int] = None, - scope: str = "global", - align: int = -1, - offset_factor: int = 0, - buffer_type: str = "default", - axis_separators: Optional[List[int]] = None, -) -> Buffer: ... -def decl_buffer( - shape: Sequence[Union[PrimExpr, int]], - dtype: str = "float32", - data: Var = None, - strides: Optional[Sequence[int]] = None, - elem_offset: Optional[int] = None, - scope: str = "global", - align: int = -1, - offset_factor: int = 0, - buffer_type: str = "default", - axis_separators: Optional[List[int]] = None, -) -> Buffer: ... -def buffer_decl( - shape: Sequence[Union[PrimExpr, int]], - dtype: str = "float32", - data: Var = None, - strides: Optional[Sequence[int]] = None, - elem_offset: Optional[int] = None, - scope: str = "global", - align: int = -1, - offset_factor: int = 0, - buffer_type: str = "default", - axis_separators: Optional[List[int]] = None, -) -> Buffer: ... 
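The three buffer stubs above share one parameter list but differ in intent: `match_buffer` binds a DLTensor function parameter, while `decl_buffer`/`buffer_decl` declare a buffer over an existing data pointer. A minimal sketch of the parameter-binding case, again assuming the v1 parser; `cast_copy` is an illustrative name:

.. code-block:: python

    from tvm.script import tir as T

    @T.prim_func
    def cast_copy(a: T.handle, b: T.handle) -> None:
        A = T.match_buffer(a, (32, 32), dtype="int8")
        B = T.match_buffer(b, (32, 32), dtype="int32")
        for i, j in T.grid(32, 32):
            # plain loop vars produce a BufferStore; T.cast maps to tir.Cast
            B[i, j] = T.cast(A[i, j], "int32")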
-def alloc_buffer( - shape: Sequence[Union[PrimExpr, int]], - dtype: str = "float32", - data: Var = None, - strides: Optional[Sequence[int]] = None, - elem_offset: Optional[int] = None, - scope: str = "global", - align: int = -1, - offset_factor: int = 0, - buffer_type: str = "default", - axis_separators: Optional[List[int]] = None, -) -> Buffer: ... - -""" -special_stmt - Reads/Writes -""" - -@overload -def reads(read_regions: List[BufferSlice]) -> None: ... -@overload -def reads(*read_regions: BufferSlice) -> None: ... -@overload -def writes(write_region: List[BufferSlice]) -> None: ... -@overload -def writes(*write_region: BufferSlice) -> None: ... -def block_attr(attrs: Mapping[str, Object]) -> None: ... - -""" -special_stmt - Axis -""" - -class axis: - @overload - @staticmethod - def spatial(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def spatial( - dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr - ) -> IterVar: ... - @overload - @staticmethod - def S(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def S(dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def reduce(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def reduce( - dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr - ) -> IterVar: ... - @overload - @staticmethod - def R(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def R(dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def scan(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def scan( - dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr - ) -> IterVar: ... - @overload - @staticmethod - def opaque(dom: Union[PrimExpr, int], value: PrimExpr) -> IterVar: ... - @overload - @staticmethod - def opaque( - dom: Tuple[Union[PrimExpr, int], Union[PrimExpr, int]], value: PrimExpr - ) -> IterVar: ... - @staticmethod - def remap(iter_types: str, loop_vars: List[Var]) -> List[IterVar]: ... - -def get_axis(begin: PrimExpr, end: PrimExpr, iter_type: int) -> IterVar: ... - -""" -special_stmt - Annotations -""" - -def buffer_var(dtype: str, storage_scope: str) -> Var: ... -def func_attr(attrs: Mapping[str, Union[Object, str, bool, int, float]]) -> None: ... -def prim_func(input_func: Callable) -> PrimFunc: ... - -""" -special_stmt - Threads and Bindings -""" - -def env_thread(env_name: str) -> IterVar: ... -def bind(iter_var: IterVar, expr: PrimExpr) -> None: ... - -""" -Scope handler -""" - -class block(ContextManager): - def __init__(self, name_hint: str = "") -> None: ... - def __enter__(self) -> Sequence[IterVar]: ... - -class init(ContextManager): - def __init__(self) -> None: ... - -class let(ContextManager): - def __init__(self, var: Var, value: PrimExpr) -> None: ... - -def where(cond: PrimExpr) -> None: ... -def allocate( - extents: List[PrimExpr], - dtype: str, - scope: str, - condition: Union[PrimExpr, builtins.bool] = True, - annotations: Optional[Mapping[str, Object]] = None, -) -> Buffer: ... -def launch_thread(env_var: Var, extent: Union[int, PrimExpr]) -> Var: ... -def realize( - buffer_slice: BufferSlice, scope: str, condition: Union[PrimExpr, builtins.bool] = True -) -> None: ... -def attr(node: PrimExpr, attr_key: str, value: PrimExpr) -> None: ... 
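The block-related stubs above (`block`, `init`, `reads`, `writes`, `T.axis.*`) combine as in this sketch of a reduction, assuming the v1 parser. When `T.reads`/`T.writes` are omitted, the parser instead marks the block for automatic access-region detection via the `tir.script_parsing_detect_access` annotation, as the Block scope handler deleted further down in this patch shows:

.. code-block:: python

    from tvm.script import tir as T

    @T.prim_func
    def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
        A = T.match_buffer(a, (128, 128), dtype="float32")
        B = T.match_buffer(b, (128, 128), dtype="float32")
        C = T.match_buffer(c, (128, 128), dtype="float32")
        for i, j, k in T.grid(128, 128, 128):
            with T.block("update"):
                # "SSR" remaps i, j as spatial axes and k as a reduction axis
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                T.reads(C[vi, vj], A[vi, vk], B[vk, vj])
                T.writes(C[vi, vj])
                with T.init():
                    C[vi, vj] = T.float32(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]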
-def Assert(condition: Union[PrimExpr, builtins.bool], message: str) -> PrimExpr: ... - -""" -Scope handler - Loops -""" - -@overload -def serial( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def serial( - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def parallel( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def parallel( - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def vectorized( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def vectorized( - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def unroll( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def unroll( - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def thread_binding( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - thread: str, - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def thread_binding( - end: Union[PrimExpr, int], - thread: str, - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def for_range( - begin: Union[PrimExpr, int], - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -@overload -def for_range( - end: Union[PrimExpr, int], - annotations: Optional[Mapping[str, Object]] = None, -) -> Iterable[IterVar]: ... -def grid(*extents: Union[PrimExpr, int]) -> Iterable[Sequence[IterVar]]: ... - -""" -ty - redefine types -""" - -class boolean: ... - -class handle(Var): - @overload - def __getitem__(self: handle, pos: Sequence[Union[int, PrimExpr, slice]]) -> Buffer: ... - @overload - def __getitem__(self: handle, pos: Union[int, PrimExpr, slice]) -> Buffer: ... - @overload - def __setitem__( - self: handle, pos: Sequence[Union[int, PrimExpr, slice]], value: Buffer - ) -> None: ... - @overload - def __setitem__(self: handle, pos: Union[int, PrimExpr, slice], value: Buffer) -> None: ... - @property - def data(self: handle) -> Ptr: ... - -class Ptr: ... - -def target(target_str: Union[str, Mapping[str, Object]]) -> Target: ... - -class var(Var): - def __init__(self: Var, dtype: str): ... - -class bool(PrimExpr): - def __init__(self: bool, imm: Union[PrimExpr, builtins.bool, builtins.int]): ... - -class int8(PrimExpr): - def __init__(self: int8, imm: Union[PrimExpr, int]): ... - -class int16(PrimExpr): - def __init__(self: int16, imm: Union[PrimExpr, int]): ... - -class int32(PrimExpr): - def __init__(self: int32, imm: Union[PrimExpr, int]): ... - -class int64(PrimExpr): - def __init__(self: int64, imm: Union[PrimExpr, int]): ... - -class uint8(PrimExpr): - def __init__(self: uint8, imm: Union[PrimExpr, int]): ... - -class uint16(PrimExpr): - def __init__(self: uint16, imm: Union[PrimExpr, int]): ... - -class uint32(PrimExpr): - def __init__(self: uint32, imm: Union[PrimExpr, int]): ... 
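Each loop stub above lowers to a tir.For with the corresponding ForKind, and the single-argument overloads start the range at zero (the deleted ForScopeHandler code below swaps begin/end when only one bound is given). A short sketch combining two loop kinds, assuming the v1 parser; `init_rows` is an illustrative name:

.. code-block:: python

    from tvm.script import tir as T

    @T.prim_func
    def init_rows(a: T.handle) -> None:
        A = T.match_buffer(a, (64, 64), dtype="float32")
        for i in T.parallel(0, 64):      # ForKind.PARALLEL
            for j in T.vectorized(64):   # single-arg form: begin defaults to 0
                A[i, j] = T.float32(0)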
- -class uint64(PrimExpr): - def __init__(self: uint64, imm: Union[PrimExpr, int]): ... - -# use typing.Literal instead for python 3.8 or higher -import sys - -if sys.version_info >= (3, 8): - from typing import Literal - - SpecialFloatLiteral = Literal["inf", "-inf", "nan"] -else: - SpecialFloatLiteral = str - -class float8(PrimExpr): - def __init__(self: float8, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... - -class float16(PrimExpr): - def __init__(self: float16, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... - -class float32(PrimExpr): - def __init__(self: float32, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... - -class float64(PrimExpr): - def __init__(self: float64, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... diff --git a/python/tvm/script/parser_v1/tir/intrin.py b/python/tvm/script/parser_v1/tir/intrin.py deleted file mode 100644 index 9cde8e3f6d08..000000000000 --- a/python/tvm/script/parser_v1/tir/intrin.py +++ /dev/null @@ -1,307 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Parser Intrinsic Classes""" -# pylint: disable=redefined-builtin, relative-beyond-top-level -import builtins -from typing import Any, List - -import tvm.tir -from tvm.tir import FloatImm - -from ....target import codegen -from ..registry import register -from ..utils import get_param_list, tvm_span_from_synr - - -class Intrin: - def __init__(self, intrin, stmt=False): - self.intrin = intrin - self.stmt = stmt - - def signature(self): - return "tir." 
+ self.intrin.__name__, get_param_list(self.intrin) - - def handle(self, arg_list: List[Any], span: tvm.ir.Span): - return self.intrin(*arg_list, span=tvm_span_from_synr(span)) - - -@register -def bool(imm, span): - return imm.astype("bool", span) - - -# register all datatypes -for _dtype in ["float", "uint", "int"]: - for _size in ["8", "16", "32", "64"]: - for _lanes in ["", "x4", "x8", "x16", "x32"]: - _name = _dtype + _size + _lanes - - # nest closures so we copy the name string - def wrap(name): - def f(imm, span): - if name.startswith("float"): - if imm in {"inf", "-inf", "nan"}: - return FloatImm(dtype=name, value=float(imm), span=span) - return imm.astype(name, span) - - f.__name__ = name - return f - - _intrin = wrap(_name) - register(_intrin) - - -@register -def min_value(dtype, span): - return tvm.tir.min_value(dtype, span) - - -@register -def max_value(dtype, span): - return tvm.tir.max_value(dtype, span) - - -@register -def floordiv(x, y, span): - return tvm.tir.floordiv(x, y, span) - - -@register -def floormod(x, y, span): - return tvm.tir.floormod(x, y, span) - - -@register -def truncmod(x, y, span): - return tvm.tir.truncmod(x, y, span) - - -@register -def truncdiv(x, y, span): - return tvm.tir.truncdiv(x, y, span) - - -@register -def ceildiv(x, y, span): - return tvm.tir.ceildiv(x, y, span) - - -@register -def abs(x, span): - return tvm.tir.abs(x, span) - - -@register -def load(dtype, var, index, predicate=None, span=None): - return tvm.tir.Load(dtype, var, index, predicate, span) - - -@register -def cast(value, dtype, span): - return tvm.tir.Cast(dtype, value, span) - - -@register -def ramp(base, stride, lanes, span): - return tvm.tir.Ramp(base, stride, lanes.value, span) - - -@register -def broadcast(value, lanes, span): - return tvm.tir.Broadcast(value, lanes.value, span) - - -@register -def iter_var(var, dom, iter_type, thread_tag, span): - iter_type = getattr(tvm.tir.IterVar, iter_type) - return tvm.tir.IterVar(dom, var, iter_type, thread_tag, span) - - -@register -def max(a, b, span): # pylint: disable=redefined-builtin - return tvm.tir.Max(a, b, span) - - -@register -def min(a, b, span): # pylint: disable=redefined-builtin - return tvm.tir.Min(a, b, span) - - -def get_axis(begin, end, iter_type, span): - ana = tvm.arith.Analyzer() - extent = ana.simplify(end - begin) - block_var_dom = tvm.ir.Range.from_min_extent(begin, extent) - - iter_type_dict = {"data_par": 0, "reduce": 2, "scan": 3, "opaque": 4} - return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type], span=span) - - -@register -def range(begin, end, span): - return get_axis(begin, end, "data_par", span) - - -@register -def reduce_axis(begin, end, span): - return get_axis(begin, end, "reduce", span) - - -@register -def scan_axis(begin, end, span): - return get_axis(begin, end, "scan", span) - - -@register -def opaque_axis(begin, end, span): - return get_axis(begin, end, "opaque", span) - - -@register -def Select(cond, if_body, else_body, span): # pylint: disable=invalid-name - return tvm.tir.Select(cond, if_body, else_body, span) - - -@register -def Let(var, value, body, span): # pylint: disable=invalid-name - return tvm.tir.Let(var, value, body, span) - - -@register -class EvaluateIntrin(Intrin): - def __init__(self): - def evaluate(value, span): - return tvm.tir.Evaluate(value, span) - - super().__init__(evaluate, stmt=True) - - -@register -class StoreIntrin(Intrin): - def __init__(self): - def store(var, index, value, predicate=True, span=None): - return tvm.tir.Store(var, value, index, 
predicate, span) - - super().__init__(store, stmt=True) - - -@register -class AssumeIntrin(Intrin): - def __init__(self): - def assume(constraint, span): - return tvm.tir.Evaluate( - tvm.tir.call_intrin("bool", "tir.assume", constraint, span=span) - ) - - super().__init__(assume, stmt=True) - - -@register -def comm_reducer(lambda_io, identities, span): - """Create a CommReducer from lambda inputs/outputs and the identities""" - lambda_input = lambda_io[0] - lambda_output = lambda_io[1] - - num_args = len(lambda_input) - num_arg_per_group = num_args // 2 - x = [lambda_input[i] for i in builtins.range(0, num_arg_per_group)] - y = [lambda_input[i] for i in builtins.range(num_arg_per_group, num_args)] - - if not isinstance(lambda_output, tuple): - lambda_output = (lambda_output,) - - return tvm.tir.CommReducer(x, y, lambda_output, identities, span) - - -@register -def llvm_lookup_intrinsic_id(name, span): - # pylint: disable=unused-argument - return codegen.llvm_lookup_intrinsic_id(name) - - -@register -def FloorMod(x, y, span): # pylint: disable=invalid-name - return tvm.tir.FloorMod(x, y, span) - - -@register -def FloorDiv(x, y, span): # pylint: disable=invalid-name - return tvm.tir.FloorDiv(x, y, span) - - -@register -def Mul(x, y, span): # pylint: disable=invalid-name - return tvm.tir.Mul(x, y, span) - - -@register -def Div(x, y, span): # pylint: disable=invalid-name - return tvm.tir.Div(x, y, span) - - -@register -def Add(x, y, span): # pylint: disable=invalid-name - return tvm.tir.Add(x, y, span) - - -@register -def Sub(x, y, span): # pylint: disable=invalid-name - return tvm.tir.Sub(x, y, span) - - -@register -def LT(x, y, span): # pylint: disable=invalid-name - return tvm.tir.LT(x, y, span) - - -@register -def LE(x, y, span): # pylint: disable=invalid-name - return tvm.tir.LE(x, y, span) - - -@register -def GT(x, y, span): # pylint: disable=invalid-name - return tvm.tir.GT(x, y, span) - - -@register -def GE(x, y, span): # pylint: disable=invalid-name - return tvm.tir.GE(x, y, span) - - -@register -def EQ(x, y, span): # pylint: disable=invalid-name - return tvm.tir.EQ(x, y, span) - - -@register -def NE(x, y, span): # pylint: disable=invalid-name - return tvm.tir.NE(x, y, span) - - -@register -def And(x, y, span): # pylint: disable=invalid-name - return tvm.tir.And(x, y, span) - - -@register -def Or(x, y, span): # pylint: disable=invalid-name - return tvm.tir.Or(x, y, span) - - -@register -def Cast(dtype, value, span): # pylint: disable=invalid-name - return tvm.tir.Cast(dtype, value, span) diff --git a/python/tvm/script/parser_v1/tir/node.py b/python/tvm/script/parser_v1/tir/node.py deleted file mode 100644 index 29e79607fbc9..000000000000 --- a/python/tvm/script/parser_v1/tir/node.py +++ /dev/null @@ -1,218 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=redefined-builtin -"""TVM Script nodes.""" - -from typing import Optional, Union, List, Callable -import synr -from tvm.arith import Analyzer -from tvm.runtime import ObjectGeneric, convert -from tvm.tir import PrimExpr, Buffer, BufferLoad, IntImm, Ramp, BufferRegion -from tvm.ir import Span, Range - - -class Slice: - """A helper class to present slice information for BufferSlice - - Parameters - ---------- - start : Union[PrimExpr, int] - The start index. - - stop : Optional[Union[PrimExpr, int]] - The stop index, None means the Slice is an element-wise index - - step : int - The slice step - - span : Optional[Span] - The location of the slice in the source. - """ - - start: Union[PrimExpr, int] - stop: Optional[Union[PrimExpr, int]] - step: int - span: Optional[Span] - - def __init__( - self, - start: Union[PrimExpr, int], - stop: Optional[Union[PrimExpr, int]] = None, - step: int = 1, - span: Optional[Span] = None, - ): - self.start = start - self.stop = stop - self.step = step - self.span = span - - def as_index_expr(self, report_error: Callable[[str, Union[Span, synr.ast.Span]], None]): - """Helper to create index PrimExpr from slice object - Parameters - ---------- - report_error: Callable[[str, Union[Span, synr.ast.Span]], None] - The error report func - """ - if self.stop is None: - # scalar index - return self.start - if self.step < 1: - report_error("Slice's step should be positive integer", self.span) - lanes = Analyzer().simplify((self.stop - self.start + self.step - 1) // self.step) - if not isinstance(lanes, (int, IntImm)): - report_error("Slice's lanes should be constant for buffer indices", self.span) - if lanes == 1: - return self.start - return Ramp(self.start, self.step, int(lanes), self.span) - - -class BufferSlice(ObjectGeneric): - """A generic object for representing general buffer access. Following cases are supported: - - element wise access buffer[i, j], which can be converted to BufferLoad if necessary - - slice access buffer[i: i + 1, j : j + 2] - - union of element and slice buffer[i, j: j + 2] - - This node is used in TVMScript to parse BufferLoad, BufferRegion and Realize - - Parameters - ---------- - buffer : Buffer - The buffer. - - indices : List[Union[Slice, PrimExpr, int]] - The access indexes can be slice, PrimExpr or int. - - report_error: Callable[[str, Union[Span, synr.ast.Span]], None] - The error report func - - span : Optional[Span] - The location of the buffer access in the source. 
- """ - - buffer: Buffer - slices: List[Slice] - report_error: Callable[[str, Union[Span, synr.ast.Span]], None] - span: Optional[Span] - - def __init__( - self, - buffer: Buffer, - indices: List[Union[Slice, PrimExpr, int]], - report_error: Callable[[str, Union[Span, synr.ast.Span]], None], - span: Optional[Span] = None, - ): - def check_index(index: Union[int, PrimExpr]): - """Check input index is non-negative integer or PrimExpr""" - if isinstance(index, int): - if index < 0: - report_error("Negative index is not allowed during buffer access", span) - elif isinstance(index, PrimExpr): - element_dtype = index.dtype.split("x", maxsplit=1)[0] - if element_dtype[:3] != "int": - report_error( - "index expected an integer type PrimExpr but got " + str(index.dtype), - index.span, - ) - else: - report_error( - "Unsupported index type, expected int or tvm.tir.PrimExpr, but got " - + str(type(index)), - span, - ) - - slices: List[Union[Slice, BufferSlice]] = [] - for index in indices: - if isinstance(index, Slice): - index.start, index.stop = [convert(_) for _ in [index.start, index.stop]] - check_index(index.start) - check_index(index.stop) - slices.append(index) - elif isinstance(index, (PrimExpr, int)): - check_index(index) - slices.append(Slice(index)) - elif isinstance(index, BufferSlice): - buffer_load = index.asobject() - check_index(buffer_load) - slices.append(Slice(buffer_load)) - else: - report_error( - "Unsupported index type for BufferSlice, " - + "expected int, tvm.tir.PrimExpr, tvm.tir.Slice, but got " - + str(type(index)), - span, - ) - - self.buffer = buffer - self.slices = slices - self.report_error = report_error - self.span = span - - def __str__(self): - regions: List[str] = [] - for s in self.slices: - if s.stop is None: - regions.append(str(s.start)) - else: - regions.append(str(s.start) + ": " + str(s.stop)) - - return self.buffer.name + "[" + ", ".join(regions) + "]" - - def asobject(self) -> BufferLoad: - """Convert object.""" - indices = [s.as_index_expr(self.report_error) for s in self.slices] - return BufferLoad(self.buffer, indices, span=self.span) - - def as_buffer_region(self, analyzer: Optional[Analyzer] = None) -> BufferRegion: - """Construct BufferRegion from BufferSlice - - Parameters - ---------- - analyzer : Optional[tvm.arith.Analyzer] - The analyzer for simplifying. If not provided, the method will construct a new one - - Returns - ------- - buffer_region : BufferRegion - The constructed BufferRegion. - """ - region: List[Range] = [] - for s in self.slices: - start = s.start if isinstance(s.start, PrimExpr) else IntImm("int32", s.start) - extent = IntImm(start.dtype, 1) if s.stop is None else s.stop - s.start - if not analyzer: - analyzer = Analyzer() - if isinstance(extent, PrimExpr): - extent = analyzer.simplify(extent) - if s.step != 1: - self.report_error("BufferRegion do not support non-trivial stride", s.span) - region.append(Range.from_min_extent(start, extent, span=s.span)) - return BufferRegion(self.buffer, region) - - def astype(self, dtype: str, span: Optional[Span] = None) -> PrimExpr: - return self.asobject().astype(dtype, span) - - @property - def dtype(self) -> str: - """Return the dtype referenced by the slice. - - Implemented as a property so that ``slice.dtype`` has the same - calling convention as ``primexpr.dtype``. This allows a - BufferSlice object can be assigned to a variable without - requiring a type annotation on the variable, similar to other - expressions. 
- """ - return self.asobject().dtype diff --git a/python/tvm/script/parser_v1/tir/prim_func.py b/python/tvm/script/parser_v1/tir/prim_func.py deleted file mode 100644 index 923eb97d2758..000000000000 --- a/python/tvm/script/parser_v1/tir/prim_func.py +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Interface for PrimFunc""" - -import inspect -from typing import Callable - -from tvm.tir.function import PrimFunc -from ..parser import from_source - - -def prim_func(input_func: Callable) -> PrimFunc: - """Decorate a python function as tvm script. - - Parameters - ---------- - func : input_func - The function to be parsed. - - Returns - ------- - output : PrimFunc - The result functions. - """ - if inspect.isfunction(input_func): - result = from_source(input_func) - result.__name__ = input_func.__name__ - result.__qualname__ = input_func.__qualname__ - return result - - raise TypeError("Only function definitions are supported.") diff --git a/python/tvm/script/parser_v1/tir/scope_handler.py b/python/tvm/script/parser_v1/tir/scope_handler.py deleted file mode 100644 index 69a414890655..000000000000 --- a/python/tvm/script/parser_v1/tir/scope_handler.py +++ /dev/null @@ -1,793 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""TVM Script Parser Scope Handler Classes""" -# pylint: disable=redefined-builtin, unused-argument, invalid-name, relative-beyond-top-level -from typing import Tuple, Any, Callable, Optional, List, Union, Mapping - -import synr -import numpy as np -import tvm.tir -from tvm.runtime import Object, String, convert -from tvm.ir import Span, Range -from tvm.tir import Stmt, PrimExpr, IterVar, Var, Buffer, BufferRegion, ForKind - -from .node import BufferSlice - -from ..context_maintainer import ContextMaintainer -from ..registry import register -from ..utils import ( - get_param_list, - tvm_span_from_synr, - call_with_error_reporting, -) - - -class ScopeHandler: - """Base class for all scope handlers""" - - def __init__(self, func: Callable): - self.func: Callable = func - self.body: Optional[Stmt] = None - self.node: Optional[synr.ast.Node] = None - self.context: Optional[ContextMaintainer] = None - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir." + self.func.__name__, get_param_list(self.func) - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - pass - - def exit_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - self.node = node - self.context = context - return call_with_error_reporting( - context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) - ) - - -class WithScopeHandler(ScopeHandler): - """Base class for all with scope handlers""" - - def __init__(self, func, concise_scope, def_symbol): - super().__init__(func) - self.concise_scope = concise_scope - self.def_symbol = def_symbol - - @staticmethod - def get_optional_vars(node, context): - """Get a list synr.ast.With's optional_vars""" - assert isinstance( - node, synr.ast.With - ), f"WithScopeHandler expected synr.ast.With but got {type(node)}" - - if isinstance(node.lhs, list): - for var in node.lhs: - if not isinstance(var, synr.ast.Var): - context.report_error( - f"Invalid optional var definition, expected Var but got {type(var)}", - node.span, - ) - vars = node.lhs - else: - context.report_error( - f"Invalid optional var definition, expected list of Var but got {type(node.lhs)}", - node.span, - ) - return vars - - -@register -class Allocate(WithScopeHandler): - """With scope handler T.allocate(extents, dtype, scope, condition, annotations)""" - - def __init__(self): - def allocate(extents, dtype, scope, condition=True, annotations=None, span=None): - condition = tvm.runtime.convert(condition) - scope = tvm.runtime.convert(scope) - - return tvm.tir.Allocate( - self.buffer_var, - dtype, - extents, - condition, - self.body, - annotations=annotations, - span=span, - ) - - super().__init__(allocate, concise_scope=True, def_symbol=True) - self.buffer_var = None - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - # define buffer vars in symbol table - if isinstance(node, synr.ast.With): - vars = WithScopeHandler.get_optional_vars(node, context) - if len(vars) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. {len(vars)}", node.span) - name = vars[0].id.name - var_span = vars[0].id.span - elif isinstance(node, synr.ast.Assign): - if len(node.lhs) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. 
{len(node.lhs)}", node.span) - name = node.lhs[0].id.name - var_span = node.lhs[0].id.span - else: - raise Exception("Internal Bug") - - def setup_buffer_var( - extents, dtype, scope, condition=True, annotations=None, span: Span = None - ): - """Setup buffer var for a given type.""" - buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype), scope) - self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - - setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span)) - context.update_symbol(name, self.buffer_var, node) - - -@register -class AllocateConst(WithScopeHandler): - """With scope handler T.allocate_const(data, extents, dtype, condition) - - TIR constant node to represent non-scalar constant - """ - - def __init__(self): - def allocate_const(raw_data, dtype, shape, annotations=None, span=None): - list_data = [] - for i in raw_data: - list_data.append(i.value) - nd_data = tvm.nd.array(np.asarray(list_data, dtype=dtype)) - n = tvm.tir.AllocateConst( - self.buffer_var, - dtype, - shape, - nd_data, - self.body, - annotations=annotations, - span=span, - ) - return n - - super().__init__(allocate_const, concise_scope=True, def_symbol=True) - self.buffer_var = None - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - # define buffer vars in symbol table - if isinstance(node, synr.ast.With): - vars = WithScopeHandler.get_optional_vars(node, context) - if len(vars) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. {len(vars)}", node.span) - name = vars[0].id.name - var_span = vars[0].id.span - elif isinstance(node, synr.ast.Assign): - if len(node.lhs) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. {len(node.lhs)}", node.span) - name = node.lhs[0].id.name - var_span = node.lhs[0].id.span - else: - raise Exception("Internal Bug") - - def setup_buffer_var(data, dtype, shape, annotations: dict = None, span: Span = None): - """Setup buffer var for a given type.""" - buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype)) - self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - - setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span)) - context.update_symbol(name, self.buffer_var, node) - - -@register -class DeclBuffer(WithScopeHandler): - """Special Stmt decl_buffer(shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type, axis_separators) - Example - ------- - .. code-block:: python - A = T.decl_buffer((128, 128), dtype="float32") - """ - - def __init__(self): - def decl_buffer( - shape, - dtype="float32", - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - decl_buffer = tvm.tir.DeclBuffer(self.buffer, self.body, span=span) - if data is None: - # when data is not specified, the buffer is implicitly allocated - return tvm.tir.Allocate( - self.buffer.data, - dtype, - shape, - tvm.runtime.convert(True), - decl_buffer, - span=span, - ) - return decl_buffer - - super().__init__(decl_buffer, concise_scope=True, def_symbol=True) - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - # define buffer vars in symbol table - if isinstance(node, synr.ast.With): - vars = WithScopeHandler.get_optional_vars(node, context) - if len(vars) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. 
{len(vars)}", node.span) - name = vars[0].id.name - var_span = vars[0].id.span - elif isinstance(node, synr.ast.Assign): - if len(node.lhs) != 1: - context.report_error(f"Unexpected number of vars: 1 vs. {len(node.lhs)}", node.span) - name = node.lhs[0].id.name - var_span = node.lhs[0].id.span - else: - raise Exception("Internal Bug") - - def setup_buffer( - shape, - dtype, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span: Span = None, - ): - self.buffer = tvm.tir.decl_buffer( - shape=shape, - dtype=dtype, - data=data, - strides=strides, - elem_offset=elem_offset, - scope=scope, - data_alignment=align, - offset_factor=offset_factor, - buffer_type=buffer_type, - axis_separators=axis_separators, - name=name, - span=span, - ) - - setup_buffer(*arg_list, span=tvm_span_from_synr(var_span)) - context.update_symbol(name, self.buffer, node) - - -@register -class LaunchThread(WithScopeHandler): - """With scope handler T.launch_thread(env_var, extent)""" - - def __init__(self): - def launch_thread(env_var, extent, span): - extent = tvm.runtime.convert(extent, span=span) - thread_id = self.context.func_var_env_dict[env_var] - attr_key = "virtual_thread" if thread_id == "vthread" else "thread_extent" - return tvm.tir.AttrStmt( - IterVar( - (0, extent), - env_var, - getattr(IterVar, "ThreadIndex"), - thread_id, - span=span, - ), - attr_key, - extent, - self.body, - span=span, - ) - - super().__init__(launch_thread, concise_scope=True, def_symbol=False) - - -@register -class Realize(WithScopeHandler): - """With scope handler T.realize(buffer_bounds, scope, condition)""" - - def __init__(self): - def realize( - buffer_slice: BufferSlice, scope: str, condition: bool = True, span: bool = None - ): - assert self.context, "call 'exit_scope' before 'enter_scope'" - buffer: Buffer = buffer_slice.buffer - bounds: List[Range] = [] - for s in buffer_slice.slices: - min: Union[PrimExpr, int] = s.start - extent: Union[PrimExpr, int] = 1 if s.stop is None else s.stop - s.start - if isinstance(extent, PrimExpr): - extent = self.context.analyzer.simplify(extent) - bounds.append(Range.from_min_extent(min, extent, span=s.span)) - - scope = tvm.runtime.convert(scope, span=span) - return tvm.tir.AttrStmt( - buffer, - "realize_scope", - scope, - tvm.tir.BufferRealize(buffer, bounds, condition, self.body, span=span), - span=span, - ) - - super().__init__(realize, concise_scope=True, def_symbol=False) - - -@register -class Attr(WithScopeHandler): - """With scope handler T.attr(attr_node, attr_key, value)""" - - def __init__(self): - def attr(attr_node, attr_key, value, span): - attr_node = tvm.runtime.convert(attr_node, span=span) - value = tvm.runtime.convert(value, span=span) - return tvm.tir.AttrStmt(attr_node, attr_key, value, self.body, span=span) - - super().__init__(attr, concise_scope=True, def_symbol=False) - - -@register -class AssertHandler(WithScopeHandler): - """With scope handler T.Assert(condition, message)""" - - def __init__(self): - def Assert(condition, message, span): - return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), self.body, span=span) - - super().__init__(Assert, concise_scope=True, def_symbol=False) - - -@register -class Let(WithScopeHandler): - """With scope handler T.let(var, value)""" - - def __init__(self): - def let(var, value, span): - return tvm.tir.LetStmt(var, value, self.body, span=span) - - super().__init__(let, concise_scope=False, def_symbol=False) - - def __call__(self, var: tvm.tir.Var, value: 
tvm.tir.PrimExpr, body: tvm.tir.PrimExpr): - return tvm.tir.Let(var, value, body) - - -@register -class Block(WithScopeHandler): - """With scope handler T.block(name)""" - - def __init__(self): - def block(name_hint: str = "", span: Optional[Span] = None): - assert ( - self.node and self.context and self.body - ), "call 'exit_scope' before 'enter_scope'" - block_info = self.context.block_info_stack[-1] - - # create block read/write regions - reads: List[BufferRegion] = ( - [read.as_buffer_region() for read in block_info.reads] if block_info.reads else [] - ) - writes: List[BufferRegion] = ( - [write.as_buffer_region() for write in block_info.writes] - if block_info.writes - else [] - ) - - region_detect_mask: int = (block_info.reads is None) | ( - (block_info.writes is None) << 1 - ) - annotations = {} if block_info.annotations is None else block_info.annotations - if region_detect_mask != 0: - annotations["tir.script_parsing_detect_access"] = region_detect_mask - inner = tvm.tir.Block( - block_info.iter_vars, - reads, - writes, - name_hint, - self.body, - block_info.init, - block_info.alloc_buffers, - block_info.match_buffers, - annotations, - span, - ) - assert len(block_info.iter_vars) == len(block_info.iter_values) - predicate = ( - tvm.tir.const(True, "bool") - if block_info.predicate is None - else block_info.predicate - ) - body = tvm.tir.BlockRealize(block_info.iter_values, predicate, inner, span) - return body - - super().__init__(func=block, concise_scope=False, def_symbol=True) - self.block_vars = None - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - # define block vars - assert isinstance( - node, synr.ast.With - ), f"BlockScopeHandler expected to work on synr.ast.With but got {type(node)}" - - optional_vars = [var.id.name for var in WithScopeHandler.get_optional_vars(node, context)] - if optional_vars: - context.report_error( - f"Block expected no optional_vars (e.g., `x` in `with block() as x`), " - f"but got {optional_vars}", - node.span, - ) - - -@register -class InitBlock(WithScopeHandler): - """With scope handler T.init()""" - - def __init__(self): - def init(span: Span = None): - assert self.context, "call 'exit_scope' before 'enter_scope'" - if self.context.block_info_stack[-2].init is not None: - self.context.report_error("Duplicate init block declaration", span) - self.context.block_info_stack[-2].init = self.body - - super().__init__(func=init, concise_scope=False, def_symbol=True) - - -class LoopInfo: - """Helper class for loop information""" - - loop_var: Var - begin: PrimExpr - extent: PrimExpr - kind: ForKind - thread_binding: Optional[str] - annotations: Optional[Mapping[str, Object]] - - def __init__( - self, - begin: PrimExpr, - extent: PrimExpr, - kind: ForKind, - thread_binding: Optional[str] = None, - annotations: Optional[Mapping[str, Object]] = None, - ) -> None: - self.begin = begin - self.extent = extent - self.kind = kind - self.thread_binding = thread_binding - self.annotations = annotations - - -class ForScopeHandler(ScopeHandler): - """Base class for all for scope handlers""" - - def __init__(self, func): - super().__init__(func) - self.loop_vars: List[Var] = [] - self.loop_info: List[LoopInfo] = [] - - def enter_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - assert isinstance( - node, synr.ast.For - ), f"ForScopeHandler expected synr.ast.For but got {type(node)}" - - loop_var_names = list() 
- spans = list() - if isinstance(node.lhs, synr.ast.Var): - loop_var_names.append(node.lhs.id.name) - spans.append(tvm_span_from_synr(node.lhs.id.span)) - elif isinstance(node.lhs, list): - for elt in node.lhs: - if not isinstance(elt, synr.ast.Var): - context.report_error( - f"Invalid loop var. Expected a var, but got {type(elt)}", elt.span - ) - loop_var_names.append(elt.id.name) - spans.append(tvm_span_from_synr(elt.id.span)) - else: - context.report_error( - f"Invalid loop var. Expected var or list of vars as lhs, but got {type(node.lhs)}", - span, - ) - - self.node = node - self.context = context - # collect loop infos by calling self.func - call_with_error_reporting(context.report_error, span, self.func, *arg_list) - if len(loop_var_names) != len(self.loop_info): - self.context.report_error( - f"Inconsistent number of vars and loops, got {len(loop_var_names)} " - + f"vs {len(self.loop_info)}", - self.node.span, - ) - # generate loop vars - self.loop_vars = [] - for name, lv_span, li in zip(loop_var_names, spans, self.loop_info): - if not li.begin.dtype.startswith("int"): - raise NotImplementedError(f"Unsupported dtype in loop begin: {li.begin.dtype}") - if not li.extent.dtype.startswith("int"): - raise NotImplementedError(f"Unsupported dtype in loop extent: {li.extent.dtype}") - dtype = "int64" if "int64" in [li.begin.dtype, li.extent.dtype] else "int32" - self.loop_vars.append(tvm.te.var(name, dtype=dtype, span=lv_span)) - - for loop_var, loop_info in zip(self.loop_vars, self.loop_info): - context.update_symbol(loop_var.name, loop_var, node) - context.loop_stack[loop_var] = Range.from_min_extent(loop_info.begin, loop_info.extent) - - def exit_scope( - self, - node: synr.ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - assert self.loop_vars, "call 'exit_scope' before 'enter_scope'" - for loop_var in self.loop_vars: - context.loop_stack.pop(loop_var) - # Use assert here since we have check it in `enter_scope` - assert len(self.loop_vars) == len(self.loop_info) - - body = self.body - for var, info in zip(reversed(self.loop_vars), reversed(self.loop_info)): - body = tvm.tir.For( - var, - info.begin, - info.extent, - info.kind, - body, - info.thread_binding, - info.annotations, - span=tvm_span_from_synr(span), - ) - - return body - - def create_loop_info( - self, - begin: Optional[PrimExpr], - end: PrimExpr, - kind: ForKind, - thread_binding: Optional[str] = None, - annotations: Optional[Mapping[str, Object]] = None, - ) -> None: - """ - Helper function for creating For in TVM Script parser. - - Parameters - ---------- - begin : Optional[PrimExpr] - The beginning value. If None, it will be set to 0. - - end : PrimExpr - The endding value. - - kind : ForKind - The type of the for. - - thread_binding: Optional[str] - The thread this loop binds to. - - annotations : Optional[Mapping[str, Object]] - Additional annotation hints. - - span : Optional[Span] - The location of this for in the source code. - - Returns - ------- - for : For - The constructed For. 
- """ - end = convert(end) - if begin is None: - begin = tvm.tir.const(0, end.dtype) - else: - begin = convert(begin) - assert self.context and self.node, "call 'exit_scope' before 'enter_scope'" - extent = ( - end - if self.context.analyzer.can_prove_equal(begin, 0) - else self.context.analyzer.simplify(end - begin) - ) - self.annotations: Mapping[str, Object] = {} - if annotations is not None: - self.annotations = { - key: String(val) if isinstance(val, str) else val - for key, val in annotations.items() - } - - self.loop_info.append(LoopInfo(begin, extent, kind, thread_binding, annotations)) - - -@register -class Serial(ForScopeHandler): - """For scope handler T.serial(begin, end, annotations)""" - - def __init__(self): - def serial( - begin: PrimExpr, - end: PrimExpr = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if end is None: - end, begin = begin, end - self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations) - - super().__init__(serial) - - -@register -class Parallel(ForScopeHandler): - """For scope handler T.parallel(begin, end, annotations)""" - - def __init__(self): - def parallel( - begin: PrimExpr, - end: PrimExpr = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if end is None: - end, begin = begin, end - self.create_loop_info(begin, end, ForKind.PARALLEL, annotations=annotations) - - super().__init__(parallel) - - -@register -class Vectorized(ForScopeHandler): - """For scope handler T.vectorized(begin, end, annotations)""" - - def __init__(self): - def vectorized( - begin: PrimExpr, - end: PrimExpr = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if end is None: - end, begin = begin, end - self.create_loop_info(begin, end, ForKind.VECTORIZED, annotations=annotations) - - super().__init__(vectorized) - - -@register -class Unroll(ForScopeHandler): - """For scope handler T.unroll(begin, end, annotations)""" - - def __init__(self): - def unroll( - begin: PrimExpr, - end: PrimExpr = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if end is None: - end, begin = begin, end - self.create_loop_info(begin, end, ForKind.UNROLLED, annotations=annotations) - - super().__init__(unroll) - - -@register -class ThreadBinding(ForScopeHandler): - """For scope handler T.thread_binding(begin, end, thread, annotations)""" - - def __init__(self): - def thread_binding( - begin: PrimExpr, - end: PrimExpr = None, - thread: str = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if thread is None: - if isinstance(end, str): # handle case like thread_binding(128, "threadIdx.x") - thread = end - end = None - else: - raise ValueError("Thread cannot be None for thread_binding") - if end is None: - end, begin = begin, end - thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread) - self.create_loop_info( - begin, - end, - ForKind.THREAD_BINDING, - thread_binding=thread_iter_var, - annotations=annotations, - ) - - super().__init__(thread_binding) - - -@register -class RangeHandler(ForScopeHandler): - """For scope handler range(begin, end, annotations) - Note that tir.range is totally the same as T.serial - """ - - def __init__(self): - def for_range( - begin: PrimExpr, - end: PrimExpr = None, - annotations: Optional[Mapping[str, Object]] = None, - ): - if end is None: - end, begin = begin, end - self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations) - - super().__init__(for_range) - - def signature(self): - return "range", get_param_list(self.func) - - -@register 
-class Grid(ForScopeHandler): - """For scope handler T.grid(extents)""" - - def __init__(self): - def grid(*extents: List[PrimExpr]): - for extent in extents: - self.create_loop_info(None, extent, ForKind.SERIAL) - - super().__init__(grid) diff --git a/python/tvm/script/parser_v1/tir/special_stmt.py b/python/tvm/script/parser_v1/tir/special_stmt.py deleted file mode 100644 index f558eb6b7f73..000000000000 --- a/python/tvm/script/parser_v1/tir/special_stmt.py +++ /dev/null @@ -1,927 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Parser Special Stmt Classes""" -# pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements -# pylint: disable=relative-beyond-top-level -from typing import Callable, List, Optional, Tuple, Any, Mapping, Union - -import synr -from synr import ast -from tvm.ir.expr import PrimExpr, Range - -import tvm.tir -from tvm.runtime import Object, String -from tvm.target import Target -from tvm.ir import Span -from tvm.tir import IntImm, IterVar, Var - -from .node import BufferSlice - -from ..context_maintainer import BlockInfo, ContextMaintainer -from ..registry import register -from ..utils import ( - get_param_list, - tvm_span_from_synr, - call_with_error_reporting, -) - - -def convert_to_int( - value: Union[IntImm, int], - arg_name: str, - report_error: Callable, - span: Union[Span, synr.ast.Span], -) -> int: - """convert a const int or TVM IntImm to Python int. - Reports an error when input cannot be converted to int. - - Parameters - ---------- - value : Union[tvm.tir.IntImm, int] - The input value to be converted. - arg_name : str - Function argument name for error reporting. - report_error: Callable - The report error function handle - span : Union[synr.ast.Span, tvm.ir.Span] - Location of the error - """ - if isinstance(value, IntImm): - return value.value - if isinstance(value, int): - return value - report_error( - f"Expected int or IntImm for {arg_name}, but got {str(type(value))}", - span, - ) - - -class SpecialStmt: - """Base class for all Special Stmts""" - - def __init__(self, func: Callable, def_symbol: bool): - self.func: Callable = func - self.def_symbol: bool = def_symbol - self.node: Optional[synr.ast.Node] = None - self.context: Optional[ContextMaintainer] = None - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir." 
+ self.func.__name__, get_param_list(self.func) - - def handle( - self, - node: ast.Node, - context: ContextMaintainer, - arg_list: List[Any], - span: synr.ast.Span, - ): - self.node = node - self.context = context - return call_with_error_reporting( - context.report_error, span, self.func, *arg_list, span=tvm_span_from_synr(span) - ) - - -@register -class MatchBuffer(SpecialStmt): - """Special Stmt match_buffer(param, shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type, axis_separators) - - Note - ---- - This Special Stmt will perform different behavior depends on the type of param. - If the param is a var in function parameter, it will create a buffer from DLTensor. - Else if the param is a subregion of other buffers, then create a subregion match inside a block. - - Example - ------- - Match buffer from function parameter - .. code-block:: python - A = T.match_buffer(a, (128, 128), dtype="float32") - - Match buffer from Buffer subregion - .. code-block:: python - A = T.match_buffer(B[0:128, i * 128 : i * 128 + 128], (128, 128), dtype="float32") - """ - - def __init__(self): - def match_buffer( - param, - shape=None, - dtype=None, - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`match_buffer` must be assigned to a single buffer, " - "e.g. A = match_buffer(...)", - self.node.span, - ) - if strides is None: - strides = [] - align = convert_to_int(align, "align", self.context.report_error, self.node.span) - offset_factor = convert_to_int( - offset_factor, "offset_factor", self.context.report_error, self.node.span - ) - buffer_name: str = self.node.lhs[0].id.name - - if isinstance(param, tvm.tir.Var): - if shape is None: - self.context.report_error( - "Shape must be specified when binding input param", - self.node.rhs.span, - ) - - if dtype is None: - dtype = "float32" - - buffer = tvm.tir.decl_buffer( - shape, - dtype, - buffer_name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) - if param not in self.context.func_params: - self.context.report_error( - "Can not bind non-input param to buffer", self.node.rhs.params[0].span - ) - self.context.func_buffer_map[param] = buffer - - elif isinstance(param, BufferSlice): - buffer_region = param.as_buffer_region() - - if shape is None: - shape = [dim.extent for dim in buffer_region.region] - - if dtype is None: - dtype = buffer_region.buffer.dtype - - if elem_offset is None and offset_factor == 0: - offset_factor = 1 - - buffer = tvm.tir.decl_buffer( - shape, - dtype, - buffer_name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) - - self.context.current_block_scope().match_buffers.append( - tvm.tir.MatchBufferRegion(buffer, buffer_region) - ) - else: - self.context.report_error( - "The source of match_buffer expected Var or BufferSlice, but got " - + str(type(param)), - self.node.rhs.params[0].span, - ) - self.context.update_symbol(buffer_name, buffer, self.node) - - super().__init__(match_buffer, def_symbol=True) - - -@register -class BufferDeclare(SpecialStmt): - """Special Stmt buffer_decl(shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type, axis_separators) - Example - ------- - .. 
code-block:: python - A = T.buffer_decl((128, 128), dtype="float32") - """ - - def __init__(self): - def buffer_decl( - shape, - dtype="float32", - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`buffer_decl` must be assigned to a single buffer, e.g. A = buffer_decl(...)", - self.node.span, - ) - - if strides is None: - strides = [] - align = convert_to_int(align, "align", self.context.report_error, self.node.span) - offset_factor = convert_to_int( - offset_factor, "offset_factor", self.context.report_error, self.node.span - ) - buffer_name: str = self.node.lhs[0].id.name - buffer = tvm.tir.decl_buffer( - shape, - dtype, - buffer_name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) - self.context.update_symbol(buffer_name, buffer, self.node) - return buffer - - super().__init__(buffer_decl, def_symbol=True) - - -@register -class AllocBuffer(SpecialStmt): - """Special function alloc_buffer(shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type, axis_separators) - - Example - ------- - .. code-block:: python - - A = T.alloc_buffer((128, 128), dtype="float32") - """ - - def __init__(self): - def alloc_buffer( - shape, - dtype="float32", - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`alloc_buffer` must be assigned to a single buffer, " - "e.g. A = alloc_buffer(...)", - self.node.span, - ) - - if strides is None: - strides = [] - align = convert_to_int(align, "align", self.context.report_error, self.node.span) - offset_factor = convert_to_int( - offset_factor, "offset_factor", self.context.report_error, self.node.span - ) - buffer_name: str = self.node.lhs[0].id.name - buffer = tvm.tir.decl_buffer( - shape, - dtype, - buffer_name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) - if self.context.current_block_scope(): - self.context.current_block_scope().alloc_buffers.append(buffer) - else: - # If it is allocated outside all blocks, allocate it under root block. - self.context.root_alloc_buffers.append(buffer) - self.context.update_symbol(buffer_name, buffer, self.node) - - super().__init__(alloc_buffer, def_symbol=True) - - -@register -class BlockReads(SpecialStmt): - """Special function reads([read_regions], *other_regions) - - Note - ---- - *other_region is an unpackable list of BufferSlice to support - reads syntax sugar like reads(BufferRegion1, BufferRegion2, ...) - - Example - ------- - .. 
code-block:: python - - T.reads([A[vi: vi + 4, vk: vk + 4], B[vk: vk + 4, vj]]) - """ - - def __init__(self): - def reads( - *read_regions: Union[BufferSlice, List[BufferSlice]], - span: Span = None, - ): - assert self.context, "call 'exit_scope' before 'enter_scope'" - block_scope = self.context.current_block_scope() - if block_scope is None: - self.context.report_error( - "Expected to declare read regions inside a block.", - span, - ) - if block_scope.reads is not None: - self.context.report_error( - "Duplicate write region declaration, " - + "previous one is " - + str(", ".join(str(x) for x in block_scope.reads)), - span, - ) - if len(read_regions) > 1: - for read_region in read_regions: - if not isinstance(read_region, BufferSlice): - self.context.report_error( - "Incorrect input type. Expected *BufferSlice or List[BufferSlice]," - + f" but got {type(read_regions)}", - span, - ) - elif len(read_regions) == 1: - if isinstance(read_regions[0], list): - read_regions = read_regions[0] - - block_scope.reads = read_regions - - super().__init__(reads, def_symbol=False) - - -@register -class BlockWrites(SpecialStmt): - """Special function writes([write_regions], *other_regions) - - Note - ---- - *other_region is an unpackable list of BufferSlice to support - writes syntax sugar like writes(BufferRegion1, BufferRegion2, ...) - - Example - ------- - .. code-block:: python - - T.writes([C[vi: vi + 4, vj]) - """ - - def __init__(self): - def writes( - *write_regions: Union[BufferSlice, List[BufferSlice]], - span: Span = None, - ): - assert self.context, "call 'exit_scope' before 'enter_scope'" - block_scope = self.context.current_block_scope() - if block_scope is None: - self.context.report_error( - "Expected to declare write regions inside a block.", - span, - ) - if block_scope.writes is not None: - self.context.report_error( - "Duplicate write region declaration, " - + "previous one is " - + str(", ".join(str(x) for x in block_scope.writes)), - span, - ) - if len(write_regions) > 1: - for write_region in write_regions: - if not isinstance(write_region, BufferSlice): - self.context.report_error( - "Incorrect input type. Expected *BufferSlice or List[BufferSlice]," - + f" but got {type(write_regions)}", - span, - ) - elif len(write_regions) == 1: - if isinstance(write_regions[0], list): - write_regions = write_regions[0] - block_scope.writes = write_regions - - super().__init__(writes, def_symbol=False) - - -@register -class BlockAttr(SpecialStmt): - """Special function block_attr({attr_key: attr_value}) - - Example - ------- - .. code-block:: python - - T.block_attr({"double_buffer_scope": 1}) - """ - - def __init__(self): - def block_attr(attrs: Mapping[str, Object], span: Span = None): - assert self.context, "call 'exit_scope' before 'enter_scope'" - block_scope = self.context.current_block_scope() - if block_scope is None: - self.context.report_error( - "Expected to declare block annotations inside a block.", - span, - ) - if block_scope.annotations is not None: - self.context.report_error( - "Duplicate block annotations declaration, " - + "previous one is " - + str(block_scope.annotations), - span, - ) - attrs = { - key: String(val) if isinstance(val, str) else val for key, val in attrs.items() - } - block_scope.annotations = attrs - - super().__init__(block_attr, def_symbol=False) - - -class BlockAxis(SpecialStmt): - """Special stmt for defining a spatial block axis - axis.S(dom, iter_value) - - Example - ------- - .. 
code-block:: python - - vi = T.axis.S(128, i * 4 + j) - """ - - def axis( - self, - var_name: str, - dom: Union[PrimExpr, Range], - value: PrimExpr, - iter_type: int, - span: Optional[Span] = None, - ) -> None: - """ - Helper function for creating block axis - - Parameters - ---------- - var_name : str - The name_hint of var - - dom : Union[PrimExpr, Range] - The iter domain. - - value : PrimExpr - The binding value - - iter_type : int - The iteration type. - - span : Optional[Span] - The location of this for in the source code. - """ - assert self.context, "call 'exit_scope' before 'enter_scope'" - block_scope: BlockInfo = self.context.current_block_scope() - if block_scope is None: - self.context.report_error( - "Expected to declare block axes inside a block.", - self.node.span, - ) - if var_name in [iter_var.var.name for iter_var in block_scope.iter_vars]: - self.context.report_error("Duplicate block axis " + var_name, self.node.span) - - dom = tvm.runtime.convert(dom) - if isinstance(dom, PrimExpr): - dom = tvm.ir.Range(dom) - elif isinstance(dom, tvm.ir.container.Array) and len(dom) == 2: - dom = tvm.ir.Range(dom[0], dom[1]) - elif not isinstance(dom, tvm.ir.Range): - self.context.report_error( - f"Block axis domain expected PrimExpr or Range, but got {type(dom)}", - self.node.span, - ) - block_var = tvm.tir.Var(var_name, dtype=dom.extent.dtype) - value = tvm.runtime.convert(value) - if not isinstance(value, PrimExpr): - self.context.report_error( - f"Block axis value expected PrimExpr, but got {type(value)}", - self.node.span, - ) - iter_var = tvm.tir.IterVar(dom, block_var, iter_type) - block_scope.iter_vars.append(iter_var) - block_scope.iter_values.append(value) - self.context.update_symbol(var_name, block_var, self.node) - - -@register -class BlockAxisSpatial(BlockAxis): - """Special stmt for defining a spatial block axis - axis.spatial(dom, iter_value) - - Example - ------- - .. code-block:: python - - vi = T.axis.spatial(128, k) - """ - - def __init__(self): - def axis_spatial( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.spatial` must be assigned to a var, e.g. vi = axis.spatial(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DataPar) - - super().__init__(axis_spatial, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.spatial", get_param_list(self.func) - - -@register -class BlockAxisS(BlockAxis): - """The sugar special stmt for defining a spatial block axis - axis.S(dom, iter_value) - - Example - ------- - .. code-block:: python - - vi = T.axis.S(128, k) - """ - - def __init__(self): - def axis_spatial( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.S` must be assigned to a var, e.g. vi = axis.S(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DataPar) - - super().__init__(axis_spatial, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.S", get_param_list(self.func) - - -@register -class BlockAxisReduce(BlockAxis): - """Special stmt for defining a reduce block axis - axis.reduce(dom, iter_value) - - Example - ------- - .. 
code-block:: python - - vi = T.axis.reduce(128, k) - """ - - def __init__(self): - def axis_reduce( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.reduce` must be assigned` to a var, e.g. vi = axis.reduce(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.CommReduce) - - super().__init__(axis_reduce, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.reduce", get_param_list(self.func) - - -@register -class BlockAxisR(BlockAxis): - """The sugar special stmt for defining a reduce block axis - axis.R(dom, iter_value) - - Example - ------- - .. code-block:: python - - vi = T.axis.R(128, k) - """ - - def __init__(self): - def axis_reduce( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.R` must be assigned to a var, e.g. vi = axis.R(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.CommReduce) - - super().__init__(axis_reduce, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.R", get_param_list(self.func) - - -@register -class BlockAxisScan(BlockAxis): - """Special stmt for defining a ordered block axis - axis.scan(dom, iter_value) - - Example - ------- - .. code-block:: python - - vi = T.axis.scan(128, k) - """ - - def __init__(self): - def axis_scan( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.scan` must be assigned to a var, e.g. vi = axis.scan(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.Ordered) - - super().__init__(axis_scan, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.scan", get_param_list(self.func) - - -@register -class BlockAxisOpaque(BlockAxis): - """Special stmt for defining a opaque block axis - axis.opaque(dom, iter_value) - - Example - ------- - .. code-block:: python - - vi = T.axis.opaque(128, k) - """ - - def __init__(self): - def axis_opaque( - dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None - ): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: - self.context.report_error( - "`axis.opaque` must be assigned to a var, e.g. vi = axis.opaque(...)", - self.node.span, - ) - self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DimInfo) - - super().__init__(axis_opaque, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.opaque", get_param_list(self.func) - - -@register -class BlockAxisRemap(BlockAxis): - """Special stmt for remapping loops vars to block axes. - axis.remap(iter_type, iter_value) - - Note - ---- - Iter_type is a string consisting of 'S' and 'R', where 'S' means - for spatial and 'R' means for reduce. - - Example - ------- - .. 
code-block:: python - - vi, vj = T.axis.remap("SS", [i, j]) - """ - - def __init__(self): - def axis_remap(iter_types: str, loop_vars: List[tvm.tir.expr.Var], span: Span = None): - if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) >= 1: - self.context.report_error( - "`axis.remap` must be assigned to one or more vars, " - "e.g. vi, vj = axis.remap(...)", - self.node.span, - ) - var_num: int = len(self.node.lhs) - if var_num != len(iter_types): - self.context.report_error( - f"`iter_type` expected {var_num} charactor(s), " - f"but got {len(iter_types)}: {iter_types}", - span, - ) - if var_num != len(loop_vars): - self.context.report_error( - f"`iter_type` expected {var_num} loop var(s), " - f"but got {len(loop_vars)}: {loop_vars}", - span, - ) - for var, iter_ty, loop_var in zip(self.node.lhs, iter_types, loop_vars): - iter_type: int - if iter_ty == "S": - iter_type = IterVar.DataPar - elif iter_ty == "R": - iter_type = IterVar.CommReduce - else: - self.context.report_error( - f'`iter_type` only expected "S" (for spatial) or "R" (for reduce), ' - f'but got "{iter_ty}"', - span, - ) - - if not isinstance(loop_var, tvm.tir.expr.Var): - self.context.report_error( - f"Values of `axis.remap` expected single loop var, but got {loop_var}", - loop_var.span, - ) - loops = self.context.loop_stack - if loop_var not in loops: - self.context.report_error( - f"Cannot find loop var {loop_var} in loop nesting.", - span, - ) - self.axis(var.id.name, loops[loop_var], loop_var, iter_type) - - super().__init__(axis_remap, def_symbol=True) - - def signature(self) -> Tuple[str, Tuple[list, list, Any]]: - return "tir.axis.remap", get_param_list(self.func) - - -@register -class BlockPredicate(SpecialStmt): - """Special function where(predicate) - - Example - ------- - .. 
code-block:: python - - T.where(i < 4) - """ - - def __init__(self): - def where(predicate, span=None): - assert self.context, "call 'exit_scope' before 'enter_scope'" - block_scope = self.context.current_block_scope() - if block_scope is None: - self.context.report_error( - "Expected to declare the predicate inside a block.", - span, - ) - if block_scope.predicate is not None: - self.context.report_error( - "Duplicate block predicate declaration, " - + "previous one is " - + str(block_scope.predicate), - span, - ) - - block_scope.predicate = predicate - - super().__init__(where, def_symbol=False) - - -@register -class VarDef(SpecialStmt): - """Special function for defining a Var""" - - def __init__(self): - def var(dtype, span): - assert isinstance( - self.node, ast.Assign - ), f"VarDef expected ast.Assign but got {type(self.node)}" - names = [x.id.name for x in self.node.lhs] - if len(names) != 1: - self.context.report_error( - f"VarDef expected assign to only one var, but got {names}", span - ) - v = Var(names[0], dtype, span=span) - self.context.update_symbol(v.name, v, self.node) - - super().__init__(var, def_symbol=True) - - -@register -class BufferVarDef(SpecialStmt): - """Special function for defining a variable of pointer type""" - - def __init__(self): - def buffer_var(dtype, storage_scope, span): - assert isinstance( - self.node, ast.Assign - ), f"BufferVarDef expected ast.Assign but got {type(self.node)}" - names = [x.id.name for x in self.node.lhs] - if len(names) != 1: - self.context.report_error( - f"VarDef expected assign to only one var, but got {names}", span - ) - ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype), storage_scope) - v = Var(names[0], ptr_type, span=span) - self.context.update_symbol(v.name, v, self.node) - - super().__init__(buffer_var, def_symbol=True) - - -@register -class EnvThread(SpecialStmt): - """Bind a var to thread env""" - - def __init__(self): - def env_thread(env_name, span): - assert isinstance( - self.node, ast.Assign - ), f"EnvThread expected ast.Assign but got {type(self.node)}" - names = [x.id.name for x in self.node.lhs] - if len(names) != 1: - self.context.report_error( - f"VarDef expected assign to only one var, but got {names}", span - ) - v = Var(names[0], dtype="int32", span=span) - self.context.func_var_env_dict[v] = env_name - self.context.update_symbol(v.name, v, self.node) - - super().__init__(env_thread, def_symbol=True) - - -@register -class FuncAttr(SpecialStmt): - """Special Stmt for declaring the DictAttr of PrimFunc - Example - ------- - .. code-block:: python - T.func_attr({"tir.noalias": True, "global_symbol"}) - """ - - def __init__(self): - def func_attr(dict_attr, span): - self.context.func_dict_attr = dict_attr - - super().__init__(func_attr, def_symbol=False) - - -@register -class TargetAttrValue(SpecialStmt): - """Special Stmt for target attr value. - Example - ------- - .. 
code-block:: python - T.target("llvm") - """ - - def __init__(self): - def target(*args, span): - self.context.report_error(f"T.target should not appear as a stmt", span) - - super().__init__(target, def_symbol=False) - - def __call__(self, target_config): - if not isinstance(target_config, (str, dict)): - raise ValueError( - f"T.target expected a config dict or string, but got {type(target_config)}" - ) - return Target(target_config) diff --git a/python/tvm/script/parser_v1/tir/ty.py b/python/tvm/script/parser_v1/tir/ty.py deleted file mode 100644 index b17b571e88e7..000000000000 --- a/python/tvm/script/parser_v1/tir/ty.py +++ /dev/null @@ -1,226 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""TVM Script Parser Typing Class for TIR - -This module provides typing class for TVM script type annotation usage, it can be viewed as -a wrapper for uniform Type system in IR -""" -# pylint: disable=invalid-name -from numbers import Integral - -import tvm -from .special_stmt import SpecialStmt, convert_to_int - - -class TypeGeneric: # pylint: disable=too-few-public-methods - """Base class for all the TVM script typing class""" - - def evaluate(self): - """Return an actual ir.Type Object that this Generic class wraps""" - raise TypeError("Cannot get tvm.Type from a generic type") - - def require_type_generic_at(self, idx): # pylint: disable=unused-argument - """If True, the `idx`th type argument must be TypeGeneric""" - return True - - # This function is added here to avoid a pylint error - # for T.int/float below not being callable - def __call__(self): - raise NotImplementedError() - - -class ConcreteType(TypeGeneric): # pylint: disable=too-few-public-methods, abstract-method - """TVM script typing class for uniform Type objects - - Params - ------ - vtype: Union[str, tvm.ir.Type] - - The IR type represented by the type annotation. If a string - (e.g. "float32"), this represents a `ir.PrimType` generated - from that string. If a `ir.Type` is provided, this represents - the type provided. 
- """ - - def __init__(self, vtype): - if isinstance(vtype, tvm.ir.Type): - self.type = vtype - else: - self.type = tvm.ir.PrimType(vtype) - - def __call__(self, *args): # pylint: disable=arguments-differ - pass - - def evaluate(self): - return self.type - - -class VoidType(ConcreteType): # pylint: disable=too-few-public-methods, abstract-method - """TVM script typing class for void type""" - - def __init__(self): - super().__init__("") - - -class GenericPtrType(TypeGeneric): # pylint: disable=abstract-method - """TVM script typing class generator for PtrType - - [] operator is overloaded, accepts a ConcreteType and an optional storage scope string, - returns a ConcreteType wrapping PtrType - """ - - def __getitem__(self, args): - if isinstance(args, TypeGeneric): - args = [args] - if len(args) == 1: - vtype, scope = args[0], "global" - elif len(args) == 2: - vtype, scope = args[0], args[1] - else: - raise TypeError(f"Illegal type argument num for Ptr") - if not isinstance(vtype, TypeGeneric): - raise TypeError(f"Ptr expects a type argument, but received {type(vtype).__name__}") - if not isinstance(scope, str): - raise TypeError(f"Ptr expects storage scope argument be a string") - return ConcreteType(tvm.ir.PointerType(vtype.evaluate(), scope)) - - def require_type_generic_at(self, idx): - return idx != 1 # the second argument is storage scope for Ptr - - -class GenericTupleType(TypeGeneric): # pylint: disable=abstract-method - """TVM script typing class generator for TupleType - - [] operator is overloaded, accepts a list of ConcreteType and returns a ConcreteType - wrapping TupleType - """ - - def __getitem__(self, vtypes): - if isinstance(vtypes, TypeGeneric): - vtypes = [vtypes] - return ConcreteType(tvm.ir.TupleType([vtype.evaluate() for vtype in vtypes])) - - -class GenericBufferType(SpecialStmt): # pylint: disable=too-few-public-methods, abstract-method - """TVM script typing class for uniform Type objects""" - - def __init__(self, vtype): - def match_buffer_syntax_sugar( - shape, - dtype: str = "float32", - name: str = None, - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - if strides is None: - strides = [] - align = convert_to_int(align, "align", self.context.report_error, self.node.span) - offset_factor = convert_to_int( - offset_factor, "offset_factor", self.context.report_error, self.node.span - ) - buffer = tvm.tir.decl_buffer( - shape, - dtype, - name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) - return buffer - - self.type = vtype - super().__init__(match_buffer_syntax_sugar, def_symbol=True) - - def __call__( - self, - shape, - dtype="float32", - *, - name: str = None, - data=None, - strides=None, - elem_offset=None, - scope="global", - align=-1, - offset_factor=0, - buffer_type="default", - axis_separators=None, - span=None, - ): - """ - This function is for Buffer(...) syntax sugar. - """ - pass # pylint: disable=unnecessary-pass - - def __getitem__(self, args): - """ - This function is for Buffer[...] syntax sugar - Note that args is the list of all arguments - """ - if len(args) < 2: - raise ValueError("T.Buffer[...] 
needs at least two arguments: shape and dtype.") - - shape = args[0] - dtype = args[1] - - valid_shape = isinstance(shape, (tvm.ir.PrimExpr, Integral, tuple, list)) - valid_dtype = isinstance(dtype, str) - if not (valid_shape and valid_dtype): - raise ValueError( - "The first argument of T.Buffer[...] needs to be a tuple, " - "followed by the second argument dtype as a string" - ) - - -# add all floating point and integer datatypes to the module -for _dtype in ["float", "uint", "int"]: - for _size in ["8", "16", "32", "64"]: - for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]: - _name = _dtype + _size + _lanes - globals()[_name] = ConcreteType(_name) - - -# All other DataType annotations are represented with the same string -# as is used by `tvm.runtime.DataType`. This does redefine the Python -# built-in bool, but only within the context of `tvm.script.tir.ty` -# and `tvm.script.tir` modules. The `T.boolean` alias is maintained -# for backwards compatibility. - -bool = ConcreteType("bool") # pylint: disable=redefined-builtin -boolean = bool - - -handle = ConcreteType("handle") -void = VoidType() -Ptr = GenericPtrType() -Tuple = GenericTupleType() -# we don't have 'buffer' type on the cpp side -# thus 'handle' is used here for convenience's sake -Buffer = GenericBufferType("handle") diff --git a/python/tvm/script/parser_v1/utils.py b/python/tvm/script/parser_v1/utils.py deleted file mode 100644 index c655a6223740..000000000000 --- a/python/tvm/script/parser_v1/utils.py +++ /dev/null @@ -1,105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Helper functions in TVM Script Parser""" - -from typing import Callable, List, Any, Optional, Tuple - -import inspect -import synr - -from tvm.ir import Span, SourceName -from tvm.error import DiagnosticError - - -def get_param_list( - func: Callable, -) -> Tuple[List[str], List[Tuple[str, Tuple[Any, ...]]], Optional[str]]: - """Get the parameter list from definition of function""" - full_arg_spec: inspect.FullArgSpec = inspect.getfullargspec(func) - - args: List[str] - defaults: Optional[Tuple[Any, ...]] - kwonlyargs: List[str] - args, defaults, kwonlyargs = ( - full_arg_spec.args, - full_arg_spec.defaults, - full_arg_spec.kwonlyargs, - ) - - if defaults is None: - defaults = tuple() - - if full_arg_spec.varkw is not None: - raise RuntimeError( - "TVM Script register error : variable keyword argument is not supported now" - ) - - if len(kwonlyargs) == 1 and kwonlyargs[0] == "span": - pass - elif not len(kwonlyargs) == 0: - raise RuntimeError("TVM Script register error : keyword only argument is not supported now") - - pos_only: List[str] = list() - for arg in args[: len(args) - len(defaults)]: - if arg != "span": - pos_only.append(arg) - kwargs: List[Tuple[str, Tuple[Any, ...]]] = list() - for default, arg in zip(defaults, args[len(args) - len(defaults) :]): - if arg != "span": - kwargs.append((arg, default)) - - return pos_only, kwargs, full_arg_spec.varargs - - -def tvm_span_from_synr(span: synr.ast.Span) -> Span: - """Convert a synr span to a TVM span""" - return Span( - SourceName(span.filename), - span.start_line, - span.end_line, - span.start_column, - span.end_column, - ) - - -def synr_span_from_tvm(span: Span) -> synr.ast.Span: - """Convert a TVM span to a synr span""" - return synr.ast.Span( - span.source_name.name, - span.line, - span.column, - span.end_line, - span.end_column, - ) - - -def call_with_error_reporting( - report_error, - node_span, - func, - *args, - **kwargs, -): - """Call function with exception handling and report error using node_span""" - try: - return func(*args, **kwargs) - except DiagnosticError: - raise - except Exception as err: # pylint: disable=broad-except - # printing last non-empty row of error message. - error_msg = list(filter(None, str(err).split("\n")))[-1] - report_error(error_msg, node_span) diff --git a/src/tir/schedule/error.h b/src/tir/schedule/error.h index e28164c6c39b..d344f4687305 100644 --- a/src/tir/schedule/error.h +++ b/src/tir/schedule/error.h @@ -42,7 +42,7 @@ class ScheduleError : public tvm::runtime::Error { * "Some error occurred on block {0} and loop {1} blah blah" * And renderer will replace {0} and {1} according to the list provided LocationsOfInterest. Right * now it only printed out all the locations in plain text, but in the future, we may want to mark - * the IR with underscores and attach names to each location of interest, like what synr does. + * the IR with underscores and attach names to each location of interest. */ virtual String DetailRenderTemplate() const = 0; /*! diff --git a/tests/python/unittest/test_tvmscript_spans.py b/tests/python/unittest/test_tvmscript_spans.py deleted file mode 100644 index 2c0522e3e3c9..000000000000 --- a/tests/python/unittest/test_tvmscript_spans.py +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -from tvm.script.parser_v1 import tir as T - - -@T.prim_func -def loops() -> None: - for i in T.parallel(0, 2): - for j in T.serial(0, 1): - for z in T.vectorized(3, 4): - T.evaluate(0) - - -def test_loops(): - start_line = 23 - parsed = loops - - assert parsed.span.line == start_line - - assert parsed.body.span.line == start_line + 1 - assert parsed.body.min.span.column == 25 - assert parsed.body.extent.span.column == 28 - assert parsed.body.extent.span.line == start_line + 1 - - assert parsed.body.body.span.line == start_line + 2 - assert parsed.body.body.loop_var.span.line == start_line + 2 - assert parsed.body.body.loop_var.span.column == 13 - - assert parsed.body.body.body.span.line == start_line + 3 - assert parsed.body.body.body.span.column == 22 - - assert parsed.body.body.body.body.span.line == start_line + 4 - assert parsed.body.body.body.body.span.column == 17 - - -@T.prim_func -def statements() -> None: - T.evaluate(1) - T.evaluate("test") - - -def test_statements(): - start_line = 53 - parsed = statements - - assert parsed.body.span.line == start_line + 1 - - assert parsed.body[0].span.line == start_line + 1 - assert parsed.body[0].span.column == 5 - - assert parsed.body[0].span.line == start_line + 1 - assert parsed.body[0].span.column == 5 - - -if __name__ == "__main__": - test_loops() - test_statements() diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index b11ee538dc68..16389d29354c 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -274,7 +274,6 @@ def docs( requirements = [ "Sphinx==4.2.0", "tlcpack-sphinx-addon==0.2.1", - "synr==0.5.0", "image==1.5.33", # Temporary git link until a release is published "git+https://github.com/sphinx-gallery/sphinx-gallery.git@6142f1791151849b5bec4bf3959f75697ba226cd", From 7e2b8dd2aca49a60819d891359618c6c012c7131 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 20 Dec 2022 14:26:45 +0000 Subject: [PATCH 080/286] [CI] Fix android build by constraining numpy version (#13648) Temporarily constrain the version of numpy to workaround the deprecated value used in mxnet. See #13647. 
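
For illustration only (not part of the patch): the breakage this pin works
around is of the following form. NumPy 1.24 removed long-deprecated scalar
aliases such as `np.bool`, which older mxnet releases still reference at
import time, e.g.:

    import numpy as np   # with numpy>=1.24 installed
    np.bool              # AttributeError: module 'numpy' has no attribute 'bool'

Keeping numpy at <=1.23 leaves these aliases available (with only a
DeprecationWarning), so the android build keeps working until mxnet is
updated.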
Change-Id: Ib271c223447c76b855fe35cc8a1e77411a3fa441 --- python/gen_requirements.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/gen_requirements.py b/python/gen_requirements.py index b8c72a8f2744..461c1e25fb3b 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -251,7 +251,8 @@ ("h5py", "==2.10.0"), ("image", None), ("matplotlib", None), - ("numpy", None), + # Workaround, see https://github.com/apache/tvm/issues/13647 + ("numpy", "<=1.23.*"), ("onnx", None), ("onnxoptimizer", None), ("onnxruntime", None), From 5ae63ace2dcc14ed8988018a5bf52fd8c30db804 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Tue, 20 Dec 2022 17:01:48 +0000 Subject: [PATCH 081/286] [ETHOSN] Update driver stack version to 22.11 (#13637) -New process memory allocator is used to create buffers and networks. -Support for 22.08 stack has been kept intact in the sources and tests until new docker image is built and used. -Tests were modified to meet limitations imposed on input zero point and kernel size by NPU software. -Removed defining ETHON_API_VERSION from cmake infra. --- cmake/utils/FindEthosN.cmake | 16 +--- .../ubuntu_install_ethosn_driver_stack.sh | 2 +- python/tvm/relay/op/contrib/ethosn.py | 2 +- src/runtime/contrib/ethosn/ethosn_device.cc | 89 ++++++++++++++++++- src/runtime/contrib/ethosn/ethosn_device.h | 7 +- src/runtime/contrib/ethosn/ethosn_runtime.cc | 20 +++++ src/runtime/contrib/ethosn/ethosn_runtime.h | 11 +++ .../contrib/test_ethosn/infrastructure.py | 1 - .../python/contrib/test_ethosn/test_conv2d.py | 6 +- .../test_ethosn/test_conv2d_transpose.py | 68 +++++++++++++- .../contrib/test_ethosn/test_leaky_relu.py | 8 +- tests/python/contrib/test_ethosn/test_tanh.py | 3 +- 12 files changed, 204 insertions(+), 29 deletions(-) diff --git a/cmake/utils/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake index 591d49f82915..7d5f2802f6bd 100644 --- a/cmake/utils/FindEthosN.cmake +++ b/cmake/utils/FindEthosN.cmake @@ -58,18 +58,6 @@ macro(find_ethosn use_ethosn) PATHS ${__ethosn_stack}/lib) find_library(ETHOSN_COMPILER_LIBRARY NAMES EthosNSupport) - list(GET ETHOSN_INCLUDE_DIRS 0 filename) - set(filename "${filename}/ethosn_support_library/Support.hpp") - file(READ ${filename} ETHOSN_SUPPORT_H) - string(REGEX MATCH "VERSION_MAJOR ([0-9]*)" _ ${ETHOSN_SUPPORT_H}) - set(ver_major ${CMAKE_MATCH_1}) - string(REGEX MATCH "VERSION_MINOR ([0-9]*)" _ ${ETHOSN_SUPPORT_H}) - set(ver_minor ${CMAKE_MATCH_1}) - string(REGEX MATCH "VERSION_PATCH ([0-9]*)" _ ${ETHOSN_SUPPORT_H}) - set(ver_patch ${CMAKE_MATCH_1}) - set(ETHOSN_PACKAGE_VERSION "${ver_major}.${ver_minor}.${ver_patch}") - set(ETHOSN_DEFINITIONS -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION}) - # Runtime hardware support. Driver library also needed for # test support. 
find_path(_DL_DIR NAMES Network.hpp @@ -81,9 +69,7 @@ macro(find_ethosn use_ethosn) PATHS ${__ethosn_stack}/lib) find_library(ETHOSN_RUNTIME_LIBRARY NAMES EthosNDriver) if(${USE_ETHOSN_HW} MATCHES ${IS_TRUE_PATTERN}) - set(ETHOSN_DEFINITIONS -DETHOSN_HW -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION}) - else() - set(ETHOSN_DEFINITIONS -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION}) + set(ETHOSN_DEFINITIONS -DETHOSN_HW) endif() if(ETHOSN_COMPILER_LIBRARY) diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh index 4c26497c3895..7be815df32b9 100755 --- a/docker/install/ubuntu_install_ethosn_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh @@ -22,7 +22,7 @@ set -o pipefail repo_url="https://github.com/Arm-software/ethos-n-driver-stack" repo_dir="ethosn-driver" -repo_revision="22.08" +repo_revision="22.11" install_path="/opt/arm/$repo_dir" tmpdir=$(mktemp -d) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index e28eea9d224f..e316c0863c6c 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -117,7 +117,7 @@ def partition_for_ethosn(mod, params=None, **opts): ret : annotated and partitioned module. """ api_version = ethosn_api_version() - supported_api_versions = ["3.1.0"] + supported_api_versions = ["3.2.0", "3.1.0"] if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions): raise ValueError( f"Driver stack version {api_version} is unsupported. " diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 0d79f69815fa..7e16f7e887f3 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -42,6 +42,9 @@ #include "ethosn_driver_library/Inference.hpp" #include "ethosn_driver_library/Network.hpp" +#ifdef _ETHOSN_API_VERSION_3_2_0 +#include "ethosn_driver_library/ProcMemAllocator.hpp" +#endif namespace tvm { namespace runtime { @@ -87,6 +90,81 @@ InferenceWaitStatus WaitForInference(dl::Inference* inference, int timeout) { return InferenceWaitStatus(InferenceWaitErrorCode::kSuccess); } +#ifdef _ETHOSN_API_VERSION_3_2_0 +void CreateBuffers(dl::ProcMemAllocator* proc_mem_alloc, + std::vector>* fm, + const std::vector& tensors, const std::vector& tensor_sizes, + bool input) { + for (size_t i = 0; i < tensors.size(); i++) { + auto* data = static_cast(tensors[i]->data); + if (input) { + (*fm)[i] = std::make_shared( + proc_mem_alloc->CreateBuffer(data, tensor_sizes[i], dl::DataFormat::NHWC)); + } else { + (*fm)[i] = std::make_shared( + proc_mem_alloc->CreateBuffer(tensor_sizes[i], dl::DataFormat::NHWC)); + } + } +} + +bool Inference(tvm::runtime::TVMArgs args, dl::ProcMemAllocator* proc_mem_alloc, dl::Network* npu, + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, + const std::vector& output_sizes) { + // Unpack parameters + size_t n_inputs = input_order.size(); + size_t n_outputs = output_order.size(); + std::vector inputs(n_inputs); + for (size_t i = 0; i < n_inputs; i++) { + inputs[i] = args[input_order[i]]; + } + std::vector outputs(n_outputs); + size_t output_offset = n_inputs; + for (size_t i = 0; i < n_outputs; i++) { + outputs[i] = args[output_order[i] + output_offset]; + } + + // Set up input buffers + std::vector> ifm(n_inputs); + CreateBuffers(proc_mem_alloc, &ifm, inputs, input_sizes, true); + + // Set up output buffers + std::vector> 
ofm(n_outputs); + CreateBuffers(proc_mem_alloc, &ofm, outputs, output_sizes, false); + + // Raw pointers for the inference + dl::Buffer* ifm_raw[n_inputs]; + for (size_t i = 0; i < n_inputs; i++) { + ifm_raw[i] = ifm[i].get(); + } + dl::Buffer* ofm_raw[n_outputs]; + for (size_t i = 0; i < n_outputs; i++) { + ofm_raw[i] = ofm[i].get(); + } + + // Execute the inference. + std::unique_ptr inference( + npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs)); + InferenceWaitStatus result = WaitForInference(inference.get(), 60); + + if (result.GetErrorCode() != InferenceWaitErrorCode::kSuccess) { + LOG(FATAL) << "An error has occured waiting for the inference of a sub-graph on the NPU: " + << result.GetErrorDescription(); + } + + for (size_t i = 0; i < n_outputs; i++) { + DLTensor* tensor = outputs[i]; + dl::Buffer* source_buffer = ofm_raw[i]; + uint8_t* dest_buffer = static_cast(tensor->data); + size_t size = source_buffer->GetSize(); + uint8_t* source_buffer_data = source_buffer->Map(); + std::copy(source_buffer_data, source_buffer_data + size, dest_buffer); + source_buffer->Unmap(); + } + + return true; +} +#else void CreateBuffers(std::vector>* fm, const std::vector& tensors, const std::vector& tensor_sizes, bool input) { @@ -157,7 +235,7 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, return true; } - +#endif } // namespace ethosn } // namespace runtime } // namespace tvm @@ -192,9 +270,12 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result") }); // Allow the ethos-n support code to be tested without a device -bool Inference(tvm::runtime::TVMArgs args, dl::Network* /* npu */, - const std::vector& input_order, const std::vector& output_order, - const std::vector& input_sizes, +bool Inference(tvm::runtime::TVMArgs args, +#ifdef _ETHOSN_API_VERSION_3_2_0 + dl::ProcMemAllocator* proc_mem_alloc, +#endif + dl::Network* /* npu */, const std::vector& input_order, + const std::vector& output_order, const std::vector& input_sizes, const std::vector& output_sizes) { std::vector outputs; for (int argc = input_order.size(); argc < args.size(); argc++) { diff --git a/src/runtime/contrib/ethosn/ethosn_device.h b/src/runtime/contrib/ethosn/ethosn_device.h index acef104515e1..a5f3d18cf9fd 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.h +++ b/src/runtime/contrib/ethosn/ethosn_device.h @@ -38,10 +38,15 @@ namespace dl = ::ethosn::driver_library; using tvm::runtime::TVMArgs; +#ifdef _ETHOSN_API_VERSION_3_2_0 +bool Inference(tvm::runtime::TVMArgs args, dl::ProcMemAllocator* proc_mem_alloc, dl::Network* npu, + const std::vector& input_order, const std::vector& output_order, + const std::vector& input_sizes, const std::vector& output_sizes); +#else bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, const std::vector& input_order, const std::vector& output_order, const std::vector& input_sizes, const std::vector& output_sizes); - +#endif } // namespace ethosn } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc index dc2d4da853f8..11edc8c71a2c 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.cc +++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc @@ -53,6 +53,11 @@ EthosnModule::EthosnModule(std::vector* cmms) { if (it.compiled_cmm != nullptr) { network_map_[it.name].compiled_cmm = std::move(it.compiled_cmm); } +#ifdef _ETHOSN_API_VERSION_3_2_0 + if (it.proc_mem_alloc != nullptr) { + network_map_[it.name].proc_mem_alloc = std::move(it.proc_mem_alloc); + } +#endif 
if (it.runtime_cmm != nullptr) { network_map_[it.name].runtime_cmm = std::move(it.runtime_cmm); } @@ -67,9 +72,16 @@ PackedFunc EthosnModule::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (network_map_.find(name) != network_map_.end()) { return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) { +#ifdef _ETHOSN_API_VERSION_3_2_0 + *rv = Inference(args, network_map_[name].proc_mem_alloc.get(), + network_map_[name].runtime_cmm.get(), network_map_[name].inputs, + network_map_[name].outputs, network_map_[name].input_sizes, + network_map_[name].output_sizes); +#else *rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs, network_map_[name].outputs, network_map_[name].input_sizes, network_map_[name].output_sizes); +#endif }); } else { return PackedFunc(); @@ -102,6 +114,9 @@ Module EthosnModule::LoadFromBinary(void* strm) { cmms.resize(func_count); for (unsigned int i = 0; i < func_count; i++) { OrderedCompiledNetwork& compiled = cmms[i]; +#ifdef _ETHOSN_API_VERSION_3_2_0 + compiled.proc_mem_alloc = std::make_unique(); +#endif std::string ext_symbol; std::string cmm; uint64_t input_size; @@ -114,7 +129,12 @@ Module EthosnModule::LoadFromBinary(void* strm) { #if defined ETHOSN_HW // If hardware unavaiable use the mock inference functionality. If hardware is // avaiable, deserialize the compiled graph. +#ifdef _ETHOSN_API_VERSION_3_2_0 + compiled.runtime_cmm = std::make_unique( + compiled.proc_mem_alloc->CreateNetwork(cmm.c_str(), cmm.size())); +#else compiled.runtime_cmm = std::make_unique(cmm.c_str(), cmm.size()); +#endif #endif // Read the number of inputs stream->Read(&input_size); diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h index b8942fef12d9..2f8e445d97a8 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.h +++ b/src/runtime/contrib/ethosn/ethosn_runtime.h @@ -36,6 +36,14 @@ #include "ethosn_driver_library/Network.hpp" #include "ethosn_support_library/Support.hpp" +#if ETHOSN_SUPPORT_LIBRARY_VERSION_MAJOR == 3 && ETHOSN_SUPPORT_LIBRARY_VERSION_MINOR == 2 && \ + ETHOSN_SUPPORT_LIBRARY_VERSION_PATCH == 0 +#define _ETHOSN_API_VERSION_3_2_0 +#endif +#ifdef _ETHOSN_API_VERSION_3_2_0 +#include "ethosn_driver_library/ProcMemAllocator.hpp" +#endif + namespace tvm { namespace runtime { namespace ethosn { @@ -46,6 +54,9 @@ namespace dl = ::ethosn::driver_library; struct OrderedCompiledNetwork { std::unique_ptr compiled_cmm; std::unique_ptr runtime_cmm; +#ifdef _ETHOSN_API_VERSION_3_2_0 + std::unique_ptr proc_mem_alloc; +#endif std::string name; std::vector inputs; std::vector outputs; diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 8a469403872f..334cd6d3b87c 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -168,7 +168,6 @@ def build( if not additional_config_args: additional_config_args = {} npu_config = {**get_ethosn_device_options(), **additional_config_args} - print(npu_config) with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethos-n.options": npu_config}): with tvm.target.Target("llvm"): if npu: diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index a6ce73656bfc..851bd031b38e 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -22,6 +22,7 @@ import tvm from tvm import relay 
+from tvm.relay.op.contrib import ethosn_api_version from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -227,7 +228,10 @@ def test_conv2d_depthwise( ) ), } - input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + if ethosn_api_version() == "3.2.0": + input_zp = np.random.randint(0, np.iinfo(dtype).max) + else: + input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) input_sc = np.random.random() * 2 if qnn_per_channel: kernel_sc = tvm.nd.array( diff --git a/tests/python/contrib/test_ethosn/test_conv2d_transpose.py b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py index 84aa7e969b30..4d99a310ac44 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d_transpose.py +++ b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py @@ -22,6 +22,7 @@ import tvm from tvm import relay +from tvm.relay.op.contrib import ethosn_api_version from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -115,7 +116,7 @@ def _get_model( [ ((1, 2, 2, 1), (2, 2), (1, 1), 1, False), ((1, 2, 2, 5), (2, 2), (3, 5), 4, False), - ((1, 7, 7, 4), (2, 2), (7, 9), 8, True), + ((1, 7, 7, 4), (2, 2), (7, 7), 8, True), ], ) def test_conv2d_transpose(ifm_shape, strides, kernel_size, out_channels, dtype, bias): @@ -169,6 +170,71 @@ def test_conv2d_transpose(ifm_shape, strides, kernel_size, out_channels, dtype, tei.verify(outputs, dtype, 1) +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "ifm_shape,strides,kernel_size,out_channels,bias", + [ + ((1, 10, 20, 3), (1, 1), (8, 5), 4, False), + ((1, 10, 10, 2), (2, 2), (7, 9), 8, True), + ], +) +def test_conv2d_transpose_kernel_size_gt_8( + ifm_shape, strides, kernel_size, out_channels, dtype, bias +): + """Check transpose convolution for big kernel sizes.""" + if ethosn_api_version() in ["3.2.0", "3.1.0"]: + pytest.skip("Skipping because NPU driver 22.11 fails to interpret zp used in the test.") + + np.random.seed(0) + + kernel_layout = "IOHW" + dilation = (1, 1) + groups = 1 + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + kernel_zp = np.random.randint(data_min, data_max) + kernel_sc = np.random.random() * 4 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, kernel_zp, kernel_sc, ifm_shape[1], ifm_shape[2], ifm_shape[3] + ) + + model, params = _get_model( + shape=ifm_shape, + kernel_h=kernel_size[0], + kernel_w=kernel_size[1], + input_zp=input_zp, + input_sc=input_sc, + kernel_zp=kernel_zp, + kernel_sc=kernel_sc, + output_zp=output_zp, + output_sc=output_sc, + stride=strides, + dilation=dilation, + groups=groups, + kernel_layout=kernel_layout, + dtype=dtype, + out_channels=out_channels, + bias=bias, + ) + + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=ifm_shape, dtype=dtype)) + } + + for npu in [False, True]: + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + + tei.verify(outputs, dtype, 1) + + @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) @pytest.mark.parametrize( diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py index ccf67151bf1e..ee5f2048dbbb 100644 --- a/tests/python/contrib/test_ethosn/test_leaky_relu.py +++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py @@ -22,6 +22,7 @@ import tvm 
from tvm import relay +from tvm.relay.op.contrib import ethosn_api_version from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -55,9 +56,12 @@ def test_leaky_relu(dtype, shape, alpha): iinfo = np.iinfo(dtype) zp_min = iinfo.min zp_max = iinfo.max - input_zp = zp_min + 120 + if ethosn_api_version() == "3.2.0": + input_zp = zp_min + 128 + else: + input_zp = zp_min + 120 input_sc = 0.0068132 - output_zp = zp_min + 128 + output_zp = zp_min + 126 # values offset more than 126 can cause saturation output_sc = 0.0078125 inputs = {"x": tvm.nd.array(np.random.randint(zp_min, high=zp_max, size=shape, dtype=dtype))} diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py index 25f46e51eda9..77ed33980ea5 100644 --- a/tests/python/contrib/test_ethosn/test_tanh.py +++ b/tests/python/contrib/test_ethosn/test_tanh.py @@ -47,7 +47,6 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype): @pytest.mark.parametrize("shape", [(1, 52, 52, 3)]) def test_tanh(dtype, shape): """Compare Tanh output with TVM.""" - zp_min = np.iinfo(dtype).min zp_max = np.iinfo(dtype).max @@ -57,7 +56,7 @@ def test_tanh(dtype, shape): } outputs = [] for npu in [False, True]: - model = _get_model(shape, zp_min + 120, 0.0250629, zp_min + 128, 0.0078125, dtype) + model = _get_model(shape, zp_min + 128, 1 / 256, zp_min + 128, 1 / 128, dtype) mod = tei.make_module(model, []) outputs.append( tei.build_and_run( From 9b0de0a122a05a402a02ea8eb570ee9bce08e818 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Wed, 21 Dec 2022 10:29:02 +0000 Subject: [PATCH 082/286] [CMSIS-NN] Add Cortex-M85 support (#13644) Added Cortex-M85 to the list of mprofiles maintained for correct flag mappings: MVE and DSP. --- src/target/parsers/mprofile.cc | 6 +++--- .../contrib/cmsisnn/compiler_attrs_test.cc | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/target/parsers/mprofile.cc b/src/target/parsers/mprofile.cc index 9bcf2c907832..29c782ba6e08 100644 --- a/src/target/parsers/mprofile.cc +++ b/src/target/parsers/mprofile.cc @@ -36,9 +36,9 @@ const TargetFeatures kHasDSP = {{"has_dsp", Bool(true)}, {"has_mve", Bool(false) const TargetFeatures kHasMVE = {{"has_dsp", Bool(true)}, {"has_mve", Bool(true)}}; static const char* baseCPUs[] = {"cortex-m0", "cortex-m3"}; -static const char* dspCPUs[] = {"cortex-m55", "cortex-m4", "cortex-m7", "cortex-m33", - "cortex-m35p"}; -static const char* mveCPUs[] = {"cortex-m55"}; +static const char* dspCPUs[] = {"cortex-m55", "cortex-m4", "cortex-m7", + "cortex-m33", "cortex-m35p", "cortex-m85"}; +static const char* mveCPUs[] = {"cortex-m55", "cortex-m85"}; template static inline bool MatchesCpu(Optional mcpu, const Container& cpus) { diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/compiler_attrs_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/compiler_attrs_test.cc index 7db6487f6ea9..24dd3a6f5e2d 100644 --- a/tests/cpp/relay/backend/contrib/cmsisnn/compiler_attrs_test.cc +++ b/tests/cpp/relay/backend/contrib/cmsisnn/compiler_attrs_test.cc @@ -53,18 +53,30 @@ TEST(CMSISNNTarget, CreateFromUndefined) { ASSERT_EQ(target->GetFeature("has_dsp").value_or(Bool(false)), Bool(false)); } -TEST(CMSISNNTarget, CreateFromContext) { +TEST(CMSISNNTarget, CreateFromContextCortexM55) { Target target = GetTargetWithCompilerAttrs("cortex-m55", ""); ASSERT_EQ(target->GetFeature("has_mve").value_or(Bool(false)), Bool(true)); ASSERT_EQ(target->GetFeature("has_dsp").value_or(Bool(false)), Bool(true)); } 
-TEST(CMSISNNTarget, CreateFromContextWithAttrs) { +TEST(CMSISNNTarget, CreateFromContextWithAttrsCortexM55) { Target target = GetTargetWithCompilerAttrs("cortex-m55", "+nomve"); ASSERT_EQ(target->GetFeature("has_mve").value_or(Bool(false)), Bool(false)); ASSERT_EQ(target->GetFeature("has_dsp").value_or(Bool(false)), Bool(true)); } +TEST(CMSISNNTarget, CreateFromContextCortexM85) { + Target target = GetTargetWithCompilerAttrs("cortex-m85", ""); + ASSERT_EQ(target->GetFeature("has_mve").value_or(Bool(false)), Bool(true)); + ASSERT_EQ(target->GetFeature("has_dsp").value_or(Bool(false)), Bool(true)); +} + +TEST(CMSISNNTarget, CreateFromContextWithAttrsCortexM85) { + Target target = GetTargetWithCompilerAttrs("cortex-m85", "+nomve"); + ASSERT_EQ(target->GetFeature("has_mve").value_or(Bool(false)), Bool(false)); + ASSERT_EQ(target->GetFeature("has_dsp").value_or(Bool(false)), Bool(true)); +} + } // namespace cmsisnn } // namespace contrib } // namespace relay From 323ec0957fd5b6ff15d9ea4764cd189f7a63dde1 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Wed, 21 Dec 2022 17:39:28 +0000 Subject: [PATCH 083/286] [ETHOSN] Fix for the mock inference after NPU driver update (#13650) NPU driver 22.11 supports buffer and network creation with only process memory allocator. This change however should be limited to the cases when the HW is available. NPU driver update: #13637 --- src/runtime/contrib/ethosn/ethosn_device.cc | 2 +- src/runtime/contrib/ethosn/ethosn_runtime.cc | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 7e16f7e887f3..d4ebec4de311 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -272,7 +272,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result") // Allow the ethos-n support code to be tested without a device bool Inference(tvm::runtime::TVMArgs args, #ifdef _ETHOSN_API_VERSION_3_2_0 - dl::ProcMemAllocator* proc_mem_alloc, + dl::ProcMemAllocator* /*proc_mem_alloc*/, #endif dl::Network* /* npu */, const std::vector& input_order, const std::vector& output_order, const std::vector& input_sizes, diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc index 11edc8c71a2c..0b68db1a1798 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.cc +++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc @@ -114,9 +114,6 @@ Module EthosnModule::LoadFromBinary(void* strm) { cmms.resize(func_count); for (unsigned int i = 0; i < func_count; i++) { OrderedCompiledNetwork& compiled = cmms[i]; -#ifdef _ETHOSN_API_VERSION_3_2_0 - compiled.proc_mem_alloc = std::make_unique(); -#endif std::string ext_symbol; std::string cmm; uint64_t input_size; @@ -130,6 +127,7 @@ Module EthosnModule::LoadFromBinary(void* strm) { // If hardware unavaiable use the mock inference functionality. If hardware is // avaiable, deserialize the compiled graph. #ifdef _ETHOSN_API_VERSION_3_2_0 + compiled.proc_mem_alloc = std::make_unique(); compiled.runtime_cmm = std::make_unique( compiled.proc_mem_alloc->CreateNetwork(cmm.c_str(), cmm.size())); #else From c9001a0dcf4b6c646f11698861651493f045e8b3 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 22 Dec 2022 14:59:45 -0500 Subject: [PATCH 084/286] [RPC] Add fail-guard for termination time exception (#13651) This PR adds fail-guard to reduce error messages thrown during process termination time. 
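
The guard is the same in every wrapper touched below: `terminate()` can
raise `ImportError` when it runs from `__del__` during interpreter
shutdown, because module globals may already have been torn down. A
condensed sketch of the pattern applied to the proxy, server, and iOS
launcher wrappers:

    def __del__(self):
        # At interpreter exit, modules this call depends on may already be
        # unloaded; swallow the resulting ImportError instead of emitting
        # a traceback during termination.
        try:
            self.terminate()
        except ImportError:
            pass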
Such errors won't trigger test failures, but they do produce extra
messages at exit time.
---
 python/tvm/rpc/proxy.py               | 5 ++++-
 python/tvm/rpc/server.py              | 5 ++++-
 python/tvm/rpc/server_ios_launcher.py | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/python/tvm/rpc/proxy.py b/python/tvm/rpc/proxy.py
index 4c3144e7b5cd..d7027c88a4b5 100644
--- a/python/tvm/rpc/proxy.py
+++ b/python/tvm/rpc/proxy.py
@@ -643,7 +643,10 @@ def terminate(self):
         self.proc = None
 
     def __del__(self):
-        self.terminate()
+        try:
+            self.terminate()
+        except ImportError:
+            pass
 
 
 def websocket_proxy_server(url, key=""):
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 7932e98aa20c..a144356f2e6d 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -499,4 +499,7 @@ def terminate(self):
         self.proc = None
 
     def __del__(self):
-        self.terminate()
+        try:
+            self.terminate()
+        except ImportError:
+            pass
diff --git a/python/tvm/rpc/server_ios_launcher.py b/python/tvm/rpc/server_ios_launcher.py
index 2e31586f6456..06ead1c3d165 100644
--- a/python/tvm/rpc/server_ios_launcher.py
+++ b/python/tvm/rpc/server_ios_launcher.py
@@ -335,7 +335,10 @@ def terminate(self):
             print(e)
 
     def __del__(self):
-        self.terminate()
+        try:
+            self.terminate()
+        except ImportError:
+            pass
 
     @staticmethod
     def is_compatible_environment():

From 3ec03f7828a6e7c7507e0aa5dfa88468ce3a1182 Mon Sep 17 00:00:00 2001
From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com>
Date: Sun, 25 Dec 2022 16:15:35 +0000
Subject: [PATCH 085/286] [CMSIS-NN] Global function that provides range based on dtype (#13652)

The (min, max) range for a dtype was previously computed in slightly
different ways across the CMSIS-NN tests. This commit creates a common
global function inside aot.py under tvm.testing that provides the
(min, max) values based on the dtype. In the future, other AOT-based
targets can make use of this function to obtain the range.
---
 python/tvm/testing/aot.py                     | 31 ++++++++++++++++-
 .../contrib/test_cmsisnn/test_binary_ops.py   | 14 ++++----
 .../contrib/test_cmsisnn/test_conv2d.py       | 21 ++++++------
 .../test_cmsisnn/test_fully_connected.py      | 10 +++---
 .../contrib/test_cmsisnn/test_fuse_pads.py    | 22 +++++++-----
 .../test_cmsisnn/test_generate_constants.py   |  7 ++--
 .../test_cmsisnn/test_invalid_graphs.py       |  5 ++-
 .../contrib/test_cmsisnn/test_networks.py     |  6 ++--
 .../contrib/test_cmsisnn/test_pooling.py      |  4 +--
 .../test_cmsisnn/test_remove_reshapes.py      |  4 +--
 .../contrib/test_cmsisnn/test_softmax.py      |  5 ++-
 tests/python/contrib/test_cmsisnn/utils.py    | 34 +++----------------
 .../aot/test_crt_forward_declarations.py      | 24 -------------
 13 files changed, 85 insertions(+), 102 deletions(-)

diff --git a/python/tvm/testing/aot.py b/python/tvm/testing/aot.py
index 563a7dff4a50..30d3c78ae43b 100644
--- a/python/tvm/testing/aot.py
+++ b/python/tvm/testing/aot.py
@@ -24,7 +24,7 @@
 import subprocess
 import tarfile
 import logging
-from typing import Any, NamedTuple, Union, Optional, List, Dict
+from typing import Any, NamedTuple, Union, Tuple, Optional, List, Dict
 import numpy as np
 
 import tvm
@@ -901,6 +901,35 @@ def compile_and_run(
     )
 
 
+def get_dtype_range(dtype: str) -> Tuple[int, int]:
+    """
+    Produces the min,max for a given data type.
+ + Parameters + ---------- + dtype : str + a type string (e.g., int8, float64) + + Returns + ------- + type_info.min : int + the minimum of the range + type_info.max : int + the maximum of the range + """ + type_info = None + np_dtype = np.dtype(dtype) + kind = np_dtype.kind + + if kind == "f": + type_info = np.finfo(np_dtype) + elif kind in ["i", "u"]: + type_info = np.iinfo(np_dtype) + else: + raise TypeError(f"dtype ({dtype}) must indicate some floating-point or integral data type.") + return type_info.min, type_info.max + + def generate_ref_data(mod, input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index 29335072bf06..663a1bd45d5c 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -25,7 +25,7 @@ import tvm from tvm import relay from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.testing.aot import get_dtype_range, generate_ref_data, AOTTestModel, compile_and_run from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, ) @@ -34,7 +34,6 @@ skip_if_no_reference_system, make_module, make_qnn_relu, - get_range_for_dtype_str, assert_partitioned_function, assert_no_external_function, create_test_runner, @@ -45,9 +44,8 @@ def generate_tensor_constant(): rng = np.random.default_rng(12321) dtype = "int8" shape = (1, 16, 16, 3) - values = tvm.nd.array( - rng.integers(np.iinfo(dtype).min, high=np.iinfo(dtype).max, size=shape, dtype=dtype) - ) + in_min, in_max = get_dtype_range(dtype) + values = tvm.nd.array(rng.integers(in_min, high=in_max, size=shape, dtype=dtype)) return relay.const(values, dtype) @@ -136,7 +134,7 @@ def test_op_int8( assert_partitioned_function(orig_mod, cmsisnn_mod) # validate the output - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) inputs = { "input_0": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype), "input_1": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype), @@ -196,7 +194,7 @@ def test_same_input_to_binary_op(op, relu_type): ), "Composite function for the binary op should have only 1 parameter." 
# validate the output - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) inputs = { "input": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype), } @@ -275,7 +273,7 @@ def test_constant_input_int8(op, input_0, input_1): assert_partitioned_function(orig_mod, cmsisnn_mod) # validate the output - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) inputs = {} if isinstance(input_0, tvm.relay.expr.Var): inputs.update({"input_0": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype)}) diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 66ff5d793880..20e7b9ed2f62 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -24,6 +24,7 @@ from tvm.relay.op.contrib import cmsisnn from tvm.testing.aot import ( + get_dtype_range, generate_ref_data, AOTTestModel, compile_models, @@ -33,7 +34,6 @@ from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from .utils import ( make_module, - get_range_for_dtype_str, get_same_padding, get_conv2d_qnn_params, get_kernel_bias_dtype, @@ -82,10 +82,11 @@ def make_model( p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) rng = np.random.default_rng(12321) + kmin, kmax = get_dtype_range(kernel_dtype) kernel = tvm.nd.array( rng.integers( - np.iinfo(kernel_dtype).min, - high=np.iinfo(kernel_dtype).max, + kmin, + high=kmax, size=kernel_shape, dtype=kernel_dtype, ) @@ -157,7 +158,7 @@ def test_conv2d_number_primfunc_args( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) relu_type = "RELU" kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) @@ -264,7 +265,7 @@ def test_conv2d_symmetric_padding( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) @@ -358,7 +359,7 @@ def test_conv2d_asymmetric_padding( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) @@ -454,7 +455,7 @@ def test_pad_conv2d_fusion_int8( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) output_scale, output_zero_point = get_conv2d_qnn_params( @@ -567,7 +568,7 @@ def test_invalid_pad_conv2d_fusion_int8( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) kernel_dtype, bias_dtype = get_kernel_bias_dtype(dtype) @@ -740,7 +741,7 @@ def test_depthwise( kernel_w = kernel_size[1] kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) kernel_zero_point = 0 - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) 
groups = ifm_shape[3] kernel_layout = "HWOI" @@ -844,7 +845,7 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( test_runner = AOT_USMP_CORSTONE300_RUNNER dtype = "int8" - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) ifm_shape = (1, 24, 24, 1) groups = ifm_shape[3] diff --git a/tests/python/contrib/test_cmsisnn/test_fully_connected.py b/tests/python/contrib/test_cmsisnn/test_fully_connected.py index 3b220eb42c9b..46b1488eb3fe 100644 --- a/tests/python/contrib/test_cmsisnn/test_fully_connected.py +++ b/tests/python/contrib/test_cmsisnn/test_fully_connected.py @@ -23,10 +23,9 @@ from tvm import relay from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import generate_ref_data, AOTTestModel, compile_and_run +from tvm.testing.aot import get_dtype_range, generate_ref_data, AOTTestModel, compile_and_run from .utils import ( make_module, - get_range_for_dtype_str, get_conv2d_qnn_params, make_qnn_relu, assert_partitioned_function, @@ -55,10 +54,11 @@ def make_model( """Return a model and any parameters it may have""" input_ = relay.var("input", shape=in_shape, dtype=dtype) rng = np.random.default_rng(12321) + kmin, kmax = get_dtype_range(kernel_dtype) weight = tvm.nd.array( rng.integers( - np.iinfo(kernel_dtype).min, - high=np.iinfo(kernel_dtype).max, + kmin, + high=kmax, size=kernel_shape, dtype=kernel_dtype, ) @@ -123,7 +123,7 @@ def test_ops( kernel_zero_point = 0 kernel_shape = [out_channels, in_shape[1]] conv2d_kernel_shape = (1, 1, kernel_shape[0], kernel_shape[1]) - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) output_scale, output_zero_point = get_conv2d_qnn_params( conv2d_kernel_shape, diff --git a/tests/python/contrib/test_cmsisnn/test_fuse_pads.py b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py index f57dc5cd5bab..4ea306cc4382 100644 --- a/tests/python/contrib/test_cmsisnn/test_fuse_pads.py +++ b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py @@ -19,7 +19,7 @@ import numpy as np import pytest import tvm -import tvm.testing +from tvm.testing.aot import get_dtype_range from tvm import relay from .utils import CheckForPadsWithinCompositeFunc @@ -59,10 +59,11 @@ def test_invalid_padding_for_fusion(ifm_shape, pad_width, conv2d_padding, ofm_sh pad_mode="constant", ) rng = np.random.default_rng(12321) + in_min, in_max = get_dtype_range(dtype) local_weight = tvm.nd.array( rng.integers( - np.iinfo(dtype).min, - high=np.iinfo(dtype).max, + in_min, + high=in_max, size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), dtype=dtype, ) @@ -139,10 +140,11 @@ def test_pad_conv2d_fusion_noncmsisnn_target(ifm_shape, pad_width, conv2d_paddin pad_mode="constant", ) rng = np.random.default_rng(12321) + in_min, in_max = get_dtype_range(dtype) local_weight = tvm.nd.array( rng.integers( - np.iinfo(dtype).min, - high=np.iinfo(dtype).max, + in_min, + high=in_max, size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), dtype=dtype, ) @@ -217,10 +219,11 @@ def test_pad_conv2d_fusion(ifm_shape, pad_width, conv2d_padding, ofm_shape): pad_mode="constant", ) rng = np.random.default_rng(12321) + kmin, kmax = get_dtype_range(dtype) local_weight = tvm.nd.array( rng.integers( - np.iinfo(dtype).min, - high=np.iinfo(dtype).max, + kmin, + high=kmax, size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), dtype=dtype, ) @@ -281,10 +284,11 @@ def test_without_preceding_pad(): ofm_shape = (1, 56, 56, 64) local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype) rng = 
np.random.default_rng(12321) + kmin, kmax = get_dtype_range(dtype) local_weight = tvm.nd.array( rng.integers( - np.iinfo(dtype).min, - high=np.iinfo(dtype).max, + kmin, + high=kmax, size=(64, 3, 3, 64), dtype=dtype, ) diff --git a/tests/python/contrib/test_cmsisnn/test_generate_constants.py b/tests/python/contrib/test_cmsisnn/test_generate_constants.py index 86737370bc5d..b83884128441 100644 --- a/tests/python/contrib/test_cmsisnn/test_generate_constants.py +++ b/tests/python/contrib/test_cmsisnn/test_generate_constants.py @@ -20,7 +20,7 @@ import numpy as np import pytest import tvm -import tvm.testing +from tvm.testing.aot import get_dtype_range from tvm import relay from tvm.relay.op.contrib import cmsisnn @@ -107,10 +107,11 @@ def make_model( weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels) rng = np.random.default_rng(12321) + kmin, kmax = get_dtype_range(kernel_dtype) weight = tvm.nd.array( rng.integers( - np.iinfo(kernel_dtype).min, - high=np.iinfo(kernel_dtype).max, + kmin, + high=kmax, size=weight_shape, dtype=kernel_dtype, ) diff --git a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py index c66f9d0e0726..ace1db7811da 100644 --- a/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py +++ b/tests/python/contrib/test_cmsisnn/test_invalid_graphs.py @@ -19,13 +19,12 @@ import numpy as np import tvm -from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.testing.aot import AOTTestModel, get_dtype_range, compile_and_run, generate_ref_data from tvm.micro.testing.aot_test_utils import ( AOT_USMP_CORSTONE300_RUNNER, ) from .utils import ( skip_if_no_reference_system, - get_range_for_dtype_str, ) @@ -58,7 +57,7 @@ def @main(%data : Tensor[(16, 29), int8]) -> Tensor[(16, 29), int8] { use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER dtype = "int8" - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) rng = np.random.default_rng(12345) inputs = {"data": rng.integers(in_min, high=in_max, size=(16, 29), dtype=dtype)} outputs = generate_ref_data(orig_mod["main"], inputs, params) diff --git a/tests/python/contrib/test_cmsisnn/test_networks.py b/tests/python/contrib/test_cmsisnn/test_networks.py index 6f9f3743a622..9f64be246182 100644 --- a/tests/python/contrib/test_cmsisnn/test_networks.py +++ b/tests/python/contrib/test_cmsisnn/test_networks.py @@ -24,12 +24,12 @@ from tvm import relay from tvm.contrib.download import download_testdata from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.testing.aot import AOTTestModel, get_dtype_range, compile_and_run, generate_ref_data from tvm.micro.testing.aot_test_utils import ( AOT_CORSTONE300_RUNNER, AOT_USMP_CORSTONE300_RUNNER, ) -from .utils import skip_if_no_reference_system, get_range_for_dtype_str +from .utils import skip_if_no_reference_system # pylint: disable=import-outside-toplevel def _convert_to_relay( @@ -93,7 +93,7 @@ def test_cnn_small(test_runner): input_shape = (1, 490) dtype = "int8" - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) rng = np.random.default_rng(12345) input_data = rng.integers(in_min, high=in_max, size=input_shape, dtype=dtype) diff --git a/tests/python/contrib/test_cmsisnn/test_pooling.py b/tests/python/contrib/test_cmsisnn/test_pooling.py index 7657e0e63220..c6e5f02e712a 100644 --- 
a/tests/python/contrib/test_cmsisnn/test_pooling.py +++ b/tests/python/contrib/test_cmsisnn/test_pooling.py @@ -23,6 +23,7 @@ from tvm.relay.op.contrib import cmsisnn from tvm.testing.aot import ( + get_dtype_range, generate_ref_data, AOTTestModel, compile_and_run, @@ -30,7 +31,6 @@ from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from .utils import ( make_module, - get_range_for_dtype_str, get_same_padding, make_qnn_relu, assert_partitioned_function, @@ -128,7 +128,7 @@ def test_ops( assert_partitioned_function(orig_mod, cmsisnn_mod) # validate the output - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) np.random.seed(0) inputs = { "input": np.random.randint(in_min, high=in_max, size=in_shape, dtype=dtype), diff --git a/tests/python/contrib/test_cmsisnn/test_remove_reshapes.py b/tests/python/contrib/test_cmsisnn/test_remove_reshapes.py index 8b33a8a90b76..3cd60341ebfe 100644 --- a/tests/python/contrib/test_cmsisnn/test_remove_reshapes.py +++ b/tests/python/contrib/test_cmsisnn/test_remove_reshapes.py @@ -23,6 +23,7 @@ from tvm.relay.op.contrib import cmsisnn from tvm.testing.aot import ( + get_dtype_range, generate_ref_data, AOTTestModel, compile_models, @@ -31,7 +32,6 @@ from tvm.micro.testing.aot_test_utils import AOT_USMP_CORSTONE300_RUNNER from .utils import ( make_module, - get_range_for_dtype_str, get_same_padding, make_qnn_relu, assert_partitioned_function, @@ -126,7 +126,7 @@ def test_reshape_removal(padding): # generate reference output rng = np.random.default_rng(12345) - in_min, in_max = get_range_for_dtype_str("int8") + in_min, in_max = get_dtype_range("int8") inputs = {"input": rng.integers(in_min, high=in_max, size=in_shape, dtype="int8")} output_list = generate_ref_data(orig_mod["main"], inputs, params=None) diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py index d048723529e0..0316d567adf4 100644 --- a/tests/python/contrib/test_cmsisnn/test_softmax.py +++ b/tests/python/contrib/test_cmsisnn/test_softmax.py @@ -24,12 +24,11 @@ import tvm.testing from tvm import relay from tvm.relay.op.contrib import cmsisnn -from tvm.testing.aot import AOTTestModel, compile_and_run, generate_ref_data +from tvm.testing.aot import get_dtype_range, AOTTestModel, compile_and_run, generate_ref_data from .utils import ( skip_if_no_reference_system, make_module, - get_range_for_dtype_str, assert_partitioned_function, assert_no_external_function, create_test_runner, @@ -78,7 +77,7 @@ def test_op_int8(zero_point, scale, compiler_cpu, cpu_flags): assert_partitioned_function(orig_mod, cmsisnn_mod) # validate the output - in_min, in_max = get_range_for_dtype_str(dtype) + in_min, in_max = get_dtype_range(dtype) np.random.seed(0) input_data = np.random.randint(in_min, high=in_max, size=shape, dtype=dtype) inputs = {"in0": input_data} diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py index f3a6b0c1343b..1ec3e609f1a3 100644 --- a/tests/python/contrib/test_cmsisnn/utils.py +++ b/tests/python/contrib/test_cmsisnn/utils.py @@ -23,7 +23,7 @@ import tvm from tvm import relay -from tvm.testing.aot import AOTTestRunner +from tvm.testing.aot import AOTTestRunner, get_dtype_range def skip_if_no_reference_system(func): @@ -86,30 +86,6 @@ def assert_no_external_function(mod): assert not any(attrs), "No function should have an external attribute." -def get_range_for_dtype_str(dtype): - """ - Produces the min,max for a give data type. 
- - Parameters - ---------- - dtype : str - a type string (e.g., int8) - - Returns - ------- - type_info.min : int - the minimum of the range - type_info.max : int - the maximum of the range - """ - - try: - type_info = np.iinfo(dtype) - except ValueError: - type_info = np.finfo(dtype) - return type_info.min, type_info.max - - def make_module(func): """Creates IRModule from Function""" func = relay.Function(relay.analysis.free_vars(func), func) @@ -193,11 +169,11 @@ def get_conv2d_qnn_params( output_zp : int zero point of the output tensor """ - input_dtype_min, input_dtype_max = get_range_for_dtype_str(input_dtype) + input_dtype_min, input_dtype_max = get_dtype_range(input_dtype) input_max = input_scale * (input_dtype_max - input_zp) input_min = input_scale * (input_dtype_min - input_zp) - kernel_dtype_min, kernel_dtype_max = get_range_for_dtype_str(kernel_dtype) + kernel_dtype_min, kernel_dtype_max = get_dtype_range(kernel_dtype) kernel_sc_max = np.max(kernel_scale) kernel_max = kernel_sc_max * (kernel_dtype_max - kernel_zp) @@ -222,7 +198,7 @@ def get_conv2d_qnn_params( output_max = max(output_limits) output_min = min(output_limits) - output_dtype_min, output_dtype_max = get_range_for_dtype_str(output_dtype) + output_dtype_min, output_dtype_max = get_dtype_range(output_dtype) output_scale = (output_max - output_min) / (output_dtype_max - output_dtype_min) output_zp = int(output_dtype_min - (output_min / output_scale)) @@ -236,7 +212,7 @@ def make_qnn_relu(expr, fused_activation_fn, scale, zero_point, dtype): # Get min/max of the output dtype. This will be used to ensure that clip a_min/a_max are not # beyond the dtype range. - qmin, qmax = get_range_for_dtype_str(dtype) + qmin, qmax = get_dtype_range(dtype) # The input expr is a quantized tensor with its scale and zero point. We calculate the # suitable clip off points based on these scale and zero point. diff --git a/tests/python/relay/aot/test_crt_forward_declarations.py b/tests/python/relay/aot/test_crt_forward_declarations.py index 7454f85ed153..e54846f3aaca 100644 --- a/tests/python/relay/aot/test_crt_forward_declarations.py +++ b/tests/python/relay/aot/test_crt_forward_declarations.py @@ -34,30 +34,6 @@ ) -def get_range_for_dtype_str(dtype): - """ - Produces the min,max for a give data type. - - Parameters - ---------- - dtype : str - a type string (e.g., int8) - - Returns - ------- - type_info.min : int - the minimum of the range - type_info.max : int - the maximum of the range - """ - - try: - type_info = np.iinfo(dtype) - except ValueError: - type_info = np.finfo(dtype) - return type_info.min, type_info.max - - def _change_ndarray_layout(arr, src_layout, dst_layout): """Makes a copy of an ndarray, reshaping it to a new data layout. 
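A brief usage sketch of the `get_dtype_range` helper introduced by the patch above
(the shape and seed are illustrative, mirroring the updated tests):

```python
import numpy as np

from tvm.testing.aot import get_dtype_range

# Integer dtypes resolve through np.iinfo, floating-point dtypes through
# np.finfo; anything else raises TypeError.
in_min, in_max = get_dtype_range("int8")     # (-128, 127)
f_min, f_max = get_dtype_range("float32")    # np.finfo("float32") limits

# Typical use in the updated tests: draw quantized inputs over the full range.
rng = np.random.default_rng(12345)
inputs = {"input": rng.integers(in_min, high=in_max, size=(1, 16, 16, 3), dtype="int8")}
```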
From 05ffab90e937a208787aa919c7ddbe270557e090 Mon Sep 17 00:00:00 2001 From: Matveenko Valery <50880524+valmat07@users.noreply.github.com> Date: Tue, 27 Dec 2022 08:19:26 +0100 Subject: [PATCH 086/286] [Pytorch][Relay] aten::_weight_norm implementation (#13661) Add implementation for pytorch weight normalization --- python/tvm/relay/frontend/pytorch.py | 15 ++++++++++++ tests/python/frontend/pytorch/test_forward.py | 24 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index b9d167ad2d86..491c140c5cb4 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -3514,6 +3514,20 @@ def multinomial(self, inputs, input_types): _, indices = _expr.TupleWrapper(output, 2) return indices + def weight_norm(self, inputs, input_types): + weight_v, weight_g = inputs[0], inputs[1] + dim = inputs[2] + dtype = input_types[0] + order = 2.0 + reci_order = _expr.const(1.0 / order, dtype=dtype) + order = _expr.const(order) + + norm_v = _op.power( + _op.reduce.sum(_op.power(_op.abs(weight_v), order), axis=dim, exclude=2, keepdims=True), + reci_order, + ) + return weight_g * (weight_v / norm_v) + # Operator mappings def create_convert_map(self): self.convert_map = { @@ -3781,6 +3795,7 @@ def create_convert_map(self): "aten::__lshift__": self.make_elemwise("left_shift"), "aten::__rshift__": self.make_elemwise("right_shift"), "aten::multinomial": self.multinomial, + "aten::_weight_norm": self.weight_norm, } def update_convert_map(self, custom_map): diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 35242fbf7dde..0035d202ded2 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -5038,6 +5038,30 @@ def _test_multinomial(num_samples): ) +def test_weight_norm(): + """Test for atten::_weight_norm""" + in_channels = 32 + out_channels = 64 + input_data_conv = torch.rand((1, in_channels, 32, 32)).float() + + conv_wn = torch.nn.utils.weight_norm(torch.nn.Conv2d(in_channels, out_channels, kernel_size=3)) + verify_model(conv_wn.eval().float(), input_data_conv) + + conv_wn_groups = torch.nn.utils.weight_norm( + torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, groups=2) + ) + verify_model(conv_wn_groups.eval().float(), input_data_conv) + + conv_wn = torch.nn.utils.weight_norm( + torch.nn.Conv2d(in_channels, out_channels, kernel_size=3), dim=1 + ) + verify_model(conv_wn.eval().float(), input_data_conv) + + linear_wn = torch.nn.utils.weight_norm(torch.nn.Linear(in_channels, out_channels)) + input_data_linear = torch.rand((128, in_channels)).float() + verify_model(linear_wn.eval().float(), input_data_linear) + + @tvm.testing.uses_gpu def test_baddbmm(): def test_fn(alpha, beta): From a9c849425f587d05e7e700c443453ebcb70f8a4c Mon Sep 17 00:00:00 2001 From: Chun-I Tsai Date: Wed, 28 Dec 2022 02:37:51 +0800 Subject: [PATCH 087/286] [Relay][Frontend] Span filling common API (#13402) - Expose and add span attribute of Expr-derived types from C++ to Python - Add common API of span filling - Add test cases of span filling - Add function to control whether to fill span via environment variable - Modify the way of pretty-print to print span Co-authored-by: Joey Tsai --- python/tvm/relay/expr.py | 202 ++++++++++++++++++++++++--- python/tvm/relay/frontend/common.py | 165 +++++++++++++++++++++- python/tvm/relay/function.py | 7 +- python/tvm/relay/loops.py | 2 +- 
python/tvm/testing/utils.py | 22 +++ src/ir/span.cc | 4 + src/relay/ir/expr.cc | 88 ++++++++++-- src/relay/ir/function.cc | 4 +- tests/python/frontend/test_common.py | 194 ++++++++++++++++++++++++- tests/python/relay/utils/tag_span.py | 108 ++++++++++++++ 10 files changed, 750 insertions(+), 46 deletions(-) create mode 100644 tests/python/relay/utils/tag_span.py diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index fefc2857230d..88b84bbe7ebc 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -171,10 +171,28 @@ class Constant(ExprWithOp): ---------- data : tvm.nd.NDArray The data content of the constant expression. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, data): - self.__init_handle_by_constructor__(_ffi_api.Constant, data) + def __init__(self, data, span=None): + self.__init_handle_by_constructor__(_ffi_api.Constant, data, span) + + +@tvm._ffi.register_func("relay.ConstantWithFields") +def ConstantWithFields( + constant, + data=None, + virtual_device=None, + span=None, +): + """ + Returns constant with the given properties. A None property denotes 'no change'. + Returns constant if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.ConstantWithFields(constant, data, virtual_device, span) @tvm._ffi.register_object("relay.Tuple") @@ -187,7 +205,7 @@ class Tuple(ExprWithOp): The fields in the tuple. span: Optional[tvm.relay.Span] - Span that points to original source code + Span that points to original source code. """ def __init__(self, fields, span=None): @@ -205,6 +223,16 @@ def astype(self, _): raise TypeError("astype cannot be used on tuple") +@tvm._ffi.register_func("relay.TupleWithFields") +def TupleWithFields(tup, fields=None, virtual_device=None, span=None): + """ + Returns tuple with the given properties. A None property denotes 'no change'. + Returns tuple if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.TupleWithFields(tup, fields, virtual_device, span) + + @tvm._ffi.register_object("relay.Var") class Var(ExprWithOp): """A local variable in Relay. @@ -221,10 +249,13 @@ class Var(ExprWithOp): type_annotation: tvm.relay.Type, optional The type annotation on the variable. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, name_hint, type_annotation=None): - self.__init_handle_by_constructor__(_ffi_api.Var, name_hint, type_annotation) + def __init__(self, name_hint, type_annotation=None, span=None): + self.__init_handle_by_constructor__(_ffi_api.Var, name_hint, type_annotation, span) @property def name_hint(self): @@ -233,6 +264,16 @@ def name_hint(self): return name +@tvm._ffi.register_func("relay.VarWithFields") +def VarWithFields(variable, vid=None, type_annotation=None, virtual_device=None, span=None): + """ + Returns var with the given properties. A None property denotes 'no change'. + Returns var if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.VarWithFields(variable, vid, type_annotation, virtual_device, span) + + @tvm._ffi.register_object("relay.Call") class Call(ExprWithOp): """Function call node in Relay. @@ -256,7 +297,7 @@ class Call(ExprWithOp): used in advanced usecase of template functions. span: Optional[tvm.relay.Span] - Span that points to original source code + Span that points to original source code. 
""" def __init__(self, op, args, attrs=None, type_args=None, span=None): @@ -265,6 +306,18 @@ def __init__(self, op, args, attrs=None, type_args=None, span=None): self.__init_handle_by_constructor__(_ffi_api.Call, op, args, attrs, type_args, span) +@tvm._ffi.register_func("relay.CallWithFields") +def CallWithFields( + call, op=None, args=None, attrs=None, type_args=None, virtual_device=None, span=None +): + """ + Returns call with the given properties. A None property denotes 'no change'. + Returns call if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.CallWithFields(call, op, args, attrs, type_args, virtual_device, span) + + @tvm._ffi.register_object("relay.Let") class Let(ExprWithOp): """Let variable binding expression. @@ -279,10 +332,23 @@ class Let(ExprWithOp): body: tvm.relay.Expr The body of the let binding. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, variable, value, body): - self.__init_handle_by_constructor__(_ffi_api.Let, variable, value, body) + def __init__(self, variable, value, body, span=None): + self.__init_handle_by_constructor__(_ffi_api.Let, variable, value, body, span) + + +@tvm._ffi.register_func("relay.LetWithFields") +def LetWithFields(let, variable=None, value=None, body=None, virtual_device=None, span=None): + """ + Returns let with the given properties. A None property denotes 'no change'. + Returns let if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.LetWithFields(let, variable, value, body, virtual_device, span) @tvm._ffi.register_object("relay.If") @@ -299,10 +365,25 @@ class If(ExprWithOp): false_branch: tvm.relay.Expr The expression evaluated when condition is false. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, cond, true_branch, false_branch): - self.__init_handle_by_constructor__(_ffi_api.If, cond, true_branch, false_branch) + def __init__(self, cond, true_branch, false_branch, span=None): + self.__init_handle_by_constructor__(_ffi_api.If, cond, true_branch, false_branch, span) + + +@tvm._ffi.register_func("relay.IfWithFields") +def IfWithFields( + if_expr, cond=None, true_branch=None, false_branch=None, virtual_device=None, span=None +): + """ + Returns if with the given properties. A None property denotes 'no change'. + Returns if if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.IfWithFields(if_expr, cond, true_branch, false_branch, virtual_device, span) @tvm._ffi.register_object("relay.TupleGetItem") @@ -316,10 +397,25 @@ class TupleGetItem(ExprWithOp): index: int The index. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, tuple_value, index): - self.__init_handle_by_constructor__(_ffi_api.TupleGetItem, tuple_value, index) + def __init__(self, tuple_value, index, span=None): + self.__init_handle_by_constructor__(_ffi_api.TupleGetItem, tuple_value, index, span) + + +@tvm._ffi.register_func("relay.TupleGetItemWithFields") +def TupleGetItemWithFields( + tuple_get_item, tuple_value=None, index=None, virtual_device=None, span=None +): + """ + Returns tuple_get_item with the given properties. A None property denotes 'no change'. + Returns tuple_get_item if all properties are unchanged. Otherwise, returns a copy with the new + fields. 
+ """ + return _ffi_api.TupleGetItemWithFields(tuple_get_item, tuple_value, index, virtual_device, span) @tvm._ffi.register_object("relay.RefCreate") @@ -329,10 +425,28 @@ class RefCreate(ExprWithOp): ---------- value: tvm.relay.Expr The initial value. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, value): - self.__init_handle_by_constructor__(_ffi_api.RefCreate, value) + def __init__(self, value, span=None): + self.__init_handle_by_constructor__(_ffi_api.RefCreate, value, span) + + +@tvm._ffi.register_func("relay.RefCreateWithFields") +def RefCreateWithFields( + ref_create, + value=None, + virtual_device=None, + span=None, +): + """ + Returns ref_create with the given properties. A None property denotes 'no change'. + Returns ref_create if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.RefCreateWithFields(ref_create, value, virtual_device, span) @tvm._ffi.register_object("relay.RefRead") @@ -342,10 +456,28 @@ class RefRead(ExprWithOp): ---------- ref: tvm.relay.Expr The reference. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, ref): - self.__init_handle_by_constructor__(_ffi_api.RefRead, ref) + def __init__(self, ref, span=None): + self.__init_handle_by_constructor__(_ffi_api.RefRead, ref, span) + + +@tvm._ffi.register_func("relay.RefReadWithFields") +def RefReadWithFields( + ref_read, + ref=None, + virtual_device=None, + span=None, +): + """ + Returns ref_read with the given properties. A None property denotes 'no change'. + Returns ref_read if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.RefReadWithFields(ref_read, ref, virtual_device, span) @tvm._ffi.register_object("relay.RefWrite") @@ -357,12 +489,32 @@ class RefWrite(ExprWithOp): ---------- ref: tvm.relay.Expr The reference. + value: tvm.relay.Expr The new value. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, ref, value): - self.__init_handle_by_constructor__(_ffi_api.RefWrite, ref, value) + def __init__(self, ref, value, span=None): + self.__init_handle_by_constructor__(_ffi_api.RefWrite, ref, value, span) + + +@tvm._ffi.register_func("relay.RefWriteWithFields") +def RefWriteWithFields( + ref_write, + ref=None, + value=None, + virtual_device=None, + span=None, +): + """ + Returns ref_write with the given properties. A None property denotes 'no change'. + Returns ref_write if all properties are unchanged. Otherwise, returns a copy with the new + fields. + """ + return _ffi_api.RefWriteWithFields(ref_write, ref, value, virtual_device, span) class TempExpr(ExprWithOp): @@ -433,7 +585,7 @@ def astype(self, _): raise TypeError("astype cannot be used on tuple") -def var(name_hint, type_annotation=None, shape=None, dtype="float32"): +def var(name_hint, type_annotation=None, shape=None, dtype="float32", span=None): """Create a new tvm.relay.Var. This is a simple wrapper function that allows specify @@ -456,6 +608,9 @@ def var(name_hint, type_annotation=None, shape=None, dtype="float32"): dtype: str, optional The data type of the tensor. + span: Optional[tvm.relay.Span] + Span that points to original source code. + Examples -------- .. 
code-block:: python @@ -476,10 +631,10 @@ def var(name_hint, type_annotation=None, shape=None, dtype="float32"): type_annotation = _ty.TensorType(shape, dtype) elif isinstance(type_annotation, str): type_annotation = _ty.TensorType((), type_annotation) - return Var(name_hint, type_annotation) + return Var(name_hint, type_annotation, span) -def const(value, dtype=None): +def const(value, dtype=None, span=None): """Create a constant value. Parameters @@ -490,6 +645,9 @@ def const(value, dtype=None): dtype: str, optional The data type of the resulting constant. + span: Optional[tvm.relay.Span] + Span that points to original source code. + Note ---- When dtype is None, we use the following rule: @@ -516,7 +674,7 @@ def const(value, dtype=None): if not isinstance(value, _nd.NDArray): raise ValueError("value has to be scalar or NDArray") - return Constant(value) + return Constant(value, span) def bind(expr, binds): diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 660426fb4ad5..5d3b0a334590 100755 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -24,6 +24,7 @@ from tvm.ir import IRModule from tvm.topi.utils import get_const_tuple +from ..expr_functor import ExprMutator from .. import expr as _expr from .. import function as _function from .. import transform as _transform @@ -304,13 +305,16 @@ def __init__(self): self.const_ctr = 1 self.in_padding = False - def new_const(self, value, shape=None, dtype="float32"): + def new_const(self, value, shape=None, dtype="float32", source_name=None): + """Construct a new var expr and add to exprs dictionary""" name = "_param_%d" % (self.const_ctr) if hasattr(value, "shape"): shape = value.shape self.const_ctr += 1 self.params[name] = value self.exprs[name] = _expr.var(name_hint=name, shape=shape, dtype=dtype) + if source_name: + self.exprs[name] = set_span(self.exprs[name], source_name) return self.exprs[name] def get_expr(self, name): @@ -1048,3 +1052,162 @@ def try_resolve_var_to_const(x, graph_params): return _op.const(value, dtype) return x + + +class _SpanFiller(ExprMutator): + """SpanFiller""" + + def __init__(self, span): + ExprMutator.__init__(self) + if isinstance(span, tvm.relay.Span): + self._span = span + elif isinstance(span, str): + self._span = tvm.relay.Span(tvm.relay.SourceName(span), 0, 0, 0, 0) + elif isinstance(span, bytes): + self._span = tvm.relay.Span(tvm.relay.SourceName(span.decode("utf-8")), 0, 0, 0, 0) + else: + assert False, f"unsupported span type: {type(span)}" + + def visit(self, expr): + if hasattr(expr, "span") and expr.span: + return expr + + return super().visit(expr) + + def visit_function(self, fn): + new_params = [self.visit(x) for x in fn.params] + new_body = self.visit(fn.body) + return _function.FunctionWithFields( + fn, list(new_params), new_body, fn.ret_type, fn.type_params, fn.attrs, None, self._span + ) + + def visit_let(self, let): + new_variable = self.visit(let.var) + new_value = self.visit(let.value) + new_body = self.visit(let.body) + return _expr.LetWithFields(let, new_variable, new_value, new_body, None, self._span) + + def visit_call(self, call): + new_args = [self.visit(arg) for arg in call.args] + # call.op might be RelayExpr or Op type + # ExprMutator will return directly if subject belongs to Op type + new_op = self.visit(call.op) + return _expr.CallWithFields( + call, new_op, new_args, call.attrs, call.type_args, None, self._span + ) + + def visit_var(self, var): + return _expr.VarWithFields(var, var.vid, 
var.type_annotation, None, self._span) + + def visit_if(self, ite): + return _expr.IfWithFields( + ite, + self.visit(ite.cond), + self.visit(ite.true_branch), + self.visit(ite.false_branch), + None, + self._span, + ) + + def visit_tuple(self, tup): + return _expr.TupleWithFields( + tup, [self.visit(field) for field in tup.fields], None, self._span + ) + + def visit_tuple_getitem(self, op): + return _expr.TupleGetItemWithFields( + op, self.visit(op.tuple_value), op.index, None, self._span + ) + + def visit_constant(self, const): + return _expr.ConstantWithFields(const, const.data, None, self._span) + + # TODO: Frontend model translation could not use following relay expressions so far, + # enable them when new models/impls leverage these kinds of relay expressions. + def visit_ref_create(self, _): + raise NotImplementedError() + + def visit_ref_write(self, _): + raise NotImplementedError() + + def visit_ref_read(self, _): + raise NotImplementedError() + + def visit_match(self, _): + raise NotImplementedError() + + def fill(self, sym): + """Fill span to sym when it is an expr, or return it without change + + Parameters + ---------- + sym : + A symbol which is generated from the conversion of a frontend operator. + + Returns + ------- + sym: + A expr with span-filled or the original sym. + """ + if isinstance(sym, _expr.TupleWrapper): + return _expr.TupleWrapper(self.visit(sym.tuple_value), sym.size) + elif isinstance(sym, _expr.RelayExpr): + return self.visit(sym) + elif isinstance(sym, list): + assert all( + isinstance(expr, _expr.RelayExpr) for expr in sym + ), f"unexpected relay expressions in {sym}" + return [self.visit(expr) for expr in sym] + elif isinstance(sym, tuple): + # some op conversion may return dummy elements + # e.g. op in frontend/pytorch.py: min_max_common + assert all( + isinstance(expr, (_expr.RelayExpr, type(None))) for expr in sym + ), f"unexpected relay expressions in {sym}" + return tuple(self.visit(expr) if expr else None for expr in sym) + elif isinstance(sym, (float, int)): + return sym + elif isinstance(sym, np.ndarray): + return sym + + raise RuntimeError(f"unsupported type {type(sym)}") + + +def set_span(sym, span): + """ + Recursively tag the span to the symbol. Stop when it encounters a span-tagged expr. Disabled + when setting the "relay.frontend.fill_span" as False to the config of PassContext + + Parameters + ---------- + sym : + A symbol is generated from the conversion of a frontend operator. Raise an error when the + type of the symbol is not supported. + + span : String, Span, or bytes + The source information of the corresponding symbol. + + Returns + ------- + result : + The symbol tagged with span. + + Examples + -------- + .. code-block:: python + + x = set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64") + y = set_span( + relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)), "conv2d" + ) + print(relay.Function([x], y)) + + #fn (%x: Tensor[(1, 64, 56, 56), float32] /* span=x_var:0:0 */) { + # nn.conv2d(%x, meta[relay.Constant][0] /* span=conv2d:0:0 */, ...) 
/* span=conv2d:0:0 */ + #} + """ + + if tvm.transform.PassContext.current().config.get("relay.frontend.fill_span", True): + return _SpanFiller(span).fill(sym) + return sym diff --git a/python/tvm/relay/function.py b/python/tvm/relay/function.py index 6b3513cb5e1a..68d8953900cf 100644 --- a/python/tvm/relay/function.py +++ b/python/tvm/relay/function.py @@ -44,14 +44,17 @@ class Function(BaseFunc): type_params: Optional[List[tvm.relay.TypeParam]] The additional type parameters, this is only used in advanced usecase of template functions. + + span: Optional[tvm.relay.Span] + Span that points to original source code. """ - def __init__(self, params, body, ret_type=None, type_params=None, attrs=None): + def __init__(self, params, body, ret_type=None, type_params=None, attrs=None, span=None): if type_params is None: type_params = convert([]) self.__init_handle_by_constructor__( - _ffi_api.Function, params, body, ret_type, type_params, attrs + _ffi_api.Function, params, body, ret_type, type_params, attrs, span ) def __call__(self, *args): diff --git a/python/tvm/relay/loops.py b/python/tvm/relay/loops.py index 6c2ab2e23d72..d46e34860f0b 100644 --- a/python/tvm/relay/loops.py +++ b/python/tvm/relay/loops.py @@ -54,7 +54,7 @@ def while_loop(cond, loop_vars, loop_bodies): for i, loop_var in enumerate(loop_vars): name = loop_var.name_hint if isinstance(loop_var, _expr.Var) else "arg{}".format(i) - new_var = _expr.var(name, type_annotation=sb.type_of(loop_var)) + new_var = _expr.var(name, type_annotation=sb.type_of(loop_var), span=loop_var.span) fresh_vars.append(new_var) with sb.if_scope(cond(*fresh_vars)): diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 74ca326bca7e..899b05440388 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -2081,3 +2081,25 @@ def pprint(name, obj): f"or an instance of `tvm.tir.PrimFunc`. " f"Instead, received {type(expected)}." ) + + +class _control_span_filling: + def __init__(self, on=True): + self._on = on + self._pass_ctx = tvm.transform.PassContext(config={"relay.frontend.fill_span": self._on}) + + def __enter__(self): + self._pass_ctx.__enter__() + + def __exit__(self, exc_type, exc_val, exc_tb): + self._pass_ctx.__exit__(exc_type, exc_val, exc_tb) + + +class enable_span_filling(_control_span_filling): + def __init__(self): + super().__init__() + + +class disable_span_filling(_control_span_filling): + def __init__(self): + super().__init__(on=False) diff --git a/src/ir/span.cc b/src/ir/span.cc index e19bef4cb864..39f0044d16d3 100644 --- a/src/ir/span.cc +++ b/src/ir/span.cc @@ -20,13 +20,17 @@ * \file span.cc * \brief The span data structure. */ +#include #include +#include #include #include namespace tvm { +TVM_REGISTER_PASS_CONFIG_OPTION("relay.frontend.fill_span", Bool); + ObjectPtr GetSourceNameNode(const String& name) { // always return pointer as the reference can change as map re-allocate. 
// or use another level of indirection by creating a unique_ptr diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 5c85b3b29df7..062d9206cf92 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -72,9 +72,14 @@ Constant::Constant(runtime::NDArray data, Span span) { TVM_REGISTER_NODE_TYPE(ConstantNode); -TVM_REGISTER_GLOBAL("relay.ir.Constant").set_body_typed([](runtime::NDArray data) { - return Constant(data); +TVM_REGISTER_GLOBAL("relay.ir.Constant").set_body_typed([](runtime::NDArray data, Span span) { + return Constant(data, span); }); +TVM_REGISTER_GLOBAL("relay.ir.ConstantWithFields") + .set_body_typed([](Constant constant, Optional opt_data, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(constant, opt_data, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -129,6 +134,11 @@ TVM_REGISTER_NODE_TYPE(TupleNode); TVM_REGISTER_GLOBAL("relay.ir.Tuple").set_body_typed([](tvm::Array fields, Span span) { return Tuple(fields, span); }); +TVM_REGISTER_GLOBAL("relay.ir.TupleWithFields") + .set_body_typed([](Tuple tuple, Optional> opt_fields, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(tuple, opt_fields, opt_virtual_device, opt_span); + }); Tuple WithFields(Tuple tuple, Optional> opt_fields, Optional opt_virtual_device, Optional opt_span) { @@ -200,9 +210,14 @@ Var WithFields(Var var, Optional opt_vid, Optional opt_type_annotation TVM_REGISTER_NODE_TYPE(VarNode); -TVM_REGISTER_GLOBAL("relay.ir.Var").set_body_typed([](String str, Type type_annotation) { - return Var(str, type_annotation); +TVM_REGISTER_GLOBAL("relay.ir.Var").set_body_typed([](String str, Type type_annotation, Span span) { + return Var(str, type_annotation, span); }); +TVM_REGISTER_GLOBAL("relay.ir.VarWithFields") + .set_body_typed([](Var var, Optional opt_vid, Optional opt_type_annotation, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(var, opt_vid, opt_type_annotation, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -278,6 +293,13 @@ TVM_REGISTER_GLOBAL("relay.ir.Call") .set_body_typed([](Expr op, Array args, Attrs attrs, Array type_args, Span span) { return Call(op, args, attrs, type_args, span); }); +TVM_REGISTER_GLOBAL("relay.ir.CallWithFields") + .set_body_typed([](Call call, Optional opt_op, Optional> opt_args, + Optional opt_attrs, Optional> opt_type_args, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(call, opt_op, opt_args, opt_attrs, opt_type_args, opt_virtual_device, + opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -320,9 +342,15 @@ Let WithFields(Let let, Optional opt_var, Optional opt_value, Optiona TVM_REGISTER_NODE_TYPE(LetNode); -TVM_REGISTER_GLOBAL("relay.ir.Let").set_body_typed([](Var var, Expr value, Expr body) { - return Let(var, value, body); +TVM_REGISTER_GLOBAL("relay.ir.Let").set_body_typed([](Var var, Expr value, Expr body, Span span) { + return Let(var, value, body, span); }); +TVM_REGISTER_GLOBAL("relay.ir.LetWithFields") + .set_body_typed([](Let let, Optional opt_var, Optional opt_value, + Optional opt_body, Optional opt_virtual_device, + Optional opt_span) { + return WithFields(let, opt_var, opt_value, opt_body, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, 
ReprPrinter* p) { @@ -367,8 +395,15 @@ If WithFields(If if_expr, Optional opt_cond, Optional opt_true_branc TVM_REGISTER_NODE_TYPE(IfNode); TVM_REGISTER_GLOBAL("relay.ir.If") - .set_body_typed([](Expr cond, Expr true_branch, Expr false_branch) { - return If(cond, true_branch, false_branch); + .set_body_typed([](Expr cond, Expr true_branch, Expr false_branch, Span span) { + return If(cond, true_branch, false_branch, span); + }); +TVM_REGISTER_GLOBAL("relay.ir.IfWithFields") + .set_body_typed([](If if_expr, Optional opt_cond, Optional opt_true_branch, + Optional opt_false_branch, Optional opt_virtual_device, + Optional opt_span) { + return WithFields(if_expr, opt_cond, opt_true_branch, opt_false_branch, opt_virtual_device, + opt_span); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -410,9 +445,15 @@ TupleGetItem WithFields(TupleGetItem tuple_get_item, Optional opt_tuple, TVM_REGISTER_NODE_TYPE(TupleGetItemNode); -TVM_REGISTER_GLOBAL("relay.ir.TupleGetItem").set_body_typed([](Expr tuple, int index) { - return TupleGetItem(tuple, index); +TVM_REGISTER_GLOBAL("relay.ir.TupleGetItem").set_body_typed([](Expr tuple, int index, Span span) { + return TupleGetItem(tuple, index, span); }); +TVM_REGISTER_GLOBAL("relay.ir.TupleGetItemWithFields") + .set_body_typed([](TupleGetItem tuple_get_item, Optional opt_tuple, + Optional opt_index, Optional opt_virtual_device, + Optional opt_span) { + return WithFields(tuple_get_item, opt_tuple, opt_index, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -448,9 +489,14 @@ RefCreate WithFields(RefCreate ref_create, Optional opt_value, TVM_REGISTER_NODE_TYPE(RefCreateNode); -TVM_REGISTER_GLOBAL("relay.ir.RefCreate").set_body_typed([](Expr value) { - return RefCreate(value); +TVM_REGISTER_GLOBAL("relay.ir.RefCreate").set_body_typed([](Expr value, Span span) { + return RefCreate(value, span); }); +TVM_REGISTER_GLOBAL("relay.ir.RefCreateWithFields") + .set_body_typed([](RefCreate ref_create, Optional opt_value, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(ref_create, opt_value, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -486,7 +532,14 @@ RefRead WithFields(RefRead ref_read, Optional opt_ref, TVM_REGISTER_NODE_TYPE(RefReadNode); -TVM_REGISTER_GLOBAL("relay.ir.RefRead").set_body_typed([](Expr ref) { return RefRead(ref); }); +TVM_REGISTER_GLOBAL("relay.ir.RefRead").set_body_typed([](Expr ref, Span span) { + return RefRead(ref, span); +}); +TVM_REGISTER_GLOBAL("relay.ir.RefReadWithFields") + .set_body_typed([](RefRead ref_read, Optional opt_ref, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(ref_read, opt_ref, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -525,9 +578,14 @@ RefWrite WithFields(RefWrite ref_write, Optional opt_ref, Optional o TVM_REGISTER_NODE_TYPE(RefWriteNode); -TVM_REGISTER_GLOBAL("relay.ir.RefWrite").set_body_typed([](Expr ref, Expr value) { - return RefWrite(ref, value); +TVM_REGISTER_GLOBAL("relay.ir.RefWrite").set_body_typed([](Expr ref, Expr value, Span span) { + return RefWrite(ref, value, span); }); +TVM_REGISTER_GLOBAL("relay.ir.RefWriteWithFields") + .set_body_typed([](RefWrite ref_write, Optional opt_ref, Optional opt_value, + Optional opt_virtual_device, Optional opt_span) { + return WithFields(ref_write, opt_ref, 
opt_value, opt_virtual_device, opt_span); + }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc index 1a3db9974f05..07cfb27b1d35 100644 --- a/src/relay/ir/function.cc +++ b/src/relay/ir/function.cc @@ -124,8 +124,8 @@ TVM_REGISTER_NODE_TYPE(FunctionNode); TVM_REGISTER_GLOBAL("relay.ir.Function") .set_body_typed([](tvm::Array params, Expr body, Type ret_type, - tvm::Array ty_params, tvm::DictAttrs attrs) { - return Function(params, body, ret_type, ty_params, attrs); + tvm::Array ty_params, tvm::DictAttrs attrs, Span span) { + return Function(params, body, ret_type, ty_params, attrs, span); }); TVM_REGISTER_GLOBAL("relay.ir.FunctionWithFields") .set_body_typed([](Function function, Optional> opt_params, Optional opt_body, diff --git a/tests/python/frontend/test_common.py b/tests/python/frontend/test_common.py index e706f2af304a..2b35ae71f2d6 100644 --- a/tests/python/frontend/test_common.py +++ b/tests/python/frontend/test_common.py @@ -14,7 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from tvm.relay.frontend.common import StrAttrsDict + +import numpy as np + +from tvm import relay, testing, transform +from tvm.relay.frontend.common import StrAttrsDict, set_span +from relay.utils.tag_span import _set_span, _create_span, _verify_structural_equal_with_span def test_key_is_present(): @@ -27,6 +32,189 @@ def test_key_is_not_present(): assert not attrs.has_attr("b") +class TestSetSpan: + def test_pass_ctx_switch(self): + def _res(should_fill): + if should_fill: + with testing.enable_span_filling(): + return set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + else: + with testing.disable_span_filling(): + return set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + + disable = relay.var("x", shape=(1, 64, 56, 56)) + enable = relay.var("x", shape=(1, 64, 56, 56), span=_create_span("x_var")) + + _verify_structural_equal_with_span(_res(False), disable) + _verify_structural_equal_with_span(_res(True), enable) + + # Should tag all exprs without span, and stop when expr is span-tagged + def test_builtin_tuple(self): + def _res(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.zeros([1, 1, 1]), dtype="int64") + return set_span(tuple([a, b]), "tuple") + + def _golden(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.zeros([1, 1, 1]), dtype="int64", span=_create_span("tuple")) + return tuple([a, b]) + + res_tuple, golden_tuple = _res(), _golden() + assert len(res_tuple) == len(golden_tuple) + for i in range(len(res_tuple)): + _verify_structural_equal_with_span(res_tuple[i], golden_tuple[i]) + + def test_builtin_list(self): + def _res(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.zeros([1, 1, 1]), dtype="int64") + t = relay.Tuple([a, b]) + t_a = relay.TupleGetItem(t, 0) + t_b = relay.TupleGetItem(t, 1) + return set_span([t_a, t_b], "list") + + def _golden(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.zeros([1, 1, 1]), dtype="int64", span=_create_span("list")) + t = relay.Tuple([a, b], span=_create_span("list")) + t_a = relay.TupleGetItem(t, 0, span=_create_span("list")) + t_b = relay.TupleGetItem(t, 1, span=_create_span("list")) + return [t_a, t_b] + + res_list, 
golden_list = _res(), _golden() + assert len(res_list) == len(golden_list) + for i in range(len(res_list)): + _verify_structural_equal_with_span(res_list[i], golden_list[i]) + + def test_var(self): + x = set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + x_expected = relay.var("x", shape=(1, 64, 56, 56), span=_create_span("x_var")) + _verify_structural_equal_with_span(x, x_expected) + + def test_constant(self): + c = set_span(relay.const(np.ones([64, 64, 3, 3]), dtype="int64"), "const_c") + c_expected = relay.const( + np.ones([64, 64, 3, 3]), dtype="int64", span=_create_span("const_c") + ) + _verify_structural_equal_with_span(c, c_expected) + + def test_call(self): + def _res(): + x = set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64") + y = set_span( + relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)), "conv2d" + ) + return relay.Function([x], y) + + def _golden(): + x = relay.var("x", shape=(1, 64, 56, 56), span=_create_span("x_var")) + w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64", span=_create_span("conv2d")) + y = _set_span( + relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)), "conv2d" + ) + return relay.Function([x], y) + + _verify_structural_equal_with_span(_res(), _golden()) + + def test_tuple(self): + def _res(): + a = set_span(relay.const(np.ones([1, 1, 1]), dtype="int64"), "a") + b = relay.const(np.ones([1, 1, 1]), dtype="int64") + t = set_span(relay.Tuple([a, b]), "t") + return relay.Function([], t) + + def _golden(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("t")) + t = relay.Tuple([a, b], span=_create_span("t")) + return relay.Function([], t) + + _verify_structural_equal_with_span(_res(), _golden()) + + def test_tuple_getitem(self): + def _res(): + a = set_span(relay.const(np.ones([1, 1, 1]), dtype="int64"), "a") + b = relay.const(np.ones([1, 1, 1]), dtype="int64") + t = relay.Tuple([a, b]) + i = set_span(relay.TupleGetItem(t, 0), "i") + return relay.Function([], i) + + def _golden(): + a = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("a")) + b = relay.const(np.ones([1, 1, 1]), dtype="int64", span=_create_span("i")) + t = relay.Tuple([a, b], span=_create_span("i")) + i = relay.TupleGetItem(t, 0, span=_create_span("i")) + return relay.Function([], i) + + _verify_structural_equal_with_span(_res(), _golden()) + + def test_let(self): + def _res(): + x = set_span(relay.Var("x"), "x_var") + c_1 = relay.const(np.ones(10)) + add = relay.add(x, x) + body = set_span(relay.Let(x, c_1, add), "let") + + c_2 = set_span(relay.const(np.zeros(10)), "zeros") + y = set_span(relay.add(body, c_2), "add_2") + return relay.Function([x], y) + + def _golden(): + x = relay.Var("x", span=_create_span("x_var")) + c_1 = relay.const(np.ones(10), span=_create_span("let")) + add = _set_span(relay.add(x, x), "let") + body = relay.Let(x, c_1, add, span=_create_span("let")) + + c_2 = relay.const(np.zeros(10), span=_create_span("zeros")) + y = _set_span(relay.add(body, c_2), "add_2") + return relay.Function([x], y) + + _verify_structural_equal_with_span(_res(), _golden()) + + def test_if(self): + def _res(): + x = set_span(relay.var("x", shape=[], dtype="float32"), "x_var") + y = set_span(relay.var("y", shape=[], dtype="float32"), "y_var") + eq = relay.equal(x, y) + + true_branch = set_span(relay.add(x, y), "true_branch") + false_branch = relay.subtract(x, y) + ife = 
set_span(relay.If(eq, true_branch, false_branch), "if") + return relay.Function([x, y], ife) + + def _golden(): + x = relay.var("x", shape=[], dtype="float32", span=_create_span("x_var")) + y = relay.var("y", shape=[], dtype="float32", span=_create_span("y_var")) + eq = _set_span(relay.equal(x, y), "if") + + true_branch = _set_span(relay.add(x, y), "true_branch") + false_branch = _set_span(relay.subtract(x, y), "if") + ife = relay.If(eq, true_branch, false_branch, span=_create_span("if")) + return relay.Function([x, y], ife) + + _verify_structural_equal_with_span(_res(), _golden()) + + def test_fn(self): + def _res(): + x = set_span(relay.var("x", shape=(1, 64, 56, 56)), "x_var") + w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64") + y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)) + f = set_span(relay.Function([x], y), "func") + return f + + def _golden(): + x = relay.var("x", shape=(1, 64, 56, 56), span=_create_span("x_var")) + w = relay.const(np.ones([64, 64, 3, 3]), dtype="int64", span=_create_span("func")) + y = _set_span( + relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1)), "func" + ) + f = relay.Function([x], y, span=_create_span("func")) + return f + + _verify_structural_equal_with_span(_res(), _golden()) + + if __name__ == "__main__": - test_key_is_present() - test_key_is_present() + testing.main() diff --git a/tests/python/relay/utils/tag_span.py b/tests/python/relay/utils/tag_span.py new file mode 100644 index 000000000000..77042be60285 --- /dev/null +++ b/tests/python/relay/utils/tag_span.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import tvm
+from tvm import relay, tir
+from tvm.relay import expr as _expr
+from tvm.relay.expr_functor import ExprVisitor
+
+
+def _set_span(expr, src):
+    if isinstance(expr, _expr.Call):
+        return _expr.CallWithFields(
+            expr, expr.op, expr.args, expr.attrs, expr.type_args, None, _create_span(src)
+        )
+    elif isinstance(expr, _expr.Var):
+        return _expr.VarWithFields(expr, expr.vid, expr.type_annotation, None, _create_span(src))
+    elif isinstance(expr, _expr.TupleGetItem):
+        return _expr.TupleGetItemWithFields(
+            expr, expr.tuple_value, expr.index, None, _create_span(src)
+        )
+    elif isinstance(expr, _expr.Constant):
+        return _expr.ConstantWithFields(expr, expr.data, None, _create_span(src))
+    elif isinstance(expr, _expr.Tuple):
+        return _expr.TupleWithFields(expr, expr.fields, None, _create_span(src))
+    elif isinstance(expr, _expr.TupleWrapper):
+        return _expr.TupleWrapper(_set_span(expr.tuple_value, src), expr.size)
+
+    assert False, f"unsupported type {type(expr)}"
+
+
+def _create_span(src):
+    if isinstance(src, list):
+        tmp_list = []
+        for s in src:
+            if isinstance(s, str):
+                tmp_list.append(_create_span(s))
+            elif isinstance(s, relay.Span):
+                tmp_list.append(s)
+            elif isinstance(s, relay.SequentialSpan):
+                tmp_list.extend(s.spans)
+            elif s is None:
+                tmp_list.append(s)
+            else:
+                assert False, f"unsupported type {type(s)}"
+        return relay.SequentialSpan(tmp_list)
+    return relay.Span(relay.SourceName(src), 0, 0, 0, 0)
+
+
+def _collect_spans(objref):
+    class Collector:
+        def __init__(self):
+            self._spans = []
+
+        def collect(self, objref):
+            if hasattr(objref, "span"):
+                self._spans.append(objref.span)
+
+        @property
+        def get_spans(self):
+            return self._spans
+
+    pov = None
+    if isinstance(objref, relay.Expr):
+        pov = relay.analysis.post_order_visit
+    elif isinstance(objref, (tir.Stmt, tir.expr.PrimExprWithOp)):
+        pov = tir.stmt_functor.post_order_visit
+    else:
+        assert False, f"unsupported type {type(objref)}"
+
+    c = Collector()
+    pov(objref, c.collect)
+    return c.get_spans
+
+
+def _verify_span(lhs, rhs):
+    lhs_spans, rhs_spans = _collect_spans(lhs), _collect_spans(rhs)
+
+    assert len(lhs_spans) == len(rhs_spans)
+
+    for i in range(len(lhs_spans)):
+        assert tvm.ir.structural_equal(lhs_spans[i], rhs_spans[i])
+
+
+def _verify_structural_equal_with_span(lhs, rhs, assert_mode=False, map_free_vars=False):
+    if isinstance(lhs, relay.Var) and isinstance(rhs, relay.Var):
+        # SEqualReduce compares the vid of the Var type. Therefore we only compare the span here.
+        _verify_span(lhs, rhs)
+        return
+
+    if assert_mode:
+        tvm.ir.assert_structural_equal(lhs, rhs, map_free_vars)
+    else:
+        assert tvm.ir.structural_equal(lhs, rhs, map_free_vars)
+
+    _verify_span(lhs, rhs)

From 46bc1360e22bad293d384d5eeea4082cd3a7f93f Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Wed, 28 Dec 2022 00:10:15 +0300
Subject: [PATCH 088/286] [QNN] Change in Pass Context for lookup table
 calculation (#13660)

Motivation:
It is possible to disable specific passes through the "disabled_pass"
parameter in the Pass Context. These "disabled" passes can be optional for
one target but mandatory for another. Since the lookup table for some QNN
operations (tanh, round, etc.) is calculated on the host, and some of the
disabled passes may be required on the host, there is no need to disable
these passes for that evaluation. This constant calculation/evaluation is
orthogonal to the compilation process for the specific target.

What was changed:
This commit creates its own compilation Pass Context for lookup table
calculation and evaluation (for elemwise QNN ops: tanh, sqrt, ...).
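As a rough illustration of the approach (a minimal sketch, not part of the
patch itself; the constant expression is made up for the example, and the
calls mirror the ones in the diff below):

    import tvm
    from tvm import relay

    expr = relay.exp(relay.const(1.0))  # stand-in for a lookup-table constant

    # Inherit everything from the caller's Pass Context except the disabled
    # passes, since the host-side evaluation may need passes that were
    # disabled for the target.
    curr = tvm.ir.transform.PassContext.current()
    with tvm.ir.transform.PassContext(
        opt_level=curr.opt_level,
        required_pass=curr.required_pass,
        disabled_pass=[],  # deliberately empty on the host
        instruments=curr.instruments,
        config=curr.config,
    ):
        result = relay.create_executor("vm", mod=tvm.IRModule.from_expr(expr)).evaluate()()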
---
 python/tvm/relay/qnn/op/canonicalizations.py | 23 ++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/qnn/op/canonicalizations.py b/python/tvm/relay/qnn/op/canonicalizations.py
index 1f2c57c6da34..6bfcd34aba90 100644
--- a/python/tvm/relay/qnn/op/canonicalizations.py
+++ b/python/tvm/relay/qnn/op/canonicalizations.py
@@ -23,10 +23,25 @@

 def run_const_expr(expr: "relay.Expr") -> np.ndarray:
-    """Evaluate a const expression, receiving result as np array."""
-    mod = tvm.IRModule.from_expr(expr)
-    vm_exe = relay.create_executor("vm", mod=mod)
-    return vm_exe.evaluate()().asnumpy()
+    """Evaluate a const expression, receiving the result as an np array.
+
+    If a number of passes are disabled in the current Pass Context, there is no need to disable
+    these passes for const expression evaluation as well. That's why we use an empty list
+    "disabled_pass=[]"; all other arguments are inherited from the current Pass Context.
+    """
+    curr_pass_ctx = tvm.ir.transform.PassContext.current()
+    with tvm.ir.transform.PassContext(
+        opt_level=curr_pass_ctx.opt_level,
+        required_pass=curr_pass_ctx.required_pass,
+        disabled_pass=[],
+        instruments=curr_pass_ctx.instruments,
+        config=curr_pass_ctx.config,
+    ):
+        mod = tvm.IRModule.from_expr(expr)
+        vm_exe = relay.create_executor("vm", mod=mod)
+        output = vm_exe.evaluate()().asnumpy()
+
+    return output

 def create_integer_lookup_table(

From a6337ca14fbbd17d2d7dc9ccafb1989791c15bc3 Mon Sep 17 00:00:00 2001
From: krishnaraj36 <45380557+krishnaraj36@users.noreply.github.com>
Date: Wed, 28 Dec 2022 11:24:11 +0530
Subject: [PATCH 089/286] [CLML][RELAY] Enable Pad and Conv2d layer fusion
 (#13649)

* [CLML][RELAY] Enable Pad and Conv2d layer fusion

Enabled the CLML-supported nn.pad + nn.conv2d fusion pattern in the CLML
pattern table

* Fix pad testcase attributes

* Fix the lint error

* Fix the lint error

* Removed redundant check in clml pattern

* Fix the lint error

Co-authored-by: kvegiraj

---
 python/tvm/relay/op/contrib/clml.py        | 21 +++++++++++++++++++++
 src/relay/backend/contrib/clml/codegen.cc  |  2 +-
 tests/python/contrib/test_clml/test_ops.py |  4 ++--
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index c3d4eb84700d..6453b8a06c9f 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -147,6 +147,23 @@ def conv_pattern():
         pattern = pattern.optional(is_op("clip"))
         return pattern

+    def pad_conv_pattern():
+        """Create a pad with convolution pattern."""
+        pattern = is_op("nn.pad")(wildcard(), is_constant())
+        pattern = is_op("nn.conv2d")(pattern, is_constant())
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
+        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+        pattern = pattern.optional(
+            lambda x: is_tuple_get_item(
+                is_op("nn.batch_norm")(
+                    x, is_constant(), is_constant(), is_constant(), is_constant()
+                )
+            )
+        )
+        pattern = pattern.optional(is_op("nn.relu"))
+        pattern = pattern.optional(is_op("clip"))
+        return pattern
+
     def batch_norm_pattern():
         """Create a batch norm pattern."""
         pattern = is_op("nn.batch_norm")(
@@ -200,9 +217,11 @@ def check_conv(extract):
         while call.op.name != "nn.conv2d":
             call = call.args[0]
+
         attrs, args = call.attrs, call.args
         if attrs.data_layout != "NCHW":
             return False
+
         if (
            (not clip_found)
            and (attrs.kernel_size[0] == 3)
@@ -211,6 +230,7 @@ def check_conv(extract):
            and (attrs.channels == attrs.groups)
        ):
            return False
+
        data_typ = args[0].checked_type
        kernel_typ = args[1].checked_type
        is_depthwise = is_depthwise_conv2d(
@@ -246,6 +266,7 @@ def check_default_op(extract):
            return True

    return [
+        ("clml.pad_conv2d", pad_conv_pattern(), check_conv),
        ("clml.conv2d", conv_pattern(), check_conv),
        ("clml.dense", dense_pattern(), check_default_op),
        ("clml.pad", pad_pattern(), check_pad_op),

diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index 9ecec0c4531f..167c48e1baf5 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -83,7 +83,7 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
    ICHECK(comp.defined()) << "CLML JSON runtime only supports composite functions.";
    const std::string name = comp.value();
    std::shared_ptr<JSONGraphNode> json_node;
-    if (name == "clml.conv2d") {
+    if (name == "clml.conv2d" || name == "clml.pad_conv2d") {
      json_node = CreateCompositeConvJSONNode(cn);
    } else if (name == "clml.batch_norm") {
      json_node = CreateBatchNormJSONNode(cn);

diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index d2431d2dfd3b..da09715fbe4c 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -45,7 +45,7 @@ def _get_conv_model(
    a = relay.var(next(iter(var)), shape=shape, dtype=dtype)
    input_arr = var[next(iter(var))]
    if has_pad:
-        p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
+        p = ((0, 0), (0, 0), (padding[0], padding[0]), (padding[1], padding[1]))
        a = relay.nn.pad(a, pad_width=p)
        padding = (0, 0, 0, 0)
    else:
@@ -97,7 +97,7 @@ def test_conv2d(device, dtype):
    trials = [
        # Normal convolution
        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (False, False, True)],
+        [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (True, False, True)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True)],
        # Normal convolution

From 9c16365f0d57c2e406841e22029c8c9059f8c30e Mon Sep 17 00:00:00 2001
From: Hongyu Cai
Date: Wed, 28 Dec 2022 06:10:50 -0500
Subject: [PATCH 090/286] [TVMScript] More accurate hints for ImportError
 (#13662)

---
 python/tvm/script/highlight.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tvm/script/highlight.py b/python/tvm/script/highlight.py
index d12f6c276767..5cf28fff3a4b 100644
--- a/python/tvm/script/highlight.py
+++ b/python/tvm/script/highlight.py
@@ -160,13 +160,20 @@ def _get_pygments_style(
        if version.parse(pygments.__version__) < version.parse("2.4.0"):
            raise ImportError("Required Pygments version >= 2.4.0 but got " + pygments.__version__)
    except ImportError as err:
+        if err.name == "packaging":
+            name = "packaging"
+        elif err.name == "pygments":
+            name = "Pygments>=2.4.0"
+        else:
+            raise ValueError(f'Package "{err.name}" should not be used')
+
        with warnings.catch_warnings():
            warnings.simplefilter("once", UserWarning)
-            install_cmd = sys.executable + ' -m pip install "Pygments>=2.4.0" --upgrade --user'
+            install_cmd = sys.executable + f' -m pip install "{name}" --upgrade --user'
            warnings.warn(
                str(err)
                + "\n"
-                + "To print highlighted TVM script, please install Pygments:\n"
+                + f"To print highlighted TVM script, please install 
{name}:\n" + install_cmd, category=UserWarning, ) From 96d55ec83fd2e6a5f8baa9727188e9f2b18a86e3 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Wed, 28 Dec 2022 06:11:06 -0500 Subject: [PATCH 091/286] [TIR] Create Layout with specified axis dtype (#13663) --- include/tvm/tir/data_layout.h | 4 ++- python/tvm/tir/data_layout.py | 8 ++++-- src/tir/ir/data_layout.cc | 15 ++++++----- tests/python/unittest/test_tir_data_layout.py | 27 ++++++++++++++++++- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/include/tvm/tir/data_layout.h b/include/tvm/tir/data_layout.h index 81c3e98e663d..7aefef6e485b 100644 --- a/include/tvm/tir/data_layout.h +++ b/include/tvm/tir/data_layout.h @@ -137,8 +137,10 @@ class Layout : public ObjectRef { * the corresponding lower case with factor size * indicates the split dimension. * return undefined layout if "__undef__" is passed. + * \param dtype The dtype of generated axes vars in the returned layout. + * It is required to be integer type. */ - TVM_DLL Layout(const std::string& name); // NOLINT(*) + TVM_DLL Layout(const std::string& name, DataType dtype = DataType::Int(32)); // NOLINT(*) /*! * \brief access the internal node container diff --git a/python/tvm/tir/data_layout.py b/python/tvm/tir/data_layout.py index f46a154612e1..71cc404ee23b 100644 --- a/python/tvm/tir/data_layout.py +++ b/python/tvm/tir/data_layout.py @@ -163,7 +163,7 @@ def backward_shape(self, shape): return _ffi_api.BijectiveLayoutBackwardShape(self, shape) # type: ignore -def layout(layout_str: str) -> Layout: +def layout(layout_str: str, dtype: str = "int32") -> Layout: """Create a layout node from a string. Parameters @@ -177,12 +177,16 @@ def layout(layout_str: str) -> Layout: Here subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + dtype : str + The dtype of generated axes vars in the returned layout. + It is required to be integer type. 
+
    Returns
    -------
    layout : Layout
        The created layout
    """
-    return _ffi_api.Layout(layout_str)  # type: ignore
+    return _ffi_api.Layout(layout_str, dtype)  # type: ignore

 def bijective_layout(

diff --git a/src/tir/ir/data_layout.cc b/src/tir/ir/data_layout.cc
index 3b22ffc71173..3bcb6e8d53fc 100644
--- a/src/tir/ir/data_layout.cc
+++ b/src/tir/ir/data_layout.cc
@@ -90,7 +90,8 @@ Layout::Layout(const Array<IterVar>& axes) {
   data_ = std::move(node);
 }

-Layout::Layout(const std::string& name) {  // NOLINT(*)
+Layout::Layout(const std::string& name, DataType dtype) {  // NOLINT(*)
+  CHECK(dtype.is_int()) << "TypeError: The input dtype should be integer type";
   if (name == "__undef__") return;

   auto node = make_object<LayoutNode>();
@@ -106,14 +107,14 @@
          << " before dimension " << c;
      std::string shape_name("_shape");
      shape_name.insert(0, 1, c);
-      IterVar axis =
-          IterVar(Range(PrimExpr(0), Var(shape_name)), Var(std::string(1, c)), tir::kDataPar);
+      IterVar axis(Range(IntImm(dtype, 0), Var(shape_name, dtype)), Var(std::string(1, c), dtype),
+                   tir::kDataPar);
      node->axes.push_back(axis);
    } else if (c >= 'a' && c <= 'z') {
      ICHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " << factor
                           << " for dimension " << c;
-      IterVar axis =
-          IterVar(Range(PrimExpr(0), PrimExpr(factor)), Var(std::string(1, c)), tir::kDataPar);
+      IterVar axis(Range(IntImm(dtype, 0), IntImm(dtype, factor)), Var(std::string(1, c), dtype),
+                   tir::kDataPar);
      node->axes.push_back(axis);
      factor = 0;
    } else if (c >= '0' && c <= '9') {
@@ -426,7 +427,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
       << ")";
    });

-TVM_REGISTER_GLOBAL("tir.Layout").set_body_typed([](std::string name) { return Layout(name); });
+TVM_REGISTER_GLOBAL("tir.Layout").set_body_typed([](std::string name, DataType dtype) {
+  return Layout(name, dtype);
+});

 TVM_REGISTER_GLOBAL("tir.LayoutIndexOf").set_body_typed([](Layout layout, std::string axis) -> int {
   return layout.IndexOf(LayoutAxis::Get(axis));

diff --git a/tests/python/unittest/test_tir_data_layout.py b/tests/python/unittest/test_tir_data_layout.py
index 5c2eb8febd9b..a76cb50da3bd 100644
--- a/tests/python/unittest/test_tir_data_layout.py
+++ b/tests/python/unittest/test_tir_data_layout.py
@@ -16,8 +16,9 @@
 # under the License.
"""Test layout and bijective-layout node""" +import pytest import tvm -from tvm import te +import tvm.error from tvm.topi.utils import get_const_tuple @@ -52,6 +53,29 @@ def test_layout(): assert layout[-1] == "c" +def test_layout_dtype(): + layout_i32 = tvm.tir.layout("NCHW") + assert layout_i32.axes[0].var.dtype == "int32" + assert layout_i32.axes[0].dom.min.dtype == "int32" + assert layout_i32.axes[0].dom.extent.dtype == "int32" + assert layout_i32.axes[1].var.dtype == "int32" + assert layout_i32.axes[1].dom.min.dtype == "int32" + assert layout_i32.axes[1].dom.extent.dtype == "int32" + + layout_i64 = tvm.tir.layout("NCHW", dtype="int64") + assert layout_i64.axes[2].var.dtype == "int64" + assert layout_i64.axes[2].dom.min.dtype == "int64" + assert layout_i64.axes[2].dom.extent.dtype == "int64" + assert layout_i64.axes[3].var.dtype == "int64" + assert layout_i64.axes[3].dom.min.dtype == "int64" + assert layout_i64.axes[3].dom.extent.dtype == "int64" + + with pytest.raises(TypeError): + tvm.tir.layout("NCHW", dtype="float32") + with pytest.raises(TypeError): + tvm.tir.layout("NCHW", dtype=None) + + def test_bilayout_convertible(): # not convertible assert tvm.tir.bijective_layout("NCHW", "ABCD") is None @@ -88,6 +112,7 @@ def test_bilayout_index(): if __name__ == "__main__": test_layout() + test_layout_dtype() test_bilayout_convertible() test_bilayout_shape() test_bilayout_index() From a188e7cac284904231b3bcf519a966cec395b556 Mon Sep 17 00:00:00 2001 From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com> Date: Wed, 28 Dec 2022 23:35:06 +0300 Subject: [PATCH 092/286] [MetaSchedule] Add "disabled_pass" option in tuning API (#13659) * [MetaSchedule] Add "disabled_pass" option in tuning API Now there is no way to disable passes in MetaShedule tuner. This commit adds new parameter "disabled_pass" in tuning API (tune_relay/compile_relay). It can be used for different experiments and non default behavoir. * Add unit test for 'disabled_pass' parameter in MetaScheduler tuner This commit adds unit test for 'disabled_pass' parameter in MetaSchedule tuner. Test should throw TVMError exception. That's why it is marked as XFAIL. --- python/tvm/meta_schedule/relay_integration.py | 17 ++++++- .../test_meta_schedule_relay_integration.py | 45 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index 0b8705aafea9..876dba106c38 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -17,7 +17,7 @@ """MetaSchedule-Relay integration""" from contextlib import contextmanager from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, Set # isort: off from typing_extensions import Literal @@ -120,6 +120,7 @@ def extract_tasks( ), executor: Optional["relay.backend.Executor"] = None, module_equality: str = "structural", + disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, ) -> List[ExtractedTask]: """Extract tuning tasks from a relay program. @@ -147,6 +148,8 @@ def extract_tasks( given module. The "ignore-ndarray" varint is used for the extracted blocks or in case no anchor block is found. For the definition of the anchor block, see tir/analysis/analysis.py. 
---
 python/tvm/meta_schedule/relay_integration.py | 17 ++++++-
 .../test_meta_schedule_relay_integration.py   | 45 +++++++++++++++++++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py
index 0b8705aafea9..876dba106c38 100644
--- a/python/tvm/meta_schedule/relay_integration.py
+++ b/python/tvm/meta_schedule/relay_integration.py
@@ -17,7 +17,7 @@
 """MetaSchedule-Relay integration"""
 from contextlib import contextmanager
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, Set

 # isort: off
 from typing_extensions import Literal
@@ -120,6 +120,7 @@ def extract_tasks(
     ),
     executor: Optional["relay.backend.Executor"] = None,
     module_equality: str = "structural",
+    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
 ) -> List[ExtractedTask]:
     """Extract tuning tasks from a relay program.
@@ -147,6 +148,8 @@
         given module. The "ignore-ndarray" variant is used for the extracted blocks
         or in case no anchor block is found.
         For the definition of the anchor block, see tir/analysis/analysis.py.
+    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
+        The list of disabled passes

     Returns
     -------
@@ -171,6 +174,7 @@
     with transform.PassContext(
         opt_level=opt_level,
         config=pass_config,
+        disabled_pass=disabled_pass,
     ):
         return list(_extract_task(mod, target, params, module_equality))
@@ -250,6 +254,7 @@ def tune_relay(
     seed: Optional[int] = None,
     module_equality: str = "structural",
     num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical",
+    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
 ) -> Database:
     """Tune a Relay program.
@@ -299,6 +304,8 @@
         For the definition of the anchor block, see tir/analysis/analysis.py.
     num_tuning_cores : Union[Literal["physical", "logical"], int]
         The number of CPU cores to use during tuning.
+    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
+        The list of disabled passes during task extraction

     Returns
     -------
     database : Database
         The database that contains the tuning records
     """
     tasks, task_weights = extracted_tasks_to_tune_contexts(
-        extracted_tasks=extract_tasks(mod, target, params, module_equality=module_equality),
+        extracted_tasks=extract_tasks(
+            mod, target, params, module_equality=module_equality, disabled_pass=disabled_pass
+        ),
         work_dir=work_dir,
         space=space,
         strategy=strategy,
@@ -345,6 +354,7 @@ def compile_relay(
         }
     ),
     executor: Optional["relay.backend.Executor"] = None,
+    disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
 ):
     """Compile a relay program with a MetaSchedule database.
@@ -368,6 +378,8 @@
         The pass configuration
     executor : Optional[relay.backend.Executor]
         The executor to use in relay.build. It is not supported by RelayVM.
+    disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]]
+        The list of disabled passes

     Returns
     -------
@@ -387,6 +399,7 @@
     with transform.PassContext(
         opt_level=opt_level,
         config=pass_config,
+        disabled_pass=disabled_pass,
     ):
         if backend == "graph":
             return relay.build(mod, target=target, params=params, executor=executor)

diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py
index 76d6323f309a..d3731cfa1be8 100644
--- a/tests/python/unittest/test_meta_schedule_relay_integration.py
+++ b/tests/python/unittest/test_meta_schedule_relay_integration.py
@@ -826,5 +826,50 @@ def test_anchor_tuning_cpu_link_params():
     np.testing.assert_allclose(ref, out, atol=1e-3)

+@pytest.mark.xfail(raises=tvm.error.TVMError)
+def test_disabled_pass_param():
+    """
+    Check the 'disabled_pass' parameter in tune_relay. It should throw a
+    TVMError exception if the parameter is handled correctly.
+    """
+    data_shape = [1, 4, 16, 16]
+    weight_shape = [32, 4, 2, 2]
+
+    data = relay.var("data", shape=data_shape, dtype="uint8")
+    weight = relay.var("weight", shape=weight_shape, dtype="int8")
+
+    op = relay.qnn.op.conv2d(
+        data,
+        weight,
+        input_zero_point=relay.const(0),
+        kernel_zero_point=relay.const(0),
+        input_scale=relay.const(0.7),
+        kernel_scale=relay.const(0.3),
+        kernel_size=[2, 2],
+        channels=32,
+    )
+    mod = tvm.IRModule.from_expr(op)
+
+    weight_np = np.random.randint(-10, 10, size=weight_shape).astype("int8")
+    params = {"weight": weight_np}
+
+    executor = relay.backend.Executor("graph", {"link-params": True})
+    mod = mod.with_attr("executor", executor)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target="llvm --num-cores=4",
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=4,
+            strategy="replay-trace",
+            disabled_pass=["qnn.Legalize"],
+        )
+
+    # The test has failed; otherwise we could not reach this point.
+    pytest.fail("'disabled_pass' argument does not work")

 if __name__ == "__main__":
     tvm.testing.main()

From cede8502f9498f84121d53a38ac80d7365c2209f Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 29 Dec 2022 18:26:09 -0500
Subject: [PATCH 093/286] [CONTAINER] Struct Hash/Equal and JSON support for
 ShapeTuple (#13671)

This PR adds struct equal/hash and JSON serialization support for
ShapeTuple. Test cases added.
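To illustrate what the new support enables, a small sketch (assuming the
Python bindings exercised by the tests below; the shape values are
arbitrary):

    import tvm

    shape = tvm.runtime.ShapeTuple([1, 2, 3])

    # Structural hash/equal now cover ShapeTuple...
    assert tvm.ir.structural_equal(shape, tvm.runtime.ShapeTuple([1, 2, 3]))

    # ...and so does JSON serialization, which the new pickle test builds on.
    json_str = tvm.ir.save_json(shape)
    restored = tvm.ir.load_json(json_str)
    assert tvm.ir.structural_equal(restored, shape)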
---
 src/node/structural_hash.cc                   | 44 +++++++++++++++++++
 src/support/base64.h                          |  9 +++-
 .../test_container_structural_equal.py       | 14 ++++++
 .../python/unittest/test_runtime_container.py |  5 +++
 4 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index 1d1185cddc3d..0426b8454dce 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -484,6 +484,50 @@ TVM_REGISTER_REFLECTION_VTABLE(ArrayNode, ArrayNodeTrait)
      return ::tvm::runtime::make_object<ArrayNode>();
    });

+struct ShapeTupleObjTrait {
+  static constexpr const std::nullptr_t VisitAttrs = nullptr;
+
+  static void SHashReduce(const ShapeTupleObj* self, SHashReducer hash_reduce) {
+    hash_reduce(self->size);
+    for (size_t i = 0; i < self->size; ++i) {
+      hash_reduce(self->data[i]);
+    }
+  }
+
+  static bool SEqualReduce(const ShapeTupleObj* lhs, const ShapeTupleObj* rhs,
+                           SEqualReducer equal) {
+    if (lhs->size != rhs->size) return false;
+    for (size_t i = 0; i < lhs->size; ++i) {
+      if (!equal(lhs->data[i], rhs->data[i])) return false;
+    }
+    return true;
+  }
+};
+
+TVM_REGISTER_REFLECTION_VTABLE(ShapeTupleObj, ShapeTupleObjTrait)
+    .set_creator([](const std::string& blob) {
+      // Store the shape tuple in a blob to avoid large integer overflow in JSON.
+      dmlc::MemoryStringStream mstrm(const_cast<std::string*>(&blob));
+      support::Base64InStream b64strm(&mstrm);
+      b64strm.InitPosition();
+      uint64_t size;
+      b64strm.Read(&size);
+      std::vector<int64_t> data(size);
+      b64strm.ReadArray(data.data(), size);
+      ShapeTuple shape(data);
+      return RefToObjectPtr::Get(shape);
+    })
+    .set_repr_bytes([](const Object* n) -> std::string {
+      std::string blob;
+      dmlc::MemoryStringStream mstrm(&blob);
+      support::Base64OutStream b64strm(&mstrm);
+      const auto* shape = static_cast<const ShapeTupleObj*>(n);
+      b64strm.Write(shape->size);
+      b64strm.WriteArray(shape->data, shape->size);
+      b64strm.Finish();
+      return blob;
+    });
+
 struct MapNodeTrait {
   static constexpr const std::nullptr_t VisitAttrs = nullptr;

diff --git a/src/support/base64.h b/src/support/base64.h
index 7b37afce66cc..aba4197bce20 100644
--- a/src/support/base64.h
+++ b/src/support/base64.h
@@ -115,8 +115,10 @@ class Base64InStream : public dmlc::Stream {
   }
   /*! \brief whether current position is end of a base64 stream */
   bool IsEOF(void) const { return num_prev_ == 0 && (temp_ch_ == EOF || isspace(temp_ch_)); }
+
+  using dmlc::Stream::Read;
   // override read function.
-  virtual size_t Read(void* ptr, size_t size) {
+  size_t Read(void* ptr, size_t size) final {
     using base64::DecodeTable;
     if (size == 0) return 0;
     // use tlen to record left size
@@ -224,7 +226,10 @@ class Base64InStream : public dmlc::Stream {
 class Base64OutStream : public dmlc::Stream {
  public:
   explicit Base64OutStream(dmlc::Stream* fp) : fp_(fp) {}
-  virtual void Write(const void* ptr, size_t size) {
+
+  using dmlc::Stream::Write;
+
+  void Write(const void* ptr, size_t size) final {
     using base64::EncodeTable;
     size_t tlen = size;
     const unsigned char* cptr = static_cast<const unsigned char*>(ptr);

diff --git a/tests/python/unittest/test_container_structural_equal.py b/tests/python/unittest/test_container_structural_equal.py
index cdd9ffb7af53..61511c609ca4 100644
--- a/tests/python/unittest/test_container_structural_equal.py
+++ b/tests/python/unittest/test_container_structural_equal.py
@@ -107,6 +107,20 @@ def test_array_structural_equal_to_self(contents):
     assert get_first_mismatch_ensure_symmetry(a, b) is None

+@pytest.mark.parametrize(
+    "contents",
+    [
+        [],
+        [1],
+        [1, 2, 3],
+    ],
+)
+def test_shape_tuple_structural_equal_to_self(contents):
+    a = tvm.runtime.ShapeTuple(list(contents))
+    b = tvm.runtime.ShapeTuple(list(contents))
+    assert get_first_mismatch_ensure_symmetry(a, b) is None
+
+
 @pytest.mark.parametrize(
     "a, b, expected_a_path, expected_b_path",
     [

diff --git a/tests/python/unittest/test_runtime_container.py b/tests/python/unittest/test_runtime_container.py
index 8c302e920577..7538075ae7f8 100644
--- a/tests/python/unittest/test_runtime_container.py
+++ b/tests/python/unittest/test_runtime_container.py
@@ -90,6 +90,11 @@ def test_shape_tuple():
     # ShapeTuple vs. ShapeTuple
     assert stuple == _container.ShapeTuple(shape)

+    # test pickle
+    z = pickle.loads(pickle.dumps(stuple))
+    assert isinstance(z, tvm.runtime.ShapeTuple)
+    assert stuple == z
+

 if __name__ == "__main__":
     test_string()

From e9cd558992abb445526ef8a190e41cb494e43d94 Mon Sep 17 00:00:00 2001
From: Siva
Date: Fri, 30 Dec 2022 19:40:51 +0530
Subject: [PATCH 094/286] [RUNTIME][OPENCL] OpenCL host pointer support to
 achieve zero copy (#13413)

* [RUNTIME][OPENCL] OpenCL host pointer support to achieve zero copy

OpenCL supports device memory access from the host by memory mapping. The
OpenCL flag "CL_MEM_ALLOC_HOST_PTR" enables this while creating a memory
object.
We enable this feature via the compilation setting
"USE_OPENCL_ENABLE_HOST_PTR", followed by a new API "GetNativePtr" on
OpenCLWorkspace. This allows the application to directly use
hardware-allocated memory while preparing the input. Pseudo code looks
like:

auto narr = tvm::runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0});
OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
void *nptr = workspace->GetNativePtr(narr);
... access the memory pointed to by nptr, up to the tensor size ...
tvm::runtime::PackedFunc set_input = mod.GetFunction("set_input_zero_copy");
set_input(i, narr);

---
 CMakeLists.txt                               |  1 +
 cmake/config.cmake                           |  5 +++
 cmake/modules/LibInfo.cmake                  |  1 +
 cmake/modules/OpenCL.cmake                   |  3 ++
 cmake/modules/contrib/CLML.cmake             |  4 ++-
 src/runtime/opencl/opencl_common.h           |  5 +++
 src/runtime/opencl/opencl_device_api.cc      | 32 +++++++++++++++--
 .../opencl/opencl_wrapper/opencl_wrapper.cc  | 31 ++++++++++++++++
 src/support/libinfo.cc                       |  5 +++
 tests/cpp-runtime/opencl/opencl_nativeptr.cc | 36 +++++++++++++++++++
 10 files changed, 119 insertions(+), 4 deletions(-)
 create mode 100644 tests/cpp-runtime/opencl/opencl_nativeptr.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b59d5ab69185..b774181f5f71 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ endif()
 # Alternatively, use cmake -DOPTION=VALUE through command-line.
 tvm_option(USE_CUDA "Build with CUDA" OFF)
 tvm_option(USE_OPENCL "Build with OpenCL" OFF)
+tvm_option(USE_OPENCL_ENABLE_HOST_PTR "Enable OpenCL memory object access to host" OFF)
 tvm_option(USE_OPENCL_GTEST "Path to OpenCL specific gtest version for runtime cpp tests." /path/to/opencl/gtest)
 tvm_option(USE_VULKAN "Build with Vulkan" OFF)

diff --git a/cmake/config.cmake b/cmake/config.cmake
index 5a93f9db652b..952c4a9cc814 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -71,6 +71,11 @@ set(USE_AOCL OFF)
 # - /path/to/opencl-sdk: use specific path to opencl-sdk
 set(USE_OPENCL OFF)

+# Whether to allow OpenCL cl_mem access to host
+# cl_mem will be allocated with CL_MEM_ALLOC_HOST_PTR
+# The OpenCLWorkspace->GetNativePtr API returns the host accessible pointer
+set(USE_OPENCL_ENABLE_HOST_PTR OFF)
+
 # Whether enable Metal runtime
 set(USE_METAL OFF)

diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 7c24088c0ad2..042fa3c6ddd7 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -89,6 +89,7 @@ function(add_lib_info src_file)
    TVM_INFO_USE_MSVC_MT="${USE_MSVC_MT}"
    TVM_INFO_USE_NNPACK="${USE_NNPACK}"
    TVM_INFO_USE_OPENCL="${USE_OPENCL}"
+    TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR="${USE_OPENCL_ENABLE_HOST_PTR}"
    TVM_INFO_USE_OPENCL_GTEST="${USE_OPENCL_GTEST}"
    TVM_INFO_USE_OPENMP="${USE_OPENMP}"
    TVM_INFO_USE_PAPI="${USE_PAPI}"

diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 1e1041efe386..ced2da2d17e3 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -74,6 +74,9 @@ if(USE_OPENCL)
     target_link_libraries(opencl-cpptest PRIVATE gtest_main tvm_runtime)
   endif()
   list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+  if(USE_OPENCL_ENABLE_HOST_PTR)
+    add_definitions(-DOPENCL_ENABLE_HOST_PTR)
+  endif(USE_OPENCL_ENABLE_HOST_PTR)
 else()
   list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc)
 endif(USE_OPENCL)

diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake
index 30e60423b03b..e86a7e1ae032 100644
--- a/cmake/modules/contrib/CLML.cmake
+++ b/cmake/modules/contrib/CLML.cmake
@@ -54,5 +54,7 @@ if(USE_CLML_GRAPH_EXECUTOR)
   file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
   set(USE_OPENCL ON)
-
+  if(USE_OPENCL_ENABLE_HOST_PTR)
+    add_definitions(-DOPENCL_ENABLE_HOST_PTR)
+  endif(USE_OPENCL_ENABLE_HOST_PTR)
 endif()

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index f0a68864d724..7bbb358f8f92 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -212,6 +212,7 @@ inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
 }

 class OpenCLThreadEntry;
+struct BufferDescriptor;

 /*!
  * \brief Process global OpenCL workspace.
@@ -290,6 +291,7 @@ class OpenCLWorkspace : public DeviceAPI {
   void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final;
   void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                        Optional<String> mem_scope = NullOpt) final;
+  void* GetNativePtr(const tvm::runtime::NDArray& narr);
   void FreeDataSpace(Device dev, void* ptr) final;
   void StreamSync(Device dev, TVMStreamHandle stream) final;
   void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final;
@@ -310,6 +312,8 @@
   void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final;

+  void* CreateHostPtrIfEnabled(BufferDescriptor* desc, Device dev, size_t size);
+
 private:
   std::string GetError() {
     if (this->devices.size() == 0) return noDevicesErrorMsg;
@@ -377,6 +381,7 @@ struct BufferDescriptor {
   static String ScopeFromMemoryLayout(MemoryLayout mem_scope);

   cl_mem buffer{nullptr};
+  cl_uchar* host_ptr{nullptr};
   MemoryLayout layout{MemoryLayout::kBuffer1D};
 };
 }  // namespace cl

diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 1244fddf0983..aa31d80d6e8b 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -29,6 +29,12 @@

 #include "opencl_common.h"

+#ifdef OPENCL_ENABLE_HOST_PTR
+#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR
+#else
+#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE
+#endif
+
 namespace tvm {
 namespace runtime {
 namespace cl {
@@ -191,6 +197,17 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
   }
 }

+void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device dev, size_t size) {
+#if defined(OPENCL_ENABLE_HOST_PTR)
+  cl_int err_code;
+  desc->host_ptr = reinterpret_cast<cl_uchar*>(
+      clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0,
+                         sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code));
+  OPENCL_CHECK_ERROR(err_code);
+#endif  // OPENCL_ENABLE_HOST_PTR
+  return desc;
+}
+
 void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
                                       DLDataType type_hint) {
   this->Init();
@@ -201,10 +218,10 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
   if (size == 0) {
     size = 1;
   }
-  desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
+  desc->buffer = clCreateBuffer(this->context, CL_MEM_CREATE_FLAGS, size, nullptr, &err_code);
   desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
   OPENCL_CHECK_ERROR(err_code);
-  return desc;
+  return CreateHostPtrIfEnabled(desc, dev, size);
 }

 void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
@@ -226,12 +243,21 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape,
   return desc;
 }

+void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::NDArray& narr) {
+  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data);
+  return desc->host_ptr;
+}
+
 void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
   // We have to make sure that the memory object is not in the command queue
   // for some OpenCL platforms.
   OPENCL_CALL(clFinish(this->GetQueue(dev)));

   cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
+  if (desc->host_ptr) {
+    clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
+                            reinterpret_cast<void*>(desc->host_ptr), 0, NULL, NULL);
+  }
   OPENCL_CALL(clReleaseMemObject(desc->buffer));
   delete desc;
 }
@@ -245,7 +271,7 @@ cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
   cl_image_format format = {CL_RGBA, cl_type};
   cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
   cl_mem mptr =
-      clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, nullptr, &err_code);
+      clCreateImage(this->context, CL_MEM_CREATE_FLAGS, &format, &descriptor, nullptr, &err_code);
   OPENCL_CHECK_ERROR(err_code);
   return mptr;
 }

diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
index c447ebcb5339..2c2768945424 100644
--- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
+++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
@@ -173,6 +173,11 @@
 using f_clEnqueueNDRangeKernel = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, cl_event*);
 using f_clCreateCommandQueue = cl_command_queue (*)(cl_context, cl_device_id,
                                                     cl_command_queue_properties, cl_int*);
+using f_clEnqueueUnmapMemObject = cl_int (*)(cl_command_queue, cl_mem, void*, cl_uint,
+                                             const cl_event*, cl_event*);
+using f_clEnqueueMapBuffer = void* (*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t,
+                                       size_t, cl_uint, const cl_event*, cl_event*, cl_int*);
+
 }  // namespace

 cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms) {
@@ -572,3 +577,29 @@ cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
     return nullptr;
   }
 }
+
+cl_int clEnqueueUnmapMemObject(cl_command_queue queue, cl_mem memobj, void* mapped_ptr,
+                               cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                               cl_event* event) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueUnmapMemObject)lib.getOpenCLFunction("clEnqueueUnmapMemObject");
+  if (func) {
+    return func(queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+void* clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
+                         cl_map_flags map_flags, size_t offset, size_t cb,
+                         cl_uint num_events_in_wait_list, const cl_event* event_wait_list,
+                         cl_event* event, cl_int* errcode_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clEnqueueMapBuffer)lib.getOpenCLFunction("clEnqueueMapBuffer");
+  if (func) {
+    return func(command_queue, buffer, blocking_map, map_flags, offset, cb, num_events_in_wait_list,
+                event_wait_list, event, errcode_ret);
+  } else {
+    return nullptr;
+  }
+}

diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index c0fc9881b4f5..7bb1e04920fa 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -43,6 +43,10 @@
 #define TVM_INFO_USE_OPENCL "NOT-FOUND"
 #endif

+#ifndef TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR
+#define TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR "NOT-FOUND"
+#endif
+
 #ifndef TVM_INFO_USE_OPENCL_GTEST
 #define TVM_INFO_USE_OPENCL_GTEST "NOT-FOUND"
 #endif
@@ -294,6 +298,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
    {"USE_MSVC_MT", TVM_INFO_USE_MSVC_MT},
    {"USE_NNPACK", TVM_INFO_USE_NNPACK},
    {"USE_OPENCL", TVM_INFO_USE_OPENCL},
+    {"USE_OPENCL_ENABLE_HOST_PTR", TVM_INFO_USE_OPENCL_ENABLE_HOST_PTR},
    {"USE_OPENCL_GTEST", TVM_INFO_USE_OPENCL_GTEST},
    {"USE_OPENMP", TVM_INFO_USE_OPENMP},
    {"USE_PAPI", TVM_INFO_USE_PAPI},

diff --git a/tests/cpp-runtime/opencl/opencl_nativeptr.cc b/tests/cpp-runtime/opencl/opencl_nativeptr.cc
new file mode 100644
index 000000000000..ebfb62e92069
--- /dev/null
+++ b/tests/cpp-runtime/opencl/opencl_nativeptr.cc
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "../src/runtime/opencl/opencl_common.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::cl;
+
+#if defined(OPENCL_ENABLE_HOST_PTR)
+TEST(OpenCLNDArray, native_ptr) {
+  OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
+
+  auto A = tvm::runtime::NDArray::Empty({128, 128}, {kDLFloat, 32, 1}, {kDLOpenCL, 0});
+  void* nptr = workspace->GetNativePtr(A);
+  memset(nptr, 0x0, 128 * 128 * 4);
+}
+#endif

From 0fad2dec60bfbbaa3e9b3946d761fc7277689d9f Mon Sep 17 00:00:00 2001
From: lightzhan <1126207509@qq.com>
Date: Fri, 30 Dec 2022 22:11:17 +0800
Subject: [PATCH 095/286] [BugFix] Pylance emits the warning 'Code is
 unreachable' (#13673)

Fix the issue where Pylance emits the warning 'Code is unreachable' for
the code below calls to the functions tvm.ir.load_json/save_json.

Co-authored-by: lightzhan-intellif

---
 python/tvm/ir/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py
index c6b30d38edac..d754ae567c5e 100644
--- a/python/tvm/ir/base.py
+++ b/python/tvm/ir/base.py
@@ -120,7 +120,7 @@ def get(name):
     return _ffi_api.EnvFuncGet(name)

-def load_json(json_str):
+def load_json(json_str) -> Object:
     """Load tvm object from json_str.

     Parameters
@@ -141,7 +141,7 @@
     return tvm.runtime._ffi_node_api.LoadJSON(json_str)

-def save_json(node):
+def save_json(node) -> str:
     """Save tvm object as json string.

     Parameters

From 59e7a5c60619c4fcf85595cfc7080c8f05ea91a7 Mon Sep 17 00:00:00 2001
From: Elen Kalda
Date: Sun, 1 Jan 2023 01:18:22 +0200
Subject: [PATCH 096/286] [TOPI][bugfix] Fix a bug in arm_cpu int8 dotprod
 schedule and modernize tests (#13669)

topi.arm_cpu.schedule_conv2d_NHWC_quantized_native was failing compilation
when the number of input channels divided by 4 was less than 4. This was
because we were splitting this axis by a factor of 4 to create an
appropriate loop nest for tensorize, but tensorize then assumed the bound
of the outer axis was divisible by 4. If the outer bound was less than 4,
compilation failed; if it was greater than 4 but not divisible by 4, we
were occasionally accessing data outside of the tensor, which luckily was
padded due to alignment (I think). So here we make sure that we explicitly
pad the input axis such that the outer loop extent is always divisible
by 4.
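To make the new padding rule concrete, here is a small worked example of
the logic added to conv2d_gemm_weight_transform below (the numbers are
illustrative only):

    tile_cols = 4            # column tile size used by the schedule
    column_multiplier = 4    # tensorize consumes 4 column tiles at once
    K = 24                   # e.g. kernel_h * kernel_w * in_channels
    tile_cols_multiplied = tile_cols * column_multiplier  # 16
    K_misalignment = K % tile_cols_multiplied             # 24 % 16 = 8
    pad_K = tile_cols_multiplied - K_misalignment if K_misalignment != 0 else 0  # 8
    K_padded = K + pad_K     # 32, so the outer loop extent 32 / 4 / 4 = 2 is whole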
There are also some refactors to test_topi_conv2d_int8.py:
- decouple the tests using pytest.parametrize
- extend the NHWC int8 schedules test to test against Arm targets and
  various schedules. When these schedules were initially added, we didn't
  have Arm CI, so only compilation was tested; now we can also run the
  workloads on Arm targets.
---
 python/tvm/topi/nn/conv2d.py             |   10 +-
 .../topi/python/test_topi_conv2d_int8.py | 1110 ++++++++---------
 2 files changed, 537 insertions(+), 583 deletions(-)

diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index db1bcaa27694..92b5a90e5b11 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -606,8 +606,14 @@ def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols):
     if N % tile_rows != 0:
         pad_N = tile_rows - (N % tile_rows)

-    if K % tile_cols != 0:
-        pad_K = tile_cols - (K % tile_cols)
+    # Tensorize will later make use of 4 tiles at once across the columns, so make sure we pad
+    # such that the number of columns is a multiple of 4
+    column_multiplier = 4
+    tile_cols_multiplied = tile_cols * column_multiplier
+    K_misalignment = K % tile_cols_multiplied
+
+    if K_misalignment != 0:
+        pad_K = tile_cols_multiplied - K_misalignment

     N_padded = N + pad_N
     K_padded = K + pad_K

diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index c84f39ab5a66..e05dba3dfee4 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -28,6 +28,7 @@
 from tvm.topi.utils import get_const_tuple
 from tvm.topi.nn.conv2d import _get_workload
 from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8
+from tvm.testing.aot import get_dtype_range
 from common import Int8Fallback
 import tvm.testing
@@ -35,67 +36,146 @@
 import platform

-def compile_conv2d_NHWC_gemm_int8_arm(
-    batch,
-    in_channel,
-    in_size,
-    num_filter,
-    kernel,
-    stride,
-    padding,
-    dilation=1,
-    add_bias=False,
-    add_relu=False,
-):
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
-    padding_sum = pad_top + pad_left + pad_bottom + pad_right
-    print(
-        "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
-        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)
-    )
-
-    in_height = in_width = in_size
-    A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8")
-    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8")
-    bias = te.placeholder((num_filter,), name="bias", dtype="int8")
-    dtype = "int32"
-    devices = [
-        (
-            "llvm --device arm_cpu --mtriple aarch64-linux-gnu",
-            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-        ),
-        (
-            "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
-            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
-            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
-        ),
+devices = [
+    (
+        "llvm",
topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + # TODO(giuseros) We need LLVM-11 in order to compile with +i8mm extension + # ( + # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", + # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + # ), +] + + +@tvm.testing.requires_llvm +@pytest.mark.parametrize("device", devices) +@pytest.mark.parametrize( + "params", + [ + # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) + (1, 3, 299, 32, 3, 2, "SAME", 1, False, False), + (1, 32, 149, 32, 3, 1, "SAME", 2, False, False), + (4, 32, 147, 64, 3, 1, "SAME", 1, False, False), + (1, 64, 73, 80, 1, 1, "SAME", 1, False, False), + (1, 80, 73, 192, 3, 1, "SAME", 1, False, False), + (1, 192, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 192, 35, 64, 1, 1, "VALID", 1, False, False), + (1, 192, 35, 32, 1, 1, "SAME", 1, False, False), + (1, 48, 35, 64, 5, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 1, "SAME", 1, False, False), + (1, 256, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 256, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 64, 1, 1, "SAME", 1, False, False), + (1, 288, 35, 48, 1, 1, "SAME", 1, False, False), + (1, 96, 35, 96, 3, 2, "SAME", 1, False, False), + (1, 128, 17, 192, 7, 1, "SAME", 2, False, False), + (1, 160, 17, 160, 7, 1, "SAME", 1, False, False), + (1, 160, 17, 192, 1, 1, "VALID", 1, False, False), + (1, 192, 17, 192, 1, 1, "SAME", 1, False, False), + (1, 768, 5, 128, 1, 1, "SAME", 1, False, False), + (1, 192, 17, 320, 3, 2, "SAME", 1, False, False), + (1, 192, 17, 192, 3, 2, "SAME", 1, False, False), + (1, 1280, 8, 192, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 1280, 8, 448, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 1, 1, "SAME", 1, False, False), + (1, 384, 8, 384, 3, 1, "SAME", 1, False, False), + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + (1, 2048, 8, 320, 1, 1, "SAME", 1, False, False), + (1, 2048, 8, 448, 1, 1, "SAME", 1, True, True), + (1, 2048, 8, 192, 1, 1, "SAME", 1, True, False), + # A trouble case for native schedule + (1, 8, 1, 24, 1, 1, "SAME", 1, False, False), + ], +) +def test_conv2d_NHWC_gemm_int8(params, device): + + with Int8Fallback(): + target, compute, schedule = device + ( - "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", - topi.arm_cpu.compute_conv2d_NHWC_quantized_native, - topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, - ), - # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension - # ( - # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", - # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, - # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, - # ), - ] - - for device_tuple in devices: - 
target = device_tuple[0] - compute = device_tuple[1] - schedule = device_tuple[2] - - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Compiling on arm AArch64 target: %s" % target) - with tvm.target.Target(target) as tvm_target: - assert tvm_target.features.is_aarch64, "AArch64 target not recognized" + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + + dtype = "int8" + + # TODO(ekalda): These combinations hang during compilation + failing_cases = [ + (devices[1], (1, 128, 17, 192, 7, 1, "SAME", 2, False, False)), + (devices[1], (1, 160, 17, 160, 7, 1, "SAME", 1, False, False)), + ( + devices[1], + (1, 448, 8, 384, 3, 1, "VALID", 1, False, False), + ), # this one passes but is just incredibly slow + ] + if (device, params) in failing_cases: + pytest.skip("Skipping because this test will hang") + + print("Compiling for target: %s" % target) + + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) + + in_height = in_width = in_size + + a_shape = (batch, in_height, in_width, in_channel) + w_shape = (kernel, kernel, in_channel, num_filter) + bias_shape = (num_filter,) + + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NHWC_gemm_int8") + def get_ref_data(): + input_min, input_max = get_dtype_range(dtype) + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype(dtype) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) + c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + return a_np, w_np, b_np, c_np + + with tvm.target.Target(target) as tvm_target: + A = te.placeholder(a_shape, name="A", dtype=dtype) + W = te.placeholder(w_shape, name="W", dtype=dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=dtype) C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) @@ -103,573 +183,441 @@ def compile_conv2d_NHWC_gemm_int8_arm( C = topi.nn.relu(C) s = schedule([C]) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: + build_args = [A, W, bias, C] if add_bias else [A, W, C] + func = tvm.build( s, - [A, W, C], + build_args, target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), ) + build_only = tvm_target.features.is_aarch64 and (platform.machine() != "aarch64") -def verify_conv2d_NHWC_gemm_int8( - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - 
dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") - W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") - bias = te.placeholder((num_filter,), name="bias", dtype="int8") - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) - c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) - - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - - check_target("llvm") - - -def verify_conv2d_NCHWc_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - dtype = 
A.dtype - out_dtype = "int32" if in_dtype == "int8" else "uint32" - lo = -128 if in_dtype == "int8" else 0 - hi = 127 if in_dtype == "int8" else 255 - - def check_target(target, compute, schedule, oc_block_factor, build_only): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return - - bias = te.placeholder( - (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype + if build_only: + return + + print("Running on target: %s" % target) + + dev = tvm.device(target, 0) + a_np, w_np, b_np, c_np = get_ref_data() + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + + run_args = [a, w, b, c] if add_bias else [a, w, c] + func(*run_args) + + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + + +@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + # ResNet18 workloads where channels in / out are multiple of oc_block_factor + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 1, 1, 0, 1, False, False), + (1, 64, 56, 128, 3, 2, 1, 1, False, False), + (1, 64, 56, 128, 1, 2, 0, 1, False, False), + (1, 128, 28, 128, 3, 1, 1, 1, False, False), + (1, 128, 28, 256, 3, 2, 1, 1, False, False), + (1, 128, 28, 256, 1, 2, 0, 1, False, False), + (1, 256, 14, 256, 3, 1, 1, 1, False, False), + (1, 256, 14, 512, 3, 2, 1, 1, False, False), + (1, 256, 14, 512, 1, 2, 0, 1, False, False), + (1, 512, 7, 512, 3, 1, 1, 1, False, False), + # bias, relu + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 1, True, False), + (1, 64, 56, 64, 3, 1, 1, 1, True, True), + # dilation = 2 + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + # batch size + (4, 64, 56, 64, 3, 1, 1, 1, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + # weird workloads + (4, 4, 4, 8, 4, 4, 4, 1, False, False), + # inception v3 workloads where channels in / out are multiple of oc_block_factor + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 147, 64, 3, 1, 1, 1, False, False), + (1, 64, 73, 80, 1, 1, 0, 1, False, False), + (1, 80, 73, 192, 3, 1, 0, 1, False, False), + (1, 192, 35, 64, 1, 1, 0, 1, False, False), + (1, 192, 35, 48, 1, 1, 0, 1, False, False), + (1, 48, 35, 64, 5, 1, 2, 1, False, False), + (1, 64, 35, 96, 3, 1, 1, 1, False, False), + (1, 96, 35, 96, 3, 1, 1, 1, False, False), + (1, 192, 35, 32, 1, 1, 0, 1, False, False), + (1, 256, 35, 64, 1, 1, 0, 1, False, False), + (1, 256, 35, 48, 1, 1, 0, 1, False, False), + (1, 288, 35, 64, 1, 1, 0, 1, False, False), + (1, 288, 35, 48, 1, 1, 0, 1, False, False), + (1, 288, 35, 384, 3, 2, 0, 1, False, False), + (1, 96, 35, 96, 3, 2, 0, 1, False, False), + (1, 768, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 128, 1, 1, 0, 1, False, False), + (1, 128, 17, 128, 1, 1, 0, 1, False, False), + (1, 128, 17, 192, 7, 1, 3, 1, False, False), + (1, 128, 17, 128, 7, 1, 3, 1, False, False), + (1, 128, 17, 192, 1, 1, 0, 1, False, False), + (1, 768, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 160, 1, 1, 0, 1, False, False), + (1, 160, 17, 192, 7, 1, 3, 1, False, False), + (1, 160, 17, 160, 7, 1, 3, 1, False, False), + (1, 160, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 1, 1, 0, 1, False, False), + (1, 192, 17, 192, 7, 1, 3, 1, False, 
False), + (1, 192, 17, 320, 3, 2, 0, 1, False, False), + (1, 192, 17, 192, 3, 2, 0, 1, False, False), + (1, 1280, 8, 320, 1, 1, 0, 1, False, False), + (1, 1280, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 1, 1, 0, 1, False, False), + (1, 384, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 448, 1, 1, 0, 1, False, False), + (1, 448, 8, 384, 3, 1, 1, 1, False, False), + (1, 1280, 8, 192, 1, 1, 0, 1, False, False), + (1, 2048, 8, 320, 1, 1, 0, 1, False, False), + (1, 2048, 8, 384, 1, 1, 0, 1, False, False), + (1, 2048, 8, 448, 1, 1, 0, 1, False, False), + (1, 2048, 8, 192, 1, 1, 0, 1, False, False), + (1, 1024, 19, 88, 3, 1, 1, 1, False, False), + # batch > 1 + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (8, 32, 149, 32, 3, 1, 0, 1, False, False), + (32, 32, 149, 32, 3, 1, 0, 1, False, False), + # Asymmetric padding + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 64, 8, 128, 3, 1, (3, 3, 2, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, (1, 2, 2, 1), 1, False, False), + (1, 64, 17, 192, 1, 1, (1, 2), 1, False, False), + (1, 64, 8, 64, 3, 1, (3, 1), 1, False, False), + (1, 128, 8, 384, 3, 1, (0, 2), 1, False, False), + (1, 64, 8, 64, 1, 1, "VALID", 1, False, False), + (1, 392, 8, 64, 3, 1, "VALID", 1, False, False), + (1, 512, 19, 64, 1, 1, "SAME", 1, False, False), + (1, 64, 16, 32, 2, 1, "SAME", 1, False, False), + (1, 64, 8, 64, 3, 1, (1, 2, 2, 1), 1, False, True), + (1, 64, 8, 64, 5, 2, (1, 3), 1, True, False), + (1, 64, 56, 64, 3, 1, "VALID", 1, True, True), + (1, 64, 56, 64, 24, 1, "SAME", 1, True, True), + ], +) +def test_conv2d_NCHWc_int8(in_dtype, params): + with Int8Fallback(): + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) ) - bias_shape = get_const_tuple(bias.shape) - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) - w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( - out_dtype - ) + in_height = in_width = in_size - # convert to NCHWc - _, _, out_height, out_width = c_np.shape - c_np = c_np.reshape( - (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) - ).transpose(0, 1, 3, 4, 2) + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(out_dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + out_dtype = "int32" if in_dtype == "int8" else "uint32" + input_min, input_max = get_dtype_range(in_dtype) - return a_np, w_np, b_np, c_np + def check_target(target, compute, schedule, oc_block_factor, build_only): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + pytest.skip(reason="Skip because %s is not 
enabled" % target) + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + pytest.skip(reason="Skip because %s is not enabled" % target) - a_np, w_np, b_np, c_np = get_ref_data() + bias = te.placeholder( + (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype + ) + bias_shape = get_const_tuple(bias.shape) + + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_NCHWc_int8") + def get_ref_data(): + a_np = np.random.randint(low=input_min, high=input_max, size=a_shape).astype( + out_dtype + ) + w_np = np.random.randint(low=input_min, high=input_max, size=w_shape).astype( + out_dtype + ) + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype( + out_dtype + ) + + # convert to NCHWc + _, _, out_height, out_width = c_np.shape + c_np = c_np.reshape( + (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width) + ).transpose(0, 1, 3, 4, 2) + + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(out_dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) + + return a_np, w_np, b_np, c_np + + with tvm.target.Target(target): + C = compute( + A, + W, + (stride, stride), + padding, + (dilation, dilation), + "NCHW", + "NCHW", + out_dtype, + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + + compile_args = [A, W, bias, C] if add_bias else [A, W, C] - with tvm.target.Target(target): - C = compute( - A, - W, - (stride, stride), - padding, - (dilation, dilation), - "NCHW", - "NCHW", - out_dtype, + func = tvm.build( + s, + compile_args, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = schedule([C]) - a = tvm.nd.array(a_np.astype(dtype), dev) - w = tvm.nd.array(w_np.astype(dtype), dev) - b = tvm.nd.array(b_np.astype(out_dtype), dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - - if add_bias: - compile_args = [A, W, bias, C] - run_args = [a, w, b, c] - else: - compile_args = [A, W, C] - run_args = [a, w, c] - - func = tvm.build( - s, - compile_args, - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + if build_only: + return - if build_only: - return + a_np, w_np, b_np, c_np = get_ref_data() - print("Running on target: %s" % target) + a = tvm.nd.array(a_np.astype(dtype), dev) + w = tvm.nd.array(w_np.astype(dtype), dev) + b = tvm.nd.array(b_np.astype(out_dtype), dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + run_args = [a, w, b, c] if add_bias else [a, w, c] - func(*run_args) + print("Running on target: %s" % target) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + func(*run_args) - targets = [ - ( - "cuda", - lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - topi.cuda.schedule_conv2d_NCHWc_int8, - 4, - False, - ), - # Disable on CI since it does not support spirv int8 dot product - # ( - # "vulkan -from_device=0", - # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), - # topi.cuda.schedule_conv2d_NCHWc_int8, - # 4, - # False, - # ), - ] - - build_only_aarch64 = platform.machine() != "aarch64" - - targets.append( 
- ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ) - ) + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) - if in_dtype == "int8": - targets += [ - ( - "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", - topi.arm_cpu.conv2d_NCHWc_int8, - topi.arm_cpu.schedule_conv2d_NCHWc_int8, - 8, - build_only_aarch64, - ), + targets = [ ( - "rocm -mattr=+dotprod", + "cuda", lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), topi.cuda.schedule_conv2d_NCHWc_int8, 4, False, ), + # Disable on CI since it does not support spirv int8 dot product + # ( + # "vulkan -from_device=0", + # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), + # topi.cuda.schedule_conv2d_NCHWc_int8, + # 4, + # False, + # ), ] - for target, compute, schedule, oc_block_factor, build_only in targets: - check_target(target, compute, schedule, oc_block_factor, build_only) - - -def verify_conv2d_nchw_int8( - in_dtype, - batch, - in_channel, - in_size, - num_filter, - kernel, - stride, - padding, - dilation=1, - add_bias=False, - add_relu=False, -): - pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) - padding_sum = pad_top + pad_left + pad_bottom + pad_right - print( - "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) - ) - - in_height = in_width = in_size - - A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) - W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) - bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) - - a_shape = get_const_tuple(A.shape) - w_shape = get_const_tuple(W.shape) - bias_shape = get_const_tuple(bias.shape) - dtype = A.dtype - - @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") - def get_ref_data(): - a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) - w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) - b_np = np.random.uniform(size=bias_shape).astype(dtype) - dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - - if add_bias: - b_np = np.random.uniform(size=bias_shape).astype(dtype) - c_np += b_np - if add_relu: - c_np = np.maximum(c_np, 0) - - return a_np, w_np, b_np, c_np - - a_np, w_np, b_np, c_np = get_ref_data() - - def verify_workload_padding(): - _, _, out_height, out_width = get_const_tuple(c_np.shape) - wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) - - # for testing functionality, - # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, - # regardless of the performance. - int32_lanes, num_int8_elements = num_filter, in_channel - - # check if tile_ow candidates are the factors of the right output weight. 
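
The get_ref_data helpers in this file cache their random reference tensors
through TVM's pickle-based memoization, so repeated test runs reuse identical
inputs instead of regenerating them. A minimal sketch of the pattern, using
only tvm.contrib.pickle_memoize; the key string and shapes are illustrative:

    import numpy as np
    from tvm.contrib.pickle_memoize import memoize

    # The key names the test; cached tensors are pickled to disk and
    # reloaded on the next run instead of being regenerated.
    @memoize("topi.tests.test_topi_conv2d_int8.example_ref_data")
    def example_ref_data():
        return np.random.randint(low=-128, high=127, size=(1, 56, 56, 64)).astype("int8")
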
- cfg = autotvm.get_config() - fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) - ow_tile = np.prod(cfg["tile_ow"].size) - - tvm.testing.assert_allclose(ow_tile, out_width) - - def check_target(target): - dev = tvm.device(target, 0) - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - return - if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): - print("Skip because int8 intrinsics are not available") - return - - print("Running on target: %s" % target) - with tvm.target.Target(target): - C = topi.cuda.conv2d_nchw_int8( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) - if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.cuda.schedule_conv2d_nchw_int8([C]) - - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - if add_bias: - tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func(a, w, b, c) - else: - func = tvm.build( - s, - [A, W, C], - target, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + build_only_aarch64 = platform.machine() != "aarch64" + + targets.append( + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, ) - func(a, w, c) - tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + ) - verify_workload_padding() + if in_dtype == "int8": + targets += [ + ( + "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", + topi.arm_cpu.conv2d_NCHWc_int8, + topi.arm_cpu.schedule_conv2d_NCHWc_int8, + 8, + build_only_aarch64, + ), + ( + "rocm -mattr=+dotprod", + lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8( + a, w, s, p, d, l, o + ), + topi.cuda.schedule_conv2d_NCHWc_int8, + 4, + False, + ), + ] + + for target, compute, schedule, oc_block_factor, build_only in targets: + check_target(target, compute, schedule, oc_block_factor, build_only) + + +# Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just +# performing basic testing - one test for all different scenarios - batch, dilation etc.. 
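
The refactor in this patch folds the old per-workload verify_* calls into
pytest parametrization (as in the decorated test that follows) and turns
cross-compiled AArch64 targets into build-only entries on non-AArch64 hosts
(the targets loop just above). A condensed sketch of both idioms; the
workload tuple is illustrative:

    import platform
    import pytest

    @pytest.mark.parametrize("params", [(1, 64, 56, 64, 3, 1, 1, 1, False, False)])
    def test_example(params):
        # Cross targets are still compiled everywhere; execution is
        # attempted only when the host really is AArch64.
        build_only = platform.machine() != "aarch64"
        if build_only:
            return  # stop after tvm.build(), skip the numerical check
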
+@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) +@pytest.mark.parametrize( + "params", + [ + (1, 64, 56, 64, 3, 1, 1, 1, False, False), + (1, 64, 56, 64, 3, 1, 1, 1, False, True), + (1, 64, 56, 64, 3, 1, 1, 2, False, False), + (9, 64, 56, 64, 3, 1, 1, 1, False, False), + (4, 4, 4, 4, 4, 4, 4, 1, False, False), + (1, 32, 149, 32, 3, 1, 0, 1, False, False), + (7, 32, 149, 32, 3, 1, 0, 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False), + (1, 32, 35, 64, 7, 2, (0, 0, 2, 2), 1, False, False), + ], +) +def test_conv2d_nchw_int8(in_dtype, params): + with Int8Fallback(): + ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding, + dilation, + add_bias, + add_relu, + ) = params + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) + padding_sum = pad_top + pad_left + pad_bottom + pad_right + print( + "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) + ) - for target in ["cuda"]: - check_target(target) + in_height = in_width = in_size + A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) + W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) + bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) -@pytest.mark.parametrize("in_dtype", ["int8", "uint8"]) -def test_conv2d_nchw(in_dtype): - with Int8Fallback(): - # ResNet18 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 128, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 256, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 3, 2, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 1, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 7, 512, 3, 1, 1) + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + bias_shape = get_const_tuple(bias.shape) + dtype = A.dtype - # bias, relu - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True) + @memoize("topi.tests.test_topi_conv2d_int8.test_conv2d_nchw_int8") + def get_ref_data(): + a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) + w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) + b_np = np.random.uniform(size=bias_shape).astype(dtype) + dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) - # dilation = 2 - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) + if add_bias: + b_np = np.random.uniform(size=bias_shape).astype(dtype) + c_np += b_np + if add_relu: + c_np = np.maximum(c_np, 0) - # batch size - verify_conv2d_NCHWc_int8(in_dtype, 4, 64, 56, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) + return a_np, w_np, b_np, c_np - # weird workloads - verify_conv2d_NCHWc_int8(in_dtype, 4, 4, 4, 8, 4, 
4, 4) + a_np, w_np, b_np, c_np = get_ref_data() - # inception v3 workloads where channels in / out are multiple of oc_block_factor - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 147, 64, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 73, 80, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 80, 73, 192, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 48, 35, 64, 5, 1, 2) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 32, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 64, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 48, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 384, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 7, 1, 3) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 320, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 3, 2, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 448, 8, 384, 3, 1, 1) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 320, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 384, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 448, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 192, 1, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 1, 1024, 19, 88, 3, 1, 1) + def verify_workload_padding(): + _, _, _, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. 
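+            # A worked instance of the check below: for out_width == 56 the
+            # fallback config might pick a "tile_ow" split of [8, 7], and
+            # np.prod([8, 7]) == 56 confirms the split covers the full row.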
+ cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + def check_target(target): + dev = tvm.device(target, 0) + if not tvm.testing.device_enabled(target): + pytest.skip("Skip because %s is not enabled" % target) + if target == "cuda" and not tvm.contrib.nvcc.have_int8(dev.compute_version): + pytest.skip("Skip because int8 intrinsics are not available") + + print("Running on target: %s" % target) + with tvm.target.Target(target): + C = topi.cuda.conv2d_nchw_int8( + A, W, (stride, stride), padding, (dilation, dilation), dtype + ) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = topi.cuda.schedule_conv2d_nchw_int8([C]) + + build_args = [A, W, bias, C] if add_bias else [A, W, C] - # batch > 1 - verify_conv2d_NCHWc_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 8, 32, 149, 32, 3, 1, 0) - verify_conv2d_NCHWc_int8(in_dtype, 32, 32, 149, 32, 3, 1, 0) + func = tvm.build( + s, + build_args, + target, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % ( + batch, + in_channel, + in_size, + num_filter, + kernel, + stride, + padding_sum, + dilation, + ), + ) - # Asymmetric padding - verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 128, 3, 1, (3, 3, 2, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, (1, 2, 2, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 17, 192, 1, 1, (1, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (3, 1)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 8, 384, 3, 1, (0, 2)) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 392, 8, 64, 3, 1, "VALID") - verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 19, 64, 1, 1, "SAME") - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 16, 32, 2, 1, "SAME") - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (1, 2, 2, 1), add_relu=True) - verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 3, 1, "VALID", add_bias=True, add_relu=True - ) - verify_conv2d_NCHWc_int8( - in_dtype, 1, 64, 56, 64, 24, 1, "SAME", add_bias=True, add_relu=True - ) + a = tvm.nd.array(a_np, dev) + w = tvm.nd.array(w_np, dev) + b = tvm.nd.array(b_np, dev) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) - # Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So, just - # performing basic testing - one test for all different scenarios - batch, dilation etc.. 
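
The reference paths above pre-dilate the weights with dilate_python before
calling the dense numpy convolution, so dilation never needs a special-cased
reference implementation. A quick shape check of that helper; the array
contents are illustrative:

    import numpy as np
    import tvm.topi.testing

    w_np = np.ones((8, 4, 3, 3), dtype="int8")
    dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, 2, 2))
    print(dw_np.shape)  # (8, 4, 5, 5): each 3 becomes (3 - 1) * 2 + 1
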
- verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2) - verify_conv2d_nchw_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw_int8(in_dtype, 4, 4, 4, 4, 4, 4, 4) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) - verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) + run_args = [a, w, b, c] if add_bias else [a, w, c] + func(*run_args) -def test_conv2d_nhwc(): - with Int8Fallback(): - # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID' padding) - verify_conv2d_NHWC_gemm_int8(1, 3, 299, 32, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 32, 149, 32, 3, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(4, 32, 147, 64, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 64, 73, 80, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 80, 73, 192, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 64, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 35, 32, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 48, 35, 64, 5, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 256, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 64, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 288, 35, 48, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 128, 17, 192, 7, 1, "SAME", dilation=2) - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 160, 7, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 160, 17, 192, 1, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 768, 5, 128, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 320, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 3, 2, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 192, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 448, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 3, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 448, 8, 384, 3, 1, "VALID") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 320, 1, 1, "SAME") - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 448, 1, 1, "SAME", add_bias=True, add_relu=True) - verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 192, 1, 1, "SAME", add_bias=True) - - # Let's also verify that it compiles fine on AArch64 targets - compile_conv2d_NHWC_gemm_int8_arm(1, 3, 299, 32, 3, 2, "SAME") + tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) + + verify_workload_padding() + + check_target("cuda") if __name__ == "__main__": From a1df230efe1864ad4711013d6fca1beebd831ffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Petkovi=C4=87?= <104573172+petuca@users.noreply.github.com> Date: Sun, 1 Jan 2023 11:51:39 +0100 Subject: [PATCH 097/286] [fix] MXNet dot for all tensor dimensions (#11760) * [fix] MXNet dot for all tensor dimensions * Fixing the MxNet structure --- python/tvm/relay/frontend/mxnet.py | 45 ++++++++++++++++++--- tests/python/frontend/mxnet/test_forward.py | 11 +++++ 
2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 1b1d60119967..4e6540fb08de 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -795,19 +795,52 @@ def _mx_multibox_detection(inputs, attrs): def _mx_dot(inputs, attrs): assert len(inputs) == 2 - a, b = inputs + + a = inputs[0] + b = inputs[1] + rank_a = len(_infer_type(a).checked_type.shape) rank_b = len(_infer_type(b).checked_type.shape) - if rank_a != 2 or rank_b != 2: - raise tvm.error.OpAttributeUnimplemented("Only 2-D arrays are supported.") + + if rank_a < 1 or rank_b < 1: + raise tvm.error.OpAttributeInvalid("Unsupported shape of input tensors.") + transpose_a = attrs.get_bool("transpose_a", False) transpose_b = attrs.get_bool("transpose_b", False) + if transpose_a is True: msg = 'Value {} in attribute "transpose_a" of operator dot ' "is not valid." raise tvm.error.OpAttributeInvalid(msg.format(transpose_a)) - if transpose_b is False: - b = _op.transpose(b, axes=[1, 0]) - return _op.nn.dense(a, b) + + # When performing dot product we need to properly handle shape of result -> out_shape + if rank_a == 1: + out_shape = list() + a = _op.expand_dims(a, axis=0) + else: + shape_a = list(_infer_type(a).checked_type.shape) + out_shape = shape_a[:-1] + a = _op.reshape(a, newshape=(-1, shape_a[-1])) + + if rank_b == 1: + if not out_shape: + out_shape = [ + 1, + ] + b = _op.expand_dims(b, axis=1) + else: + # Transpose matrix b if needed + if transpose_b: + trans_axes = list(range(rank_b)) + trans_axes = trans_axes[-1:] + trans_axes[:-1] + b = _op.transpose(b, axes=trans_axes) + + shape_b = list(_infer_type(b).checked_type.shape) + out_shape += shape_b[1:] + b = _op.reshape(b, newshape=(shape_b[0], -1)) + + out = _op.reshape(_op.nn.matmul(a, b), newshape=out_shape) + + return out def _mx_batch_dot(inputs, attrs): diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 44aa93061a62..0e34719ea27d 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -690,6 +690,17 @@ def verify(a_shape, b_shape, transpose_b=False): verify((1, 256), (256, 1)) verify((1, 256), (1, 256), transpose_b=True) + verify((5,), (5,)) + verify((3,), (3, 5)) + verify((3,), (5, 3), transpose_b=True) + verify((3,), (3, 5, 3, 5)) + verify((3,), (5, 5, 3, 3), transpose_b=True) + verify((10, 1), (1,)) + verify((1, 1), (4, 3, 2, 1), transpose_b=True) + verify((4, 3, 2, 1), (1,)) + verify((1, 2, 3, 4), (1, 4), transpose_b=True) + verify((4, 1, 1), (1, 2, 3)) + verify((1, 1, 4), (2, 3, 4), transpose_b=True) @tvm.testing.uses_gpu From a4ec3a28b2383e9f667ab599361c188512e2051f Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Sun, 1 Jan 2023 12:52:00 +0200 Subject: [PATCH 098/286] [Build] Expose missing USE_VERILATOR in cmake (#13676) Expose missing USE_VERILATOR in cmake --- CMakeLists.txt | 1 + cmake/modules/LibInfo.cmake | 1 + src/support/libinfo.cc | 1 + 3 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b774181f5f71..2d1785f3ffa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,7 @@ tvm_option(SUMMARIZE "Print CMake option summary after configuring" OFF) tvm_option(USE_CLML "Build with CLML Codegen support" OFF) tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF) tvm_option(USE_UMA "Build with UMA support" OFF) +tvm_option(USE_VERILATOR "Build with Verilator support" OFF) # 
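
The generalized _mx_dot conversion above accepts any operand rank of at
least 1, wrapping nn.matmul in expand_dims/reshape as needed. A small import
check consistent with the new test cases, assuming mxnet is installed;
shapes are taken from the added verify calls:

    import mxnet as mx
    from tvm import relay

    a = mx.sym.var("a")
    b = mx.sym.var("b")
    sym = mx.sym.dot(a, b)
    mod, _ = relay.frontend.from_mxnet(sym, {"a": (3,), "b": (3, 5)})
    print(mod["main"])  # 1-D x 2-D dot, previously rejected, now imports
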
include directories include_directories(${CMAKE_INCLUDE_PATH}) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 042fa3c6ddd7..779e10d01246 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -119,6 +119,7 @@ function(add_lib_info src_file) TVM_INFO_USE_CLML="${USE_CLML}" TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}" TVM_INFO_USE_UMA="${USE_UMA}" + TVM_INFO_USE_VERILATOR="${USE_VERILATOR}" TVM_INFO_USE_CCACHE="${USE_CCACHE}" TVM_INFO_BACKTRACE_ON_SEGFAULT="${BACKTRACE_ON_SEGFAULT}" ) diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index 7bb1e04920fa..8d1332bee406 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -328,6 +328,7 @@ TVM_DLL Map GetLibInfo() { {"USE_CLML", TVM_INFO_USE_CLML}, {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR}, {"USE_UMA", TVM_INFO_USE_UMA}, + {"USE_VERILATOR", TVM_INFO_USE_VERILATOR}, {"USE_CCACHE", TVM_INFO_USE_CCACHE}, {"BACKTRACE_ON_SEGFAULT", TVM_INFO_BACKTRACE_ON_SEGFAULT}, }; From 30a513a4f5ee41bcd330e202055fce67b13ed185 Mon Sep 17 00:00:00 2001 From: Siva Date: Sun, 1 Jan 2023 16:22:56 +0530 Subject: [PATCH 099/286] [BENCHMARK][ADRENO] Adreno Benchmarks with texture (#13675) * [BENCHMARK][ADRENO] Adreno Benchmarks with texture Benchmarks for various networks listed below with fp16 and fp32. resnet-18, resnet-34, resnet-50, vgg-16, vgg-19, densenet-121, inception_v3, mobilenetv1, squeezenet_v1.0, squeezenet_v1.1 * * lint error Co-authored-by: Siva Rama Krishna Reddy B --- .../adreno/adreno_gpu_bench_texture.py | 278 ++++++++++++++++++ apps/benchmark/adreno/bench.sh | 59 ++++ tests/scripts/ci.py | 7 + 3 files changed, 344 insertions(+) create mode 100755 apps/benchmark/adreno/adreno_gpu_bench_texture.py create mode 100755 apps/benchmark/adreno/bench.sh diff --git a/apps/benchmark/adreno/adreno_gpu_bench_texture.py b/apps/benchmark/adreno/adreno_gpu_bench_texture.py new file mode 100755 index 000000000000..2228cda31a39 --- /dev/null +++ b/apps/benchmark/adreno/adreno_gpu_bench_texture.py @@ -0,0 +1,278 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Benchmark script for various models on Adreno GPU. +""" +import argparse + +import numpy as np + +import os +import sys +import tvm +from tvm import te +from tvm.relay import testing +from tvm.contrib.utils import tempdir +import tvm.contrib.graph_executor as runtime +from tvm import relay +from tvm import autotvm +from tvm.contrib import utils, ndk + + +def get_network(name, batch_size, dtype="float32"): + """Get the symbol definition and random weight of a network + + Parameters + ---------- + name: str + The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ... 
+ batch_size: int + batch size + dtype: str + Data type + + Returns + ------- + net: tvm.IRModule + The relay function of network definition + params: dict + The random parameters for benchmark + input_shape: tuple + The shape of input tensor + output_shape: tuple + The shape of output tensor + """ + input_shape = (batch_size, 3, 224, 224) + output_shape = (batch_size, 1000) + + if name == "mobilenet": + net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) + net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif "resnet" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.resnet.get_workload( + num_layers=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "vgg" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.vgg.get_workload( + num_layers=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "densenet" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.densenet.get_workload( + densenet_size=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "squeezenet" in name: + version = name.split("_v")[1] + net, params = testing.squeezenet.get_workload( + batch_size=batch_size, version=version, dtype=dtype + ) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + block = get_model("resnet18_v1", pretrained=True) + net, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = net["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + net = tvm.IRModule.from_expr(net) + else: + raise ValueError("Unsupported network: " + name) + + return net, params, input_shape, output_shape + + +def print_progress(msg): + """print progress message + + Parameters + ---------- + msg: str + The message to print + """ + sys.stdout.write(msg + "\r") + sys.stdout.flush() + + +def tune_tasks( + tasks, + measure_option, + n_trial=1024, + early_stopping=None, + log_filename="tuning.log", +): + from tvm.autotvm.tuner import XGBTuner + + tmp_log_file = log_filename + ".tmp" + + for i, tsk in enumerate(reversed(tasks)): + print("Task: ", tsk) + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + tuner_obj = XGBTuner(tsk, loss_type="rank") + + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune( + n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file), + ], + ) + + autotvm.record.pick_best(tmp_log_file, log_filename) + + +def evaluate_network(network, target, target_host, dtype, repeat): + print_progress(network) + net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype) + + # Auto Tuning + tune_log = "adreno-" + network + "-" + dtype + ".log" + tuning_options = { + "log_filename": tune_log, + "early_stopping": None, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15), + runner=autotvm.RPCRunner( + args.rpc_key, + host=args.host, + port=args.port, + number=3, + timeout=600, + ), + ), + } + if args.tune: + tasks = autotvm.task.extract_from_program( + net, target=target, target_host=target_host, params=params + ) + tune_tasks(tasks, **tuning_options) + + print_progress("%-20s building..." 
% network) + + # Build the tuning log + if os.path.exists(tune_log): + with autotvm.apply_history_best(tune_log): + with tvm.transform.PassContext(opt_level=3): + lib = relay.build( + net, target=tvm.target.Target(target, host=target_host), params=params + ) + else: + with tvm.transform.PassContext(opt_level=3): + lib = relay.build( + net, target=tvm.target.Target(target, host=target_host), params=params + ) + + tmp = tempdir() + + filename = "%s.so" % network + lib.export_library(tmp.relpath(filename), ndk.create_shared) + + # upload library and params + print_progress("%-20s uploading..." % network) + + # connect to remote device + tracker = tvm.rpc.connect_tracker(args.host, args.port) + remote = tracker.request(args.rpc_key) + + dev = remote.device(str(target), 0) + remote.upload(tmp.relpath(filename)) + + rlib = remote.load_module(filename) + module = runtime.GraphModule(rlib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # evaluate + print_progress("%-20s evaluating..." % network) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat) + prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond + print( + "%-20s %-19s (%s)" + % (network + "-" + dtype, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) + ) + return (np.mean(prof_res), np.std(prof_res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--network", + type=str, + choices=[ + "resnet-18", + "resnet-34", + "resnet-50", + "vgg-16", + "vgg-19", + "densenet-121", + "inception_v3", + "mobilenet", + "squeezenet_v1.0", + "squeezenet_v1.1", + ], + help="The name of neural network", + ) + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=9190) + parser.add_argument("--rpc-key", type=str, default="android") + parser.add_argument("--repeat", type=int, default=30) + parser.add_argument("--tune", type=bool, default=False) + args = parser.parse_args() + + if args.network is None: + networks = [ + "resnet-18", + "resnet-34", + "resnet-50", + "vgg-16", + "vgg-19", + "densenet-121", + "inception_v3", + "mobilenet", + "squeezenet_v1.0", + "squeezenet_v1.1", + ] + else: + networks = [args.network] + + target = "opencl -device=adreno" + target_host = "llvm -mtriple=arm64-linux-android" + + print("--------------------------------------------------") + print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)")) + print("--------------------------------------------------") + + results = {} + + for network in networks: + ftime = evaluate_network(network, target, target_host, "float32", args.repeat) + results[network + "-float32"] = ftime + ftime = evaluate_network(network, target, target_host, "float16", args.repeat) + results[network + "-float16"] = ftime + + print("----------------------------------------------------------------------") + print("%-30s %-30s" % ("Network Name", "Mean Inference Time (std dev)")) + print("----------------------------------------------------------------------") + for key, val in results.items(): + print("%-30s %-30s (%s)" % (key, "%.2f ms" % val[0], "%.2f ms" % val[1])) diff --git a/apps/benchmark/adreno/bench.sh b/apps/benchmark/adreno/bench.sh new file mode 100755 index 000000000000..7d46685b8654 --- /dev/null +++ b/apps/benchmark/adreno/bench.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one 
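
The evaluate_network flow above exports the compiled .so, uploads it over
RPC, and times it on-device. The tracker handshake it relies on reduces to
the following; host, port, and key match the script's argparse defaults:

    import tvm.rpc

    tracker = tvm.rpc.connect_tracker("127.0.0.1", 9190)
    remote = tracker.request("android")
    dev = remote.cl(0)  # OpenCL device on the attached Adreno target
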
+# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euxo pipefail + +source tests/scripts/setup-pytest-env.sh +export PYTHONPATH=${PYTHONPATH}:${TVM_PATH}/apps/extension/python +export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}" + +export TVM_TRACKER_HOST=127.0.0.1 +export TVM_TRACKER_PORT=$(((RANDOM % 100) + 9100)) +export RPC_DEVICE_KEY="android" +export TVM_NDK_CC="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" + +env PYTHONPATH=python python3 -m tvm.exec.rpc_tracker --host "${TVM_TRACKER_HOST}" --port "${TVM_TRACKER_PORT}" & +TRACKER_PID=$! +sleep 5 # Wait for tracker to bind + +export ANDROID_SERIAL=$2 + +adb shell "mkdir -p /data/local/tmp/tvm_ci" +adb push build-adreno-target/tvm_rpc /data/local/tmp/tvm_ci/tvm_rpc_ci +adb push build-adreno-target/libtvm_runtime.so /data/local/tmp/tvm_ci + +adb reverse tcp:${TVM_TRACKER_PORT} tcp:${TVM_TRACKER_PORT} +adb forward tcp:5000 tcp:5000 +adb forward tcp:5001 tcp:5001 +adb forward tcp:5002 tcp:5002 +env adb shell "cd /data/local/tmp/tvm_ci; killall -9 tvm_rpc_ci; sleep 2; LD_LIBRARY_PATH=/data/local/tmp/tvm_ci/ ./tvm_rpc_ci server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" & +DEVICE_PID=$! +sleep 5 # Wait for the device connections +trap "{ kill ${TRACKER_PID}; kill ${DEVICE_PID}; }" 0 + +# cleanup pycache +find . 
-type f -path "*.pyc" | xargs rm -f +# Test TVM +make cython3 + +if [ "texture" == $1 ] ; then + python3 apps/benchmark/adreno/adreno_gpu_bench_texture.py --host ${TVM_TRACKER_HOST} --port ${TVM_TRACKER_PORT} --rpc-key ${RPC_DEVICE_KEY} +fi + +kill ${TRACKER_PID} +kill ${DEVICE_PID} diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 16389d29354c..756b269d0e50 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -727,6 +727,13 @@ def add_subparser( "./tests/scripts/task_python_adreno.sh " + os.environ.get("ANDROID_SERIAL", ""), ], ), + "benchmarks": ( + "run Adreno Texture Benchmarks", + [ + "./apps/benchmark/adreno/bench.sh texture " + + os.environ.get("ANDROID_SERIAL", ""), + ], + ), }, ), ] From f121fd7c197e771ab2535d1ba8545b47f21ed3a1 Mon Sep 17 00:00:00 2001 From: ninesheep Date: Mon, 2 Jan 2023 17:39:59 +0800 Subject: [PATCH 100/286] [Bug][CodeGen,Cuda]fix cast fp16 to int8/uint8 in cuda (#13641) * [Fix Bug]fix the bug of tensorflow frontend when parsing Range layer * [Fix Bug]fix the bug of schedule batch_matmul_int8 on cuda * fix cast fp16 to int8/uint8 on cuda Co-authored-by: wangjiuyang --- src/target/source/codegen_cuda.cc | 17 +++++++++++++++++ src/target/source/codegen_cuda.h | 1 + 2 files changed, 18 insertions(+) diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 436e85247ffe..c891ec5a28cf 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -579,6 +579,23 @@ void CodeGenCUDA::PrintStorageScope(const std::string& scope, std::ostream& os) } } +std::string CodeGenCUDA::CastFromTo(std::string value, DataType from, DataType target) { + if (from == target) return value; + std::ostringstream os; + os << "(("; + this->PrintType(target, os); + os << ")"; + if (from.is_float16() && (target.is_int() || target.is_uint()) && target.bits() == 8) { + os << "("; + if (target.is_uint()) { + os << "u"; + } + os << "int)"; + } + os << value << ")"; + return os.str(); +} + void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) { DataType from_ty = op->value.dtype(); DataType target_ty = op->dtype; diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h index 0fef15c7a7f3..bb507c179993 100644 --- a/src/target/source/codegen_cuda.h +++ b/src/target/source/codegen_cuda.h @@ -58,6 +58,7 @@ class CodeGenCUDA final : public CodeGenC { void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final; void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) final; + std::string CastFromTo(std::string value, DataType from, DataType target) final; // overload visitor void VisitExpr_(const RampNode* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const ShuffleNode* op, std::ostream& os) final; // NOLINT(*) From aa0699ca21b18d345f3633575efac414d102698b Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Tue, 3 Jan 2023 08:57:56 +0200 Subject: [PATCH 101/286] [TOPI] Expose mem_scope from generic conv2d variants to be more reusable (#13680) Expose mem_scope from generic conv2d variants to be more reusable --- python/tvm/topi/generic/conv2d.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 76cd9a7d69d1..a4a37247c82e 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -132,6 +132,7 @@ def 
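
The codegen fix above routes half -> int8/uint8 casts through an
intermediate (int)/(uint) cast so the generated CUDA C compiles. A sketch
that reproduces the cast in generated source; it assumes a CUDA-enabled
build with a visible GPU, and the emitted text should contain roughly
((signed char)(int)...):

    import tvm
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A", dtype="float16")
    B = te.compute((n,), lambda i: A[i].astype("int8"), name="B")
    s = te.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))
    s[B].bind(tx, te.thread_axis("threadIdx.x"))
    f = tvm.build(s, [A, B], "cuda")
    print(f.imported_modules[0].get_source())
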
schedule_conv_NCHWc_cpu_common_int8( int8_elems=4, intrin=None, inline_fused=True, + mem_scope="global", ): """ Defines the schedule for INT8 for Intel and ARM machines @@ -186,7 +187,7 @@ def schedule_conv_NCHWc_cpu_common_int8( # schedule 5-D NCHW[x]c conv C, O = conv_out, last - CC = s.cache_write(C, "global") + CC = s.cache_write(C, mem_scope) batch, oc_chunk, oh, ow, oc_block = s[C].op.axis ow_chunk, ow_block = s[C].split(ow, factor=reg_n) @@ -279,6 +280,7 @@ def schedule_conv_NCHWc_cpu_1x1_int8( int8_elems=4, intrin=None, inline_fused=False, + mem_scope="global", ): """ Defines the 1x1 conv schedule for INT8 for Intel and ARM machines @@ -323,7 +325,7 @@ def schedule_conv_NCHWc_cpu_1x1_int8( s[kernel_vec].parallel(parallel_axis) C, O = conv_out, last - CC = s.cache_write(C, "global") + CC = s.cache_write(C, mem_scope) batch, oc_chunk, oh, ow, oc_block = s[C].op.axis oh_outer, oh_inner = s[C].split(oh, factor=oh_factor) From ed6a407cf95000782d9c2d5b826850a067bd1a76 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 3 Jan 2023 12:33:51 +0530 Subject: [PATCH 102/286] [CLML] Version compatibility and various test cases (#13670) * [CLML][TEST] Codegen test cases for ops Codegen verification test cases for all the ops (convolution, concat, pad, pool ..etc.) that are supported by clml BYOC path. Fix depthwise conv2d issue with layout * * lint errors * * version compatilibility changes. * * review comments * * Make the adreno container compatible w/ and w/o CLML SDK availability Co-authored-by: Siva Rama Krishna Reddy B --- cmake/modules/contrib/CLML.cmake | 16 +- python/tvm/relay/op/contrib/clml.py | 58 +-- src/relay/backend/contrib/clml/codegen.cc | 2 +- src/runtime/contrib/clml/clml_runtime.cc | 38 +- .../contrib/test_clml/infrastructure.py | 58 ++- .../python/contrib/test_clml/test_network.py | 15 +- tests/python/contrib/test_clml/test_ops.py | 377 ++++++++++++++++-- tests/scripts/task_build_adreno_bins.sh | 6 +- tests/scripts/task_config_build_adreno.sh | 4 +- 9 files changed, 482 insertions(+), 92 deletions(-) diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake index e86a7e1ae032..811b8f8d5863 100644 --- a/cmake/modules/contrib/CLML.cmake +++ b/cmake/modules/contrib/CLML.cmake @@ -22,7 +22,21 @@ if(USE_CLML) if(NOT USE_CLML_GRAPH_EXECUTOR) list(APPEND COMPILER_SRCS ${CLML_RUNTIME_MODULE}) endif() - message(STATUS "Build with CLML support...") + message(STATUS "Build with CLML support : " ${USE_CLML}) + if (NOT USE_CLML STREQUAL "ON") + set(CLML_VERSION_HEADER "${USE_CLML}/CL/cl_qcom_ml_ops.h") + if(EXISTS ${CLML_VERSION_HEADER}) + file(READ ${CLML_VERSION_HEADER} ver) + string(REGEX MATCH "CL_QCOM_ML_OPS_H_MAJOR_VERSION ([0-9]*)" _ ${ver}) + set(CLML_VERSION_MAJOR ${CMAKE_MATCH_1}) + else() + set(CLML_VERSION_MAJOR "2") + endif() + else() + set(CLML_VERSION_MAJOR "2") + endif() + add_definitions(-DTVM_CLML_VERSION=${CLML_VERSION_MAJOR}) + message(STATUS "CLML SDK Version :" ${CLML_VERSION_MAJOR}) endif() if(USE_CLML_GRAPH_EXECUTOR) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 6453b8a06c9f..02e4f62bed24 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -28,6 +28,12 @@ from ..strategy.generic import is_depthwise_conv2d +def clml_sdk_version(): + """Utility function to get clml version version""" + + return tvm.support.libinfo().get("TVM_CLML_VERSION", 2) + + def is_clml_runtime_enabled(): """Check if the CLML graph runtime is present. 
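
clml_sdk_version above simply surfaces TVM_CLML_VERSION from libinfo,
defaulting to 2, so tests can gate on the SDK found at build time. A sketch
of such a gate; the version threshold is an illustrative value, and the
int() cast covers libinfo returning strings:

    from tvm.relay.op.contrib import clml

    def requires_clml_v3():
        return int(clml.clml_sdk_version()) >= 3 and clml.is_clml_runtime_enabled()
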
@@ -92,38 +98,35 @@ def preprocess_module(mod): preprocessed_mod : The processed module. """ - def convert_layout_conv2d(conv2d_function): - def convert_conv(attrs, inputs, tinfos, desired_layouts): - new_attrs = dict(attrs) - data_info = tinfos[0] - weight_info = tinfos[1] - desired_data_layout, desired_kernel_layout = map(str, desired_layouts) - new_attrs["data_layout"] = desired_data_layout - new_attrs["kernel_layout"] = desired_kernel_layout - - if is_depthwise_conv2d( - data_info.shape, - attrs["data_layout"], - weight_info.shape, - attrs["kernel_layout"], - attrs["groups"], - ): - dkl = desired_kernel_layout - new_attrs["kernel_layout"] = dkl[1] + dkl[0] + dkl[2] + dkl[3] - return conv2d_function(*inputs, **new_attrs) - - return convert_conv - - with OpAttrContext( - "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d) - ): + def alter_conv(attrs, inputs, tinfos, out_type): + new_attrs = dict(attrs) + data_info = tinfos[0] + weight_info = tinfos[1] + (desired_data_layout, desired_kernel_layout) = ("NCHW", "OIHW") + new_attrs["data_layout"] = desired_data_layout + new_attrs["kernel_layout"] = desired_kernel_layout + + if is_depthwise_conv2d( + data_info.shape, + attrs["data_layout"], + weight_info.shape, + attrs["kernel_layout"], + attrs["groups"], + ): + dkl = desired_kernel_layout + new_attrs["kernel_layout"] = dkl[1] + dkl[0] + dkl[2] + dkl[3] + return relay.nn.conv2d(*inputs, **new_attrs) + + with OpAttrContext("nn.conv2d", "FTVMAlterOpLayout", alter_conv): seq = tvm.transform.Sequential( [ transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"]}), + transform.AlterOpLayout(), transform.FoldConstant(), ] ) - preprocessed_mod = seq(mod) + with tvm.transform.PassContext(opt_level=3): + preprocessed_mod = seq(mod) return preprocessed_mod @@ -275,6 +278,9 @@ def check_default_op(extract): ("clml.add", is_op("add")(wildcard(), wildcard()), check_binary_op), ("clml.subtract", is_op("subtract")(wildcard(), wildcard()), check_binary_op), ("clml.multiply", is_op("multiply")(wildcard(), wildcard()), check_binary_op), + ("clml.divide", is_op("divide")(wildcard(), wildcard()), check_binary_op), + ("clml.minimum", is_op("minimum")(wildcard(), wildcard()), check_binary_op), + ("clml.maximum", is_op("maximum")(wildcard(), wildcard()), check_binary_op), ("clml.softmax", is_op("nn.softmax")(wildcard()), check_softmax_op), ("clml.reshape", is_op("reshape")(wildcard()), check_default_op), ("clml.avg_pool2d", is_op("nn.avg_pool2d")(wildcard()), check_default_op), diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc index 167c48e1baf5..d8ca791ad8c4 100644 --- a/src/relay/backend/contrib/clml/codegen.cc +++ b/src/relay/backend/contrib/clml/codegen.cc @@ -328,7 +328,7 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer { const auto* dense = fn->body.as(); const CallNode* bias = nullptr; - if (backend::IsOp(dense, "add")) { + if (backend::IsOp(dense, "add") || backend::IsOp(dense, "nn.bias_add")) { bias = dense; dense = dense->args[0].as(); } diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index a667caaafcd8..6396fce4858b 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -153,13 +153,25 @@ class CLMLRuntime : public JSONRuntimeBase { ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; for (cl_uint i = 0; i < numVersions; ++i) { +#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2 if (majorVersions[i] == 
2) { - LOG(WARNING) << "CLML Version Selected:" << majorVersions[i] << " : " << majorVersions[i]; h_ClmlIntf = clGetMLInterfaceV2QCOM(0); - ICHECK(h_ClmlIntf != NULL) << "clGetMLInterfaceV2QCOM:" << result; + LOG(WARNING) << "CLML Target version:" << majorVersions[i]; break; } +#endif +#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3 + if (majorVersions[i] == 3) { + h_ClmlIntf = clGetMLInterfaceV3QCOM(0); + LOG(WARNING) << "CLML Target version:" << majorVersions[i]; + break; + } +#endif } + ICHECK(h_ClmlIntf != NULL) + << "clGetMLInterfaceVxQCOM:" << result + << " Perhaps there is mispatch between CLML SDK version to target supported version:" + << majorVersions[numVersions - 1]; char* tune_flag; if ((tune_flag = getenv("CLML_IS_TUNNING_RUN"))) this->is_tuning_run = std::stoi(tune_flag); @@ -400,7 +412,7 @@ class CLMLRuntime : public JSONRuntimeBase { this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name || - "minimum" == op_name || "maximum" == op_name) { + "minimum" == op_name || "maximum" == op_name || "divide" == op_name) { auto out = CreateBinaryLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); @@ -523,7 +535,7 @@ class CLMLRuntime : public JSONRuntimeBase { } cl_ml_tensor_qcom DeviceMakeCLMLTensor( - void* pClmlIntf, cl_context context, tensor_dims_t dims, + cl_context context, tensor_dims_t dims, cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_channel_type dtype = CL_FLOAT) { cl_ml_tensor_qcom tensor; @@ -531,8 +543,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = { dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }}; - CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast(pClmlIntf); - result = clmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor); + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor); ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; (void)result; return tensor; @@ -544,9 +555,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_int result = CL_OUT_OF_HOST_MEMORY; cl_mem buffer = NULL; - CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast(pClmlIntf); result = - clmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size); + h_ClmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size); ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; buffer = clCreateBuffer(workspace->context, CL_MEM_READ_WRITE, size, NULL, &result); @@ -612,8 +622,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); auto tensor_dsc = std::make_shared(); - tensor_dsc->tensor = - DeviceMakeCLMLTensor(h_ClmlIntf, workspace->context, dims, layout, cl_dtype); + tensor_dsc->tensor = DeviceMakeCLMLTensor(workspace->context, dims, layout, cl_dtype); return tensor_dsc; } @@ -901,7 +910,6 @@ class CLMLRuntime : public JSONRuntimeBase { auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); std::vector windows = node.GetAttr>("pool_size"); std::vector strides = node.GetAttr>("strides"); @@ -1103,7 +1111,6 @@ class 
CLMLRuntime : public JSONRuntimeBase { cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); int inputSize = input_.size(); - int axis = std::stoi(node.GetAttr>("axis")[0]); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize]; for (int i = 0; i < inputSize; i++) { @@ -1236,6 +1243,8 @@ class CLMLRuntime : public JSONRuntimeBase { binary_op = CL_TENSOR_OP_SUB_QCOM; else if (op_name == "multiply") binary_op = CL_TENSOR_OP_MUL_QCOM; + else if (op_name == "divide") + binary_op = CL_TENSOR_OP_DIV_QCOM; else if (op_name == "minimum") binary_op = CL_TENSOR_OP_MIN_QCOM; else if (op_name == "maximum") @@ -1260,7 +1269,12 @@ class CLMLRuntime : public JSONRuntimeBase { CachedLayer layer_; // CLML Context +#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2 CLMLInterfaceV2QCOM* h_ClmlIntf = NULL; +#endif +#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3 + CLMLInterfaceV3QCOM* h_ClmlIntf = NULL; +#endif cl::OpenCLWorkspace* workspace = NULL; cl::OpenCLThreadEntry* tentry = NULL; cl_ml_tuningcache_qcom tuning_cache = NULL; diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py index 89c22255d77d..be2bbc7f8a71 100644 --- a/tests/python/contrib/test_clml/infrastructure.py +++ b/tests/python/contrib/test_clml/infrastructure.py @@ -39,9 +39,9 @@ class Device: Configuration for CLML tests. Check tests/python/contrib/clml/ for the presence of an test_config.json file. - This file can be used to override the default configuration here which will attempt to run the Arm - Compute Library runtime tests locally if the runtime is available. Changing the configuration - will allow these runtime tests to be offloaded to a remote Arm device via a tracker for example. + This file can be used to override the default configuration here which will attempt to run the + Open CLML runtime tests locally if the runtime is available. Changing the configuration + will allow these runtime tests to be offloaded to a remote Snapdragon device via a tracker for example. 
Notes ----- @@ -101,6 +101,25 @@ def _get_remote(cls): return device +def get_cpu_op_count(mod): + """Traverse graph counting ops offloaded to TVM.""" + + class Counter(tvm.relay.ExprVisitor): + def __init__(self): + super().__init__() + self.count = 0 + + def visit_call(self, call): + if isinstance(call.op, tvm.ir.Op): + self.count += 1 + + super().visit_call(call) + + c = Counter() + c.visit(mod["main"]) + return c.count + + def skip_codegen_test(): """Skip test if it requires the CLML codegen and it's not present.""" if not tvm.get_global_func("relay.ext.clml", True): @@ -130,7 +149,6 @@ def build_and_run( try: libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log) - clml_modules = extract_clml_modules(libm) for mod in clml_modules: source = mod.get_source("json") @@ -155,9 +173,9 @@ def build_and_run( for _ in range(no_runs): gen_module.run() out.append([gen_module.get_output(i) for i in range(outputs)]) - time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1) - cost = time_f().mean - print("%g secs/iteration\n" % cost) + # time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1) + # cost = time_f().mean + # print("%g secs/iteration\n" % cost) return out @@ -181,16 +199,34 @@ def extract_clml_modules(module): def verify_codegen( - module, + mod, known_good_codegen, + device, + params, num_clml_modules=1, tvm_ops=0, - target="llvm -mtriple=aarch64-linux-gnu", ): """Check clml codegen against a known good output.""" - module = build_module(module, target, tvm_ops=tvm_ops, clml_partitions=num_clml_modules) - clml_modules = extract_clml_modules(module) + if isinstance(mod, tvm.relay.expr.Call): + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + mod = clml.partition_for_clml(mod, params) + tvm_op_count = get_cpu_op_count(mod) + assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected {}".format( + tvm_op_count, tvm_ops + ) + partition_count = 0 + for global_var in mod.get_global_vars(): + if "clml" in global_var.name_hint: + partition_count += 1 + + assert ( + num_clml_modules == partition_count + ), "Got {} Open CLML partitions, expected {}".format(partition_count, num_clml_modules) + relay.backend.te_compiler.get().clear() + module = relay.build(mod, target=device.target, target_host=device.target_host, params=params) + clml_modules = extract_clml_modules(module) assert len(clml_modules) == num_clml_modules, ( f"The number of CLML modules produced ({len(clml_modules)}) does not " f"match the expected value ({num_clml_modules})." 
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py index 8d740d6dce4d..177359d9b18a 100644 --- a/tests/python/contrib/test_clml/test_network.py +++ b/tests/python/contrib/test_clml/test_network.py @@ -91,13 +91,8 @@ def get_model(): mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5 ) - # test - print("OpenCL:", outputs[0].asnumpy().shape) - print("CLML:", outputs[1].asnumpy().shape) - opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() clml_sort = np.argsort(outputs[0].asnumpy()).flatten() - tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5) @@ -134,7 +129,6 @@ def get_model(): opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() clml_sort = np.argsort(outputs[0].asnumpy()).flatten() - tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5) @@ -176,11 +170,10 @@ def get_model(): mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5 ) - # test - print("OpenCL:", outputs[0].asnumpy().shape) - print("CLML:", outputs[1].asnumpy().shape) - opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() clml_sort = np.argsort(outputs[0].asnumpy()).flatten() - tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py index da09715fbe4c..c4ec2603249b 100644 --- a/tests/python/contrib/test_clml/test_ops.py +++ b/tests/python/contrib/test_clml/test_ops.py @@ -14,15 +14,23 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""CLML integration conv2d tests.""" +"""CLML integration operator tests.""" import tvm import numpy as np from tvm import relay +from tvm.relay.op.contrib import clml from tvm.relay import testing from tvm.ir import IRModule from tvm.contrib import utils -from test_clml.infrastructure import build_and_run, Device, skip_codegen_test +from test_clml.infrastructure import ( + build_and_run, + Device, + skip_codegen_test, + verify_codegen, + build_module, + get_cpu_op_count, +) import pytest @@ -54,11 +62,8 @@ def _get_conv_model( shape = (shape[0], shape[1], shape[2] + padding[0] * 2, shape[3] + padding[1] * 2) is_depthwise = shape[1] == channels == groups - weight_format = "OIHW" if is_depthwise else "OIHW" - if weight_format == "IOHW": - weight_shape = (shape[1] // groups, channels, kernel_h, kernel_w) - else: - weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w) + weight_format = "OIHW" + weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w) w = tvm.nd.array(np.random.uniform(-1, 1, weight_shape).astype(dtype)) weights = relay.const(w, dtype) @@ -77,7 +82,7 @@ def _get_conv_model( ) params = {"w": w} if has_bias: - bias_shape = weight_shape[2] if is_depthwise else weight_shape[0] + bias_shape = (weight_shape[0],) b = tvm.nd.array(np.random.uniform(-1, 1, bias_shape).astype(dtype)) biasc = relay.const(b, dtype) out = relay.nn.bias_add(out, biasc, axis=1) @@ -86,31 +91,121 @@ def _get_conv_model( if has_activation: out = relay.nn.relu(out) - print("Out:", out) - return out, params +def _get_conv_expected_codegen( + shape, + kernel_h, + kernel_w, + padding, + strides, + dilation, + groups, + dtype, + channels, + has_bias=False, + has_activation=False, +): + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + 
output_height = ((shape[2] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1 + output_width = ((shape[3] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1 + output_shape = (1, channels, int(output_height), int(output_width)) + out_dtype = dtype + is_depthwise = shape[1] == channels == groups + + weight_format = "IOHW" if is_depthwise else "OIHW" + if weight_format == "OIHW": + weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w) + else: + weight_shape = (shape[1] // groups, channels, kernel_h, kernel_w) + + if is_depthwise: + name = "nn.depthwise_conv2d" + else: + name = "nn.conv2d" + + node = { + "op": "kernel", + "name": name, + "inputs": [], + "attrs": { + "groups": [[str(groups)]], + "num_outputs": "1", + "data_layout": [["NCHW"]], + "kernel_layout": [[weight_format]], + "channels": [[str(channels)]], + "dilation": [[str(dilation[0]), str(dilation[1])]], + "out_layout": [[""]], + "out_dtype": [[out_dtype]], + "kernel_size": [[str(kernel_h), str(kernel_w)]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "padding": [[str(p) for p in padding]], + "strides": [[str(s) for s in strides]], + }, + } + + if has_activation: + node["attrs"]["activation_type"] = [["relu"]] + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}}, + { + "op": "const", + "name": "", + "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]}, + }, + ] + + if has_bias: + bias_dtype = dtype + inputs.append( + { + "op": "const", + "name": "", + "attrs": { + "shape": [[[1, weight_shape[1] if is_depthwise else weight_shape[0], 1, 1]]], + "dtype": [[bias_dtype]], + }, + } + ) + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs + + @pytest.mark.parametrize("dtype", ["float32"]) @tvm.testing.requires_openclml def test_conv2d(device, dtype): trials = [ # Normal convolution - [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)], - [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (True, False, True)], - [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)], - [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True)], - # Normal convolution - [2, 2, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)], - [2, 1, (2, 2), (1, 1), (1, 1), 7, (16, 12, 15), (False, False, True)], - [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)], - [3, 3, (1, 1), (1, 1), (1, 1), 16, (16, 12, 15), (False, False, False)], - [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)], - [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True)], - [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False)], - [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)], - [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False)], - [3, 3, (1, 1), (2, 2), (1, 1), 16, (14, 10, 10), (False, True, True)], + [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (True, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True), False], + [2, 2, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (16, 12, 15), (False, False, True), False], + [3, 3, (2, 
1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (16, 12, 15), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (14, 10, 10), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (14, 10, 10), (False, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (14, 10, 10), (False, True, True), True], ] for ( @@ -122,9 +217,13 @@ def test_conv2d(device, dtype): out_channels, shape, composite, + is_depthwise, ) in trials: shape = (1, *shape) - groups = 1 + if is_depthwise: + groups = shape[1] + else: + groups = 1 outputs = [] inputs = { "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)), @@ -151,11 +250,19 @@ def test_conv2d(device, dtype): tvm.testing.assert_allclose( clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, atol=1e-5 ) + args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) + exp_codegen = _get_conv_expected_codegen( + *args, has_bias=composite[1], has_activation=composite[2] + ) + verify_codegen(func, exp_codegen, device, params) @pytest.mark.parametrize("dtype", ["float16"]) @tvm.testing.requires_openclml -def _test_batchnorm(device, dtype): +def test_batchnorm(device, dtype): + if tvm.support.libinfo().get("TVM_CLML_VERSION", 2) < 3: + print("Skip due to unsupported CLML version") + return in_shape = (1, 8, 64, 64) channels = 8 @@ -211,11 +318,80 @@ def test_concat(device, dtype): tvm.testing.assert_allclose( clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 ) + exp_codegen = [ + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(in_shape_1)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(in_shape_2)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "axis": [["1"]], + "dtype": [[dtype]], + "num_inputs": "2", + "num_outputs": "1", + "shape": [[list(clml_out[0].shape)]], + }, + "inputs": [[0, 0, 0], [1, 0, 0]], + "name": "concatenate", + "op": "kernel", + }, + ] + verify_codegen(func, exp_codegen, device, params) + + +def _get_pool_expected_codegen(input_shape, pool_size, stride, padding, pool_type, dtype): + import math + + pool_height = math.floor(((input_shape[2] + padding[2] - pool_size[0]) / stride[0]) + 1) + pool_width = math.floor(((input_shape[3] + padding[3] - pool_size[1]) / stride[1]) + 1) + output_shape = [input_shape[0], input_shape[1], pool_height, pool_width] + attrs = { + "ceil_mode": [["0"]], + "dilation": [["1", "1"]], + "layout": [["NCHW"]], + "num_inputs": "1", + "num_outputs": "1", + "out_layout": [[""]], + "padding": [[str(p) for p in padding]], + "pool_size": [[str(p) for p in pool_size]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "strides": [[str(s) for s in stride]], + } + if sum(padding): + attrs["count_include_pad"] = [["0"]] + + exp_codegen = [ + { + 
"op": "input", + "name": "", + "attrs": {"shape": [[list(input_shape)]], "dtype": [[str(dtype)]]}, + }, + { + "op": "kernel", + "name": "nn.avg_pool2d" if pool_type == "avg" else "nn.max_pool2d", + "inputs": [[0, 0, 0]], + "attrs": attrs, + }, + ] + return exp_codegen @pytest.mark.parametrize("dtype", ["float16"]) @tvm.testing.requires_openclml -def test_avgpool(device, dtype): +def test_pool(device, dtype): trials = [ # input size pool_size stride paading [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"], @@ -251,7 +427,152 @@ def test_avgpool(device, dtype): opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + args = (input_shape, pool_size, stride, padding, pooling_type, dtype) + exp_codegen = _get_pool_expected_codegen(*args) + verify_codegen(func, exp_codegen, device, params) + +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def test_dense(device, dtype): + def _get_model(x_shape, k_shape, has_bias=False): + x = relay.var("x", shape=(x_shape), dtype=dtype) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtype) + out = relay.nn.dense(x, kernel, units=k_shape[0]) + params = {"kernel": tvm.nd.array(np.random.uniform(-1, 1, k_shape).astype(dtype))} + inputs = {"x": tvm.nd.array(np.random.uniform(-1, 1, x_shape).astype(dtype))} + exp_codegen = [ + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(x_shape)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(k_shape)]], + }, + "name": "", + "op": "const", + }, + ] + if has_bias: + bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype) + out = relay.nn.bias_add(out, bias) + bias_node = { + "attrs": { + "dtype": [[dtype]], + "shape": [[list((1, k_shape[0]))]], + }, + "name": "", + "op": "const", + } + exp_codegen.append(bias_node) + params["bias"] = tvm.nd.array(np.random.uniform(-1, 1, (k_shape[0],)).astype(dtype)) + + dense_node = { + "attrs": { + "num_inputs": "3" if has_bias else "2", + "num_outputs": "1", + "dtype": [[dtype]], + "out_dtype": [[""]], + "shape": [[[x_shape[0], k_shape[0]]]], + "units": [[str(k_shape[0])]], + }, + "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] if has_bias else [[0, 0, 0], [1, 0, 0]], + "name": "nn.dense", + "op": "kernel", + } + exp_codegen.append(dense_node) + return out, params, inputs, exp_codegen + + def _verify(out, params, inputs, exp_codegen): + mod = IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] tvm.testing.assert_allclose( clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 ) + verify_codegen(out, exp_codegen, device, params) + + _verify(*(_get_model((1, 16), (32, 16)))) + _verify(*(_get_model((1, 16), (32, 16), True))) + + +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def test_binary_ops(device, dtype): + def _get_model(a_shape, b_shape, op): + a = relay.var("a", shape=(a_shape), dtype=dtype) + b = relay.var("b", shape=(b_shape), dtype=dtype) + out = op(a, b) + inputs = { + "a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(-1, 1, b_shape).astype(dtype)), + } + params = {} + return out, params, inputs + + def _verify(out, params, inputs): + mod 
= IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + # Check to make sure these ops are offloaded to CLML instead of TVM. + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + mod = clml.partition_for_clml(mod, params) + tvm_op_count = get_cpu_op_count(mod) + assert tvm_op_count == 0, "Got {} TVM Native Compute partitions, expected 0".format( + tvm_op_count + ) + + _verify(*(_get_model((1, 16), (1, 16), relay.add))) + _verify(*(_get_model((1, 16), (1, 16), relay.subtract))) + _verify(*(_get_model((1, 16), (1, 16), relay.multiply))) + _verify(*(_get_model((1, 16), (1, 16), relay.divide))) + _verify(*(_get_model((1, 16), (1, 16), relay.minimum))) + _verify(*(_get_model((1, 16), (1, 16), relay.maximum))) + + +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def test_unary_ops(device, dtype): + def _get_model(a_shape, op): + a = relay.var("a", shape=(a_shape), dtype=dtype) + out = op(a) + inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))} + params = {} + return out, params, inputs + + def _verify(out, params, inputs): + mod = IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + # Check to make sure these ops are offloaded to CLML instead of TVM. + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + mod = clml.partition_for_clml(mod, params) + tvm_op_count = get_cpu_op_count(mod) + assert tvm_op_count == 0, "Got {} TVM Native Compute partitions, expected 0".format( + tvm_op_count + ) + + _verify(*(_get_model((1, 16), relay.nn.softmax))) + _verify(*(_get_model((1, 16), relay.nn.relu))) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index 6b43d7cbc421..187ca7f815df 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -29,8 +29,12 @@ cd ${output_directory} cp ../cmake/config.cmake . echo set\(USE_MICRO OFF\) >> config.cmake -echo set\(USE_CLML ON\) >> config.cmake +if [ -f "${ADRENO_OPENCL}/CL/cl_qcom_ml_ops.h" ] ; then +echo set\(USE_CLML "${ADRENO_OPENCL}"\) >> config.cmake echo set\(USE_CLML_GRAPH_EXECUTOR "${ADRENO_OPENCL}"\) >> config.cmake +else +echo set\(USE_OPENCL ON\) >> config.cmake +fi echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_CPP_RPC ON\) >> config.cmake echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_adreno.sh b/tests/scripts/task_config_build_adreno.sh index d45c5e8b7dcf..d378b5f842b5 100755 --- a/tests/scripts/task_config_build_adreno.sh +++ b/tests/scripts/task_config_build_adreno.sh @@ -24,7 +24,9 @@ cd "$BUILD_DIR" cp ../cmake/config.cmake . 
echo set\(USE_OPENCL ON\) >> config.cmake
-echo set\(USE_CLML ON\) >> config.cmake
+if [ -f "${ADRENO_OPENCL}/CL/cl_qcom_ml_ops.h" ] ; then
+echo set\(USE_CLML ${ADRENO_OPENCL}\) >> config.cmake
+fi
 echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake

From 444e3a1fc72d1d25ca5370e998f74d46f895f33e Mon Sep 17 00:00:00 2001
From: Siva
Date: Tue, 3 Jan 2023 12:35:07 +0530
Subject: [PATCH 103/286] [BENCHMARKS][ADRENO] Documentation for Adreno
 (Texture) benchmarks (#13679)

Steps to reproduce Adreno (Texture) benchmarks via the Adreno docker on a given device.
---
 apps/benchmark/README.md | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index 43d93d9e00fa..ccac79df47d8 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -37,7 +37,8 @@ It is recommended that you run tuning by yourself if you have your customized ne
 Please follow the tutorial for
 [NVIDIA GPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_conv2d_cuda.html),
 [ARM CPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html),
-[Mobile GPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_mobile_gpu.html).
+[Mobile GPU](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_mobile_gpu.html) and
+[Adreno GPU](https://www.qualcomm.com/products/features/adreno).

 ### NVIDIA GPU

@@ -127,3 +128,16 @@ Build TVM with LLVM and ROCm enabled. [Help](https://tvm.apache.org/docs/install
 ```bash
 python3 gpu_imagenet_bench.py --model gfx900 --target rocm
 ```
+
+### Adreno GPU
+
+Adreno benchmarks are automated via the docker - [ci_adreno](https://github.com/apache/tvm/blob/main/docker/Dockerfile.ci_adreno).
+The Adreno docker shares the Android devices from the host. It is advised to keep the host adb version the same as the docker's, which is ```1.0.41```.
+
+The below command runs all the benchmarks on the given Android device.
+```bash
+export ANDROID_SERIAL=
+./tests/scripts/ci.py adreno -b
+```
+
+Note: The tuning cache is implicit through the tophub repo for all the benchmarks and is tuned on Snapdragon Gen 1.

From 381476e4413afc619148e08c28e8af4cba42a790 Mon Sep 17 00:00:00 2001
From: Alexey Gladyshev
Date: Wed, 4 Jan 2023 00:51:38 +0300
Subject: [PATCH 104/286] [ONNX] Add converter for QAttention from Microsoft
 onnxruntime contrib opset (#13654)

* init QAttention converter

* add type and shape checking

* add test for QAttention

* add tests for optional parameters

* change mask_index shape

* add support for 'past' input

* add support for 'unidirectional' attribute

* expand test coverage

* fix lint

* fix pylint

* fix batch dimension for topi/cuda/batch_matmul_tensorcore.py::batch_matmul_tensorcore_cuda

* code review fix
---
 python/tvm/relay/frontend/onnx.py          | 297 +++++++++++++++++++++
 tests/python/frontend/onnx/test_forward.py | 184 +++++++++++++
 2 files changed, 481 insertions(+)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index a8ab62602573..328b5d7bd8d7 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -1379,6 +1379,302 @@ def massage(tensor):
         return _expr.TupleWrapper(_expr.Tuple([output, present]), 2)


+class QAttention(OnnxOpConverter):
+    """Operator converter for QAttention from Microsoft onnxruntime contrib opset.
+
+    This is the self-attention mechanism used in transformer models.
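+
+    The converter dequantizes the quantized Q/K/V projections (computed with
+    QNN batch matmul), applies masked, optionally unidirectional, scaled
+    dot-product attention, and returns the attention output together with
+    the present key/value state.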
+ """ + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # ************************* Read attrs ************************* + num_heads = attr["num_heads"] + unidirectional = attr["unidirectional"] + + # ************************* Read inputs ************************* + # (batch, seq, in_hidden) + input_emb = inputs[0] + + # (in_hidden, 3 * out_hidden), where out_hidden = num_heads * head_size + weight = inputs[1] + + # (3 * out_hidden,) + bias = inputs[2] + + # Scale of quantized input tensor. + # Scalar, which means a per-tensor/layer quantization + input_scale = inputs[3] + + # Scale of quantized weight tensor. + # Scalar or a 1D tensor, which means a per-tensor/per-column quantization. + # Its size should be 3 * out_hidden if it is per-column quantization + weight_scale = inputs[4] + + # TODO(agladyshev): + # ORT documentation says that shape is (batch,), + # but in ORT source code we have following comment: + # 1. (batch_size) + # 2. (2 * batch_size) + # 3. (batch_size, 1) + # 4. (1, 1) + # 5. (batch_size, past_sequence_length + sequence_length) + # In practice, for GPT-2 there shape is (batch, past_seq_length + seq_length). + # Currently only (batch, past_seq_length + seq_length) shape is supported. + mask_index = inputs[5] + + # Zero point of quantized input tensor. + # Scalar, which means a per-tensor/layer quantization + input_zero_point = inputs[6] + + # Zero point of quantized weight tensor. + # Scalar or a 1D tensor, which means a per-tensor/per-column quantization. + # Its size should be 3 * out_hidden if it is per-column quantization + weight_zero_point = inputs[7] + + # (2, batch, num_heads, past_seq, head_size) + past = inputs[8] + + # ************************* Parse inputs ************************* + t1 = ["int8", "uint8"] + t2 = ["int8", "uint8"] + t3 = ["float32", "float16"] + t4 = ["int32"] + + # input + assert infer_type(input_emb).checked_type.dtype in t1 + assert ( + len(infer_shape(input_emb)) == 3 + ), "Input should be 3D tensor with shape (batch_size, sequence_length, input_hidden_size)" + (batch_size, seq_len, input_hidden) = infer_shape(input_emb) + assert input_hidden > 0, ( + "The weight tensor has (input_hidden_size, 3 * output_hidden_size) shape, so it doesn't" + f" make sense to have ({input_hidden}, 3 * output_hidden_size) weight tensor." + ) + assert seq_len > 0, ( + "The output tensor has (batch_size, sequence_length, hidden_size) shape," + f" so it doesn't make sense to have (batch_size, {seq_len}, hidden_size) output." 
+ ) + + # weight + assert infer_type(weight).checked_type.dtype in t2 + assert len(infer_shape(weight)) == 2, ( + "Weight should be 2D input tensor with shape (input_hidden_size, 3 * hidden_size), " + "hidden_size = num_heads * head_size" + ) + (input_hidden_weight, out_hidden_x3) = infer_shape(weight) + assert input_hidden == input_hidden_weight + assert out_hidden_x3 % 3 == 0, "output hidden shape should be divisible by 3: W_Q, W_K, W_V" + out_hidden = out_hidden_x3 // 3 + assert ( + out_hidden % num_heads == 0 + ), "output hidden size should be divisible by number of attention heads" + head_size = out_hidden // num_heads + + # bias + assert infer_type(bias).checked_type.dtype in t3 + assert ( + len(infer_shape(bias)) == 1 + ), "Bias should be 1D input tensor with shape (3 * hidden_size)" + (out_hidden_x3_bias,) = infer_shape(bias) + assert out_hidden_x3 == out_hidden_x3_bias + + # input_scale + assert infer_type(input_scale).checked_type.dtype in t3 + input_scale = get_scalar( + input_scale, params, dtype=infer_type(input_scale).checked_type.dtype + ) + + # weight_scale + assert infer_type(weight_scale).checked_type.dtype in t3 + # TODO(agladyshev): now QNN Batch Matmul only supports scalar types for scale and zero_point + weight_scale = get_scalar( + weight_scale, params, dtype=infer_type(weight_scale).checked_type.dtype + ) + + # mask_index + assert ( + mask_index is not None + ), "Attention import currently only supports required mask_index" + assert infer_type(mask_index).checked_type.dtype in t4 + mask_index_shape = infer_shape(mask_index) + assert ( + len(mask_index_shape) == 2 + and mask_index_shape[0] == batch_size + and mask_index_shape[1] >= seq_len + ), "currently only support (batch_size, sequence_length) mask index" + + # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel) + zero_point_zero = _expr.const(0, "int32") + + # input_zero_point + if input_zero_point is None: + input_zero_point = zero_point_zero + else: + assert infer_type(input_zero_point).checked_type.dtype in t1 + # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel) + input_zero_point = get_scalar(input_zero_point, params, dtype="int32") + + # weight_zero_point + if weight_zero_point is None: + weight_zero_point = zero_point_zero + else: + assert infer_type(weight_zero_point).checked_type.dtype in t2 + # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel) + weight_zero_point = get_scalar(weight_zero_point, params, dtype="int32") + + # past (2, batch_size, num_heads, past_sequence_length, head_size) + past_seq_len = 0 + if past is not None: + assert infer_type(past).checked_type.dtype in t3 + past_shape = infer_shape(past) + assert len(past_shape) == 5, "past should be 5D tensor" + assert ( + past_shape[0] == 2 + and past_shape[1] == batch_size + and past_shape[2] == num_heads + and past_shape[3] + seq_len == mask_index_shape[1] + and past_shape[4] == head_size + ) + past_seq_len = past_shape[3] + + # ************************* Create Relay ************************* + # Add batch dimension for QNN Batch Matmul + weight = _op.expand_dims(weight, 0, num_newaxis=1) + weight = _op.concatenate([weight] * batch_size, axis=0) + + # Split weight and biases and do the Matmul + w_Q, w_K, w_V = _op.split(weight, 3, axis=-1) + b_Q, b_K, b_V = _op.split(bias, 3, axis=-1) + + def qmatmul_dequantize_bias( + lhs, rhs, lhs_scale, rhs_scale, lhs_zero_point, rhs_zero_point, bias + ): + rhs_transposed = _op.transpose(rhs, axes=[0, 2, 1]) # QNN Batch Matmul do: X * 
Y^T + result = _qnn.op.batch_matmul( + lhs, rhs_transposed, lhs_zero_point, rhs_zero_point, lhs_scale, rhs_scale + ) + # In our case zero point and scale are scalar, therefore 'axis' doesn't matter + result = _qnn.op.dequantize( + result, + _op.multiply(lhs_scale, rhs_scale), + zero_point_zero, + ) + result = _op.add(result, bias) + return result + + Q = qmatmul_dequantize_bias( + input_emb, w_Q, input_scale, weight_scale, input_zero_point, weight_zero_point, b_Q + ) + K = qmatmul_dequantize_bias( + input_emb, w_K, input_scale, weight_scale, input_zero_point, weight_zero_point, b_K + ) + V = qmatmul_dequantize_bias( + input_emb, w_V, input_scale, weight_scale, input_zero_point, weight_zero_point, b_V + ) + + def split_into_heads(tensor): + """ + In the implementation of Multi-head attention we just split queries, keys, and values + we compute for a single-head attention into several parts: + (batch_size, num_heads, seq_len, head_size) + """ + tensor = _op.reshape(tensor, (batch_size, seq_len, num_heads, head_size)) + + # (batch_size, num_heads, seq_len, head_size) + tensor = _op.transpose(tensor, axes=[0, 2, 1, 3]) + + return tensor + + Q = split_into_heads(Q) + K = split_into_heads(K) + V = split_into_heads(V) + + # Concatenate (past_K, past_V) with (K, V) by sequence axis: + # (batch_size, num_heads, past_sequence_length + sequence_length, head_size) + if past is not None and past_seq_len > 0: + K_past, V_past = _op.split(past, 2, axis=0) + K = _op.concatenate([_op.squeeze(K_past, axis=[0]), K], axis=2) + V = _op.concatenate([_op.squeeze(V_past, axis=[0]), V], axis=2) + + # Prepare present state for Key and Value with shape + # (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size) + present = _op.stack([K, V], axis=0) + + def merge_first_dimensions(tensor): + """ + nn.batch_matmul is expecting 3D tensor: + (batch_size * num_heads, past_seq_len + seq_len, head_size) + """ + return _op.reverse_reshape(tensor, (-1, 0, 0)) + + Q = merge_first_dimensions(Q) + K = merge_first_dimensions(K) + V = merge_first_dimensions(V) + + att_scores = _op.nn.batch_matmul(Q, K, transpose_a=False, transpose_b=True) + score_dtype = infer_type(att_scores).checked_type.dtype + att_scores = _op.divide( + att_scores, + _op.const(np.sqrt(head_size), dtype=infer_type(att_scores).checked_type.dtype), + ) + att_scores = _op.reshape( + att_scores, (batch_size, num_heads, seq_len, past_seq_len + seq_len) + ) + + # Build the attention mask + att_mask = _op.cast(mask_index, score_dtype) + # Attention mask has value 0 or 1. Here we convert 0 to -10000, and 1 to 0. 
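+        # e.g. a mask row of [1, 1, 0] becomes [0, 0, -10000]: valid positions keep
+        # their scores, while padded positions receive a large negative score and
+        # effectively vanish after softmax.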
+ att_mask = _op.subtract(_op.const(1, dtype=score_dtype), att_mask) + att_mask = _op.multiply(att_mask, _op.const(-10000, dtype=score_dtype)) + # Expand for att_scores broadcast + # (batch_size, past_seq_len + seq_len) -> (batch_size, 1, seq_len, past_seq_len + seq_len) + att_mask = _op.expand_dims(att_mask, 1, num_newaxis=2) + att_mask = _op.concatenate([att_mask] * seq_len, axis=2) + + def create_unidirectional_mask(left_value, right_value): + numpy_unidirectional_mask = np.array( + [ + np.concatenate( + [ + np.full(past_seq_len + s_i + 1, left_value), + np.full(seq_len - s_i - 1, right_value), + ] + ) + for s_i in range(seq_len) + ] + ) + unidirectional_mask = _op.const(numpy_unidirectional_mask, dtype=score_dtype) + unidirectional_mask = _op.expand_dims(unidirectional_mask, 0, num_newaxis=2) + + return unidirectional_mask + + if unidirectional: + att_mask = _op.add(att_mask, create_unidirectional_mask(0, -10000)) + + # Apply the mask + att_scores = _op.add(att_scores, att_mask) + # TODO(agladyshev): + # Comment from ORT source code (onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h): + # "Fix unidirectional mask to be parity with huggingface implementation" + if unidirectional: + att_scores = _op.multiply(att_scores, create_unidirectional_mask(1, 0)) + att_scores = _op.add(att_scores, create_unidirectional_mask(0, -10000)) + + # Compute Softmax + att_scores = _op.reshape( + att_scores, (batch_size * num_heads, seq_len, past_seq_len + seq_len) + ) + att_probs = _op.nn.softmax(att_scores, axis=-1) + + # Compute output + output = _op.nn.batch_matmul(att_probs, V, transpose_a=False, transpose_b=False) + output = _op.reverse_reshape(output, (-1, num_heads, 0, 0)) + output = _op.transpose(output, axes=[0, 2, 1, 3]) + output = _op.reshape(output, (0, 0, out_hidden)) + + return _expr.TupleWrapper(_expr.Tuple([output, present]), 2) + + class Gemm(OnnxOpConverter): """Operator converter for Gemm.""" @@ -5716,6 +6012,7 @@ def _get_convert_map(opset): "EmbedLayerNormalization": EmbedLayerNormalization.get_converter(opset), "SkipLayerNormalization": SkipLayerNormalization.get_converter(opset), "Attention": Attention.get_converter(opset), + "QAttention": QAttention.get_converter(opset), "Exp": Renamer("exp"), "Greater": Renamer("greater"), "GreaterOrEqual": Renamer("greater_equal"), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 92a87ff6a72c..09206b341dd9 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5921,6 +5921,190 @@ def verify_attention(input_, weight, bias, mask_index, num_heads): verify_attention(input_array, weight, bias, mask_index, num_heads) +@tvm.testing.parametrize_targets +def test_qattention(target, dev): + """test_qattention""" + + def verify_attention( + _unidirectional, + _input, + _weight, + _bias, + _input_scale, + _weight_scale, + _mask_index=None, + _input_zero_point=None, + _weight_zero_point=None, + _past=None, + ): + input_names = ["input", "weight", "bias", "input_scale", "weight_scale"] + if _mask_index is not None: + input_names.append("mask_index") + if _input_zero_point is not None: + input_names.append("input_zero_point") + if _weight_zero_point is not None: + input_names.append("weight_zero_point") + if _past is not None: + input_names.append("past") + + node = onnx.helper.make_node( + "QAttention", + inputs=input_names, + outputs=["output", "present"], + domain="com.microsoft", + num_heads=num_heads, + unidirectional=_unidirectional, + ) + + 
past_shape = (2, batch_size, num_heads, past_sequence_length, head_size) + present_output_shape = ( + 2, + batch_size, + num_heads, + past_sequence_length + sequence_length, + head_size, + ) + + inputs_info = [ + helper.make_tensor_value_info("input", TensorProto.UINT8, list(_input.shape)), + helper.make_tensor_value_info("weight", TensorProto.UINT8, list(_weight.shape)), + helper.make_tensor_value_info("bias", TensorProto.FLOAT, list(_bias.shape)), + helper.make_tensor_value_info("input_scale", TensorProto.FLOAT, ()), + helper.make_tensor_value_info("weight_scale", TensorProto.FLOAT, ()), + ] + if _mask_index is not None: + inputs_info.append( + helper.make_tensor_value_info( + "mask_index", TensorProto.INT32, list(_mask_index.shape) + ) + ) + if _input_zero_point is not None: + inputs_info.append( + helper.make_tensor_value_info("input_zero_point", TensorProto.UINT8, ()) + ) + if _weight_zero_point is not None: + inputs_info.append( + helper.make_tensor_value_info("weight_zero_point", TensorProto.UINT8, ()) + ) + if _past is not None: + inputs_info.append( + helper.make_tensor_value_info("past", TensorProto.FLOAT, list(past_shape)) + ) + + graph = helper.make_graph( + [node], + "qattention_test", + inputs=inputs_info, + outputs=[ + helper.make_tensor_value_info("output", TensorProto.FLOAT, list(_input.shape)), + helper.make_tensor_value_info( + "present", TensorProto.FLOAT, list(present_output_shape) + ), + ], + ) + + model = helper.make_model(graph, producer_name="qattention_test") + + inputs = [_input, _weight, _bias, _input_scale, _weight_scale] + if _mask_index is not None: + inputs.append(_mask_index) + if _input_zero_point is not None: + inputs.append(_input_zero_point) + if _weight_zero_point is not None: + inputs.append(_weight_zero_point) + if _past is not None: + inputs.append(_past) + + verify_with_ort_with_inputs( + model, + inputs, + [_input.shape, present_output_shape], + target=target, + dev=dev, + rtol=1e-3, + atol=1e-3, + ) + + batch_size = 11 + num_heads = 13 + head_size = 37 + sequence_length = 7 + input_hidden_size = 147 + weight_hidden_size = num_heads * head_size + past_sequence_length = 17 + + total_sequence_length = past_sequence_length + sequence_length + + # Required inputs + input_array = np.random.randint( + 0, 255, (batch_size, sequence_length, input_hidden_size) + ).astype("uint8") + weight = np.random.randint(0, 255, (input_hidden_size, 3 * weight_hidden_size)).astype("uint8") + bias = np.random.randn(3 * weight_hidden_size).astype("float32") + input_scale = np.random.random(1).astype("float32") + weight_scale = np.random.random(1).astype("float32") + + # Optional inputs + input_zero_point = np.random.randint(0, 255, 1).astype("uint8") + weight_zero_point = np.random.randint(0, 255, 1).astype("uint8") + past = np.random.random((2, batch_size, num_heads, past_sequence_length, head_size)).astype( + "float32" + ) + + for unidirectional in [0, 1]: + for have_past in [False, True]: + if not have_past: + mask_index = np.random.randint(0, 2, (batch_size, sequence_length)).astype("int32") + + verify_attention( + unidirectional, + input_array, + weight, + bias, + input_scale, + weight_scale, + mask_index, + ) + verify_attention( + unidirectional, + input_array, + weight, + bias, + input_scale, + weight_scale, + mask_index, + input_zero_point, + ) + verify_attention( + unidirectional, + input_array, + weight, + bias, + input_scale, + weight_scale, + mask_index, + input_zero_point, + weight_zero_point, + ) + else: + mask_index = np.random.randint(0, 2, (batch_size, 
total_sequence_length)).astype( + "int32" + ) + + verify_attention( + unidirectional, + input_array, + weight, + bias, + input_scale, + weight_scale, + mask_index, + input_zero_point, + weight_zero_point, + past, + ) + + @tvm.testing.parametrize_targets def test_skiplayernormalization(target, dev): """test_skiplayernormalization""" From d375d0e5d33ae04e1291b3cee3631abe6073c760 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Tue, 3 Jan 2023 14:55:03 -0800 Subject: [PATCH 105/286] [BugFix][Runtime] Add missing check for `PackedFunc` (#13687) * Add missing check for `PackedFunc` * delete duplicated line --- include/tvm/runtime/packed_func.h | 5 +++++ tests/cpp/packed_func_test.cc | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index a4054c71f335..636700910d04 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -1903,6 +1903,11 @@ inline TVMRetValue& TVMRetValue::operator=(TObjectRef other) { ptr->IsInstance())) { return operator=(Module(std::move(other.data_))); } + if (std::is_base_of::value || + (std::is_base_of::value && + ptr->IsInstance())) { + return operator=(PackedFunc(std::move(other.data_))); + } SwitchToObject(kTVMObjectHandle, std::move(other.data_)); } else { SwitchToPOD(kTVMNullptr); diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index ef72d03cf9ce..183aca1385a7 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -156,6 +156,18 @@ TEST(PackedFunc, Type) { ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2)); } +TEST(PackedFunc, AsTVMRetValue) { + using namespace tvm; + using namespace tvm::runtime; + ObjectRef obj = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PrimExpr x = args[0]; + *rv = x.as()->value + 1; + }); + TVMRetValue value; + value = obj; + ICHECK_EQ(value.operator PackedFunc()(1).operator int(), 2); +} + TEST(TypedPackedFunc, HighOrder) { using namespace tvm; using namespace tvm::runtime; From 43f1913e70fac7aaba274ed3f10c081fe7d197a0 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 3 Jan 2023 18:13:47 -0800 Subject: [PATCH 106/286] [COMMUNITY] @blackkker -> Reviewer (#13686) Please join us to welcome @blackkker as a new reviewer to TVM. @blackkker contributed extensively across different layers of the system, including layout transform, PaddlePaddle, TFLite, and ONNX frontend. - [Commits History](https://github.com/apache/tvm/commits?author=blackkker) - [Code Review](https://github.com/apache/tvm/pulls?q=reviewed-by:blackkker) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 84615e9fc60b..79df9186b995 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -204,6 +204,7 @@ We do encourage everyone to work anything they are interested in. 
- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - [Min Chen](https://github.com/multiverstack-intellif): @multiverstack-intellif - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh +- [@blackkker](https://github.com/blackkker): @blackkker ## List of Contributors - [Full List of Contributors](https://github.com/apache/tvm/graphs/contributors) From f554f35af00cd8ac98014290461a3720b06f8d5d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 3 Jan 2023 20:14:39 -0600 Subject: [PATCH 107/286] [Git] Ignore python/requirements directory (#13684) This directory is automatically generated by the `python/gen_requirements.py` script, and should be ignored by git whenever present. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index e9b9743f1359..03c0a0bc6af9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,9 @@ var/ *.manifest *.spec +# Generated by python/gen_requirements.py +python/requirements/*.txt + # Installer logs pip-log.txt pip-delete-this-directory.txt From 5a043c9b059f8f33b87bb52d021551588c404ace Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 4 Jan 2023 10:23:24 +0800 Subject: [PATCH 108/286] [Schedule][Bugfix] Fix decompose padding wrt the single child subtree (#13646) Fix bug when decompose padding wrt the single child subtree --- .../schedule/primitive/decompose_padding.cc | 17 +++-- .../test_tir_schedule_decompose_padding.py | 63 +++++++++++++++++++ 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/src/tir/schedule/primitive/decompose_padding.cc b/src/tir/schedule/primitive/decompose_padding.cc index c41760876722..e657b4f4663d 100644 --- a/src/tir/schedule/primitive/decompose_padding.cc +++ b/src/tir/schedule/primitive/decompose_padding.cc @@ -114,6 +114,10 @@ class PaddingInfoAnalyzer { // Step 3. Analyze in-bound write region. 
PrimExpr in_bound_predicate = RewritePredicate(pad_predicate && realize->predicate); + if (analyzer_->CanProveEqual(in_bound_predicate, 1)) { + SetError("The in-bound predicate is trivial"); + return false; + } Array in_bound_region = this->EstimateInBoundRegion( /*iter_values=*/realize->iter_values, /*dom_map=*/dom_map, /*in_bound_predicate=*/in_bound_predicate); @@ -439,13 +443,14 @@ StmtSRef DecomposePaddingImpl(ScheduleState self, const StmtSRef& block_sref, analyzer.Bind(cur_loop->loop_var, range); loops.push_back(cur_loop); - if (!found_const_filling_pos) { - if (cur_loop.same_as(const_filling_pos)) { - found_const_filling_pos = true; + if (cur_loop.same_as(const_filling_pos)) { + ICHECK(!found_const_filling_pos); + found_const_filling_pos = true; + if (!found_in_bound_filling_pos) { + found_in_bound_filling_pos = true; + in_bound_filling_pos = cur_loop; } - } - - if (!found_in_bound_filling_pos) { + } else if (!found_in_bound_filling_pos) { if (!cur_loop->body->IsInstance() && !cur_loop->body->IsInstance()) { found_in_bound_filling_pos = true; diff --git a/tests/python/unittest/test_tir_schedule_decompose_padding.py b/tests/python/unittest/test_tir_schedule_decompose_padding.py index a3fc4326a3c9..ead8b0b33262 100644 --- a/tests/python/unittest/test_tir_schedule_decompose_padding.py +++ b/tests/python/unittest/test_tir_schedule_decompose_padding.py @@ -309,5 +309,68 @@ def pooling_decompose_3( check_decompose_padding(sum_pool_2d, sch.mod["main"], pooling_decompose_3, check_run=True) +def test_decompose_wrt_single_child_subtree(): + """Test the case when the decompose position is under the single child subtree""" + + @T.prim_func + def pad_op( + x: T.Buffer[(1, 16, 225, 225), "int8"], y: T.Buffer([1, 16, 231, 231], dtype="int8") + ): + for i0, i1, i2, i3 in T.grid(1, 16, 231, 231): + with T.block("pad_temp"): + ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + y[ax0, ax1, ax2, ax3] = T.if_then_else( + 3 <= ax2 and ax2 < 228 and 3 <= ax3 and ax3 < 228, + x[ax0, ax1, ax2 - 3, ax3 - 3], + T.int8(0), + dtype="int8", + ) + + @T.prim_func + def pad_op_after( + x: T.Buffer[(1, 16, 225, 225), "int8"], y: T.Buffer[(1, 16, 231, 231), "int8"] + ): + for i0, i1 in T.grid(1, 16): + for i2, i3 in T.grid(231, 231): + with T.block("pad_temp_pad_const"): + ax0 = T.axis.spatial(1, 0) + ax1, ax2, ax3 = T.axis.remap("SSS", [i1, i2, i3]) + y[ax0, ax1, ax2, ax3] = T.int8(0) + for i2, i3 in T.grid(225, 225): + with T.block("pad_temp"): + ax0 = T.axis.spatial(1, 0) + ax1, ax2, ax3 = T.axis.remap("SSS", [i1, i2, i3]) + y[ax0, ax1, ax2 + 3, ax3 + 3] = x[ax0, ax1, ax2, ax3] + + sch = tir.Schedule(pad_op, debug_mask="all") + pad = sch.get_block("pad_temp") + _, _, h, _ = sch.get_loops(pad) + sch.decompose_padding(pad, h) + check_decompose_padding(pad_op, sch.mod["main"], pad_op_after, check_run=True) + + +def test_not_to_decompose_trivial_predicate(): + """Test the case when the padding condition is trivial""" + + @T.prim_func + def trivial_pad( + x: T.Buffer[(1, 16, 225, 225), "int8"], y: T.Buffer([1, 16, 225, 225], dtype="int8") + ): + for i0, i1, i2, i3 in T.grid(1, 16, 225, 225): + with T.block("pad_temp"): + ax0, ax1, ax2, ax3 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + y[ax0, ax1, ax2, ax3] = T.if_then_else( + 0 <= ax2 and ax2 < 225 and 0 <= ax3 and ax3 < 225, + x[ax0, ax1, ax2, ax3], + T.int8(0), + dtype="int8", + ) + + sch = tir.Schedule(trivial_pad, debug_mask="all") + pad = sch.get_block("pad_temp") + _, _, h, _ = sch.get_loops(pad) + assert not sch.can_decompose_padding(pad, h) + + if 
__name__ == "__main__": tvm.testing.main() From 942abff3c3f53cf595ef9009e8b8db74ec3cd49d Mon Sep 17 00:00:00 2001 From: Janet Schneider <21978033+janetsc@users.noreply.github.com> Date: Wed, 4 Jan 2023 08:44:20 -0800 Subject: [PATCH 109/286] [Hexagon] Remove temporary VTCM workspace APIs (#13681) --- src/runtime/hexagon/hexagon_device_api.cc | 18 ++---------------- src/runtime/hexagon/hexagon_device_api.h | 14 -------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index a1d55db42f28..ee2a826b02ea 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -157,20 +157,6 @@ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } -void* HexagonDeviceAPI::AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, - DLDataType dtype, Optional mem_scope) { - // must be Hexagon device (not CPU) - CHECK(dev.device_type == kDLHexagon) << "dev.device_type: " << dev.device_type; - CHECK((ndim == 1 || ndim == 2) && "Hexagon Device API supports only 1d and 2d allocations"); - return AllocDataSpace(dev, ndim, shape, dtype, mem_scope); -} - -void HexagonDeviceAPI::FreeVtcmWorkspace(Device dev, void* ptr) { - // must be Hexagon device (not CPU) - CHECK(dev.device_type == kDLHexagon) << "dev.device_type: " << dev.device_type; - FreeDataSpace(dev, ptr); -} - void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { CHECK_EQ(from->byte_offset, 0); CHECK_EQ(to->byte_offset, 0); @@ -268,7 +254,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.alloc_nd").set_body([](TVMArgs args, TVM type_hint.lanes = 1; HexagonDeviceAPI* hexapi = HexagonDeviceAPI::Global(); - *rv = hexapi->AllocVtcmWorkspace(dev, ndim, shape, type_hint, String(scope)); + *rv = hexapi->AllocDataSpace(dev, ndim, shape, type_hint, String(scope)); }); TVM_REGISTER_GLOBAL("device_api.hexagon.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) { @@ -283,7 +269,7 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.free_nd").set_body([](TVMArgs args, TVMR dev.device_id = device_id; HexagonDeviceAPI* hexapi = HexagonDeviceAPI::Global(); - hexapi->FreeVtcmWorkspace(dev, ptr); + hexapi->FreeDataSpace(dev, ptr); *rv = static_cast(0); }); diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index e3adaf65548d..c4e87a957ade 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -138,20 +138,6 @@ class HexagonDeviceAPI final : public DeviceAPI { void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, Optional mem_scope) final; - /*! - * \brief Allocate an Nd VTCM workspace. - * \param dev The device to perform the operation. - * \param ndim The number of dimensions of allocated tensor. - * \param shape The shape of allocated tensor. - * \param dtype The element type. - * \return The allocated HexagonBuffer pointer. - */ - void* AllocVtcmWorkspace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional mem_scope); - - //! \brief Free the allocated Nd VTCM workspace. - void FreeVtcmWorkspace(Device dev, void* ptr); - /*! * \brief Copy data from one storage to another. * \note This API is designed to support special memory with shape dependent layout. 
From 91c8004c8855aaea7e73c07ba48e3a76498352a3 Mon Sep 17 00:00:00 2001
From: AndrewZhaoLuo
Date: Wed, 4 Jan 2023 09:40:26 -0800
Subject: [PATCH 110/286] [Contrib][Sort] Faster Top-K Implementation (#13599)

This is a simple rewrite of the hand-coded top-k function used for CPU
targets. The old implementation sorted each axis and then took the biggest k
elements. The new implementation does a single pass of each axis, keeping a
min heap to store the top-k elements up to that point.

If n is the size of the array, and we want to find the top k, the old
implementation has runtime in O(n log n) with additional memory O(n) to store
the sorted array. The new implementation is O(n log k), is in practice
probably amortized to O(n / k * log k) in many scenarios, and only requires
O(k) additional memory. Note n >> k most of the time.

In practice this new kernel led to a 20x speedup over the existing one. On a
Xeon Platinum 8370C CPU @ 2.80GHz for input shape [1, 3050] with k = 15, the
latency went from 200us --> ~10us. There is probably more room for shaving
off a little more time on the scale of single microseconds; however, I have
determined it not to be worth it.
---
 src/runtime/contrib/sort/sort.cc | 91 +++++++++++++++++++++++---------
 1 file changed, 67 insertions(+), 24 deletions(-)

diff --git a/src/runtime/contrib/sort/sort.cc b/src/runtime/contrib/sort/sort.cc
index 8ea2f4b60cdf..bfb174a9206e 100644
--- a/src/runtime/contrib/sort/sort.cc
+++ b/src/runtime/contrib/sort/sort.cc
@@ -34,13 +34,25 @@ namespace contrib {

 using namespace runtime;

-template <typename DataType>
+template <typename DataType, bool stable_comparison = false>
 bool CompareAscend(const std::pair<int64_t, DataType>& lhs, const std::pair<int64_t, DataType>& rhs) {
+  if constexpr (stable_comparison) {
+    if (lhs.second == rhs.second) {
+      return lhs.first < rhs.first;
+    }
+  }
+
   return lhs.second < rhs.second;
 }

-template <typename DataType>
+template <typename DataType, bool stable_comparison = false>
 bool CompareDescend(const std::pair<int64_t, DataType>& lhs, const std::pair<int64_t, DataType>& rhs) {
+  if constexpr (stable_comparison) {
+    if (lhs.second == rhs.second) {
+      return lhs.first < rhs.first;
+    }
+  }
+
   return lhs.second > rhs.second;
 }

@@ -49,18 +61,14 @@ struct float16 {
   float to_float() const { return __extendXfYf2__(bits); }
-};

-template <>
-bool CompareAscend(const std::pair<int64_t, float16>& lhs, const std::pair<int64_t, float16>& rhs) {
-  return lhs.second.to_float() < rhs.second.to_float();
-}
-
-template <>
-bool CompareDescend(const std::pair<int64_t, float16>& lhs,
-                    const std::pair<int64_t, float16>& rhs) {
-  return lhs.second.to_float() > rhs.second.to_float();
-}
+  inline bool operator==(const float16& rhs) const { return to_float() == rhs.to_float(); }
+  inline bool operator!=(const float16& rhs) const { return to_float() != rhs.to_float(); }
+  inline bool operator<(const float16& rhs) const { return to_float() < rhs.to_float(); }
+  inline bool operator>(const float16& rhs) const { return to_float() > rhs.to_float(); }
+  inline bool operator<=(const float16& rhs) const { return to_float() <= rhs.to_float(); }
+  inline bool operator>=(const float16& rhs) const { return to_float() >= rhs.to_float(); }
+};

 // Argsort implemented C library sort for nms.
 // Return indices of sorted tensor.
@@ -346,7 +354,12 @@ void topk(DLTensor* input, DLTensor* out_values, DLTensor* out_indices, int k, i
       (out_values == nullptr) ? nullptr : static_cast<DataType*>(out_values->data);
   IndicesType* indices_ptr =
       (out_indices == nullptr) ?
nullptr : static_cast<IndicesType*>(out_indices->data);
-  std::vector<std::pair<int64_t, DataType>> sorter;
+
+  // Maintain a min/max heap containing the top-k elements
+  std::vector<std::pair<int64_t, DataType>> running_heap;
+
+  // Need +1 when inserting a new element before restoring the heap invariant
+  running_heap.reserve(k + 1);

   int axis_mul_before = 1;
   int axis_mul_after = 1;

   for (int i = 0; i < axis_mul_before; ++i) {
     for (int j = 0; j < axis_mul_after; ++j) {
-      sorter.clear();
+      running_heap.clear();
       int64_t src_base_idx = i * input->shape[axis] * axis_mul_after + j;
       int64_t dst_base_idx = i * k * axis_mul_after + j;
-      for (int64_t kk = 0; kk < input->shape[axis]; ++kk) {
-        int64_t full_idx = src_base_idx + kk * axis_mul_after;
-        sorter.emplace_back(std::make_pair(kk, data_ptr[full_idx]));
+
+      // Start by creating a min/max heap with the first k elements
+      int cur_axis_index = 0;
+      for (; cur_axis_index < k && cur_axis_index < input->shape[axis]; cur_axis_index++) {
+        int64_t full_idx = src_base_idx + cur_axis_index * axis_mul_after;
+        running_heap.emplace_back(std::make_pair(cur_axis_index, data_ptr[full_idx]));
+      }
+      if (!is_ascend) {
+        std::make_heap(running_heap.begin(), running_heap.end(), CompareDescend<DataType>);
+      } else {
+        std::make_heap(running_heap.begin(), running_heap.end(), CompareAscend<DataType>);
+      }
+
+      // Iterate through the remaining elements, updating the heap along the way
+      for (; cur_axis_index < input->shape[axis]; cur_axis_index++) {
+        int64_t full_idx = src_base_idx + cur_axis_index * axis_mul_after;
+        std::pair<int64_t, DataType> cur_val = {cur_axis_index, data_ptr[full_idx]};
+
+        // Eq. to cur_val.second > running_heap[0].second
+        if (!is_ascend && CompareDescend<DataType>(cur_val, running_heap[0])) {
+          running_heap.push_back(cur_val);
+          std::push_heap(running_heap.begin(), running_heap.end(), CompareDescend<DataType>);
+          std::pop_heap(running_heap.begin(), running_heap.end(), CompareDescend<DataType>);
+          running_heap.pop_back();
+        } else if (is_ascend && CompareAscend<DataType>(cur_val, running_heap[0])) {
+          running_heap.push_back(cur_val);
+          std::push_heap(running_heap.begin(), running_heap.end(), CompareAscend<DataType>);
+          std::pop_heap(running_heap.begin(), running_heap.end(), CompareAscend<DataType>);
+          running_heap.pop_back();
+        }
+      }
+
+      // Finally, sort the heap and deliver the results
       if (is_ascend) {
-        std::stable_sort(sorter.begin(), sorter.end(), CompareAscend<DataType>);
+        std::stable_sort(running_heap.begin(), running_heap.end(), CompareAscend<DataType>);
       } else {
-        std::stable_sort(sorter.begin(), sorter.end(), CompareDescend<DataType>);
+        std::stable_sort(running_heap.begin(), running_heap.end(), CompareDescend<DataType>);
       }
-      int64_t cnt = k > 0 ? k : input->shape[axis];
-      for (int64_t kk = 0; kk < cnt; ++kk) {
+
+      for (uint32_t kk = 0; kk < running_heap.size(); ++kk) {
         if (indices_ptr != nullptr) {
           indices_ptr[dst_base_idx + kk * axis_mul_after] =
-              static_cast<IndicesType>(sorter[kk].first);
+              static_cast<IndicesType>(running_heap[kk].first);
         }
         if (values_ptr != nullptr) {
-          values_ptr[dst_base_idx + kk * axis_mul_after] = static_cast<DataType>(sorter[kk].second);
+          values_ptr[dst_base_idx + kk * axis_mul_after] =
+              static_cast<DataType>(running_heap[kk].second);
         }
       }
     }

From 47fc772529e88012bffe7a6065c1a6df6bd0ca41 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Wed, 4 Jan 2023 11:40:54 -0600
Subject: [PATCH 111/286] [Build][Bugfix] Use CMAKE_ prefix for
 _COMPILER_LAUNCHER (#13697)

Previously, when using `set(USE_CCACHE AUTO)`, the cmake config would set
variables `CXX_COMPILER_LAUNCHER` and `C_COMPILER_LAUNCHER`.
---
 cmake/utils/CCache.cmake  | 10 +++++-----
 cmake/utils/Summary.cmake |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmake/utils/CCache.cmake b/cmake/utils/CCache.cmake
index f38a36b5dee8..446542b4b0be 100644
--- a/cmake/utils/CCache.cmake
+++ b/cmake/utils/CCache.cmake
@@ -16,11 +16,11 @@
 # under the License.
 if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache
-  if(DEFINED CXX_COMPILER_LAUNCHER OR DEFINED C_COMPILER_LAUNCHER)
+  if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER OR DEFINED CMAKE_C_COMPILER_LAUNCHER)
     if("${USE_CCACHE}" STREQUAL "AUTO")
-      message(STATUS "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER already defined, not using ccache")
+      message(STATUS "CMAKE_CXX_COMPILER_LAUNCHER or CMAKE_C_COMPILER_LAUNCHER already defined, not using ccache")
     elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN})
-      message(FATAL_ERROR "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER is already defined, refusing to override with ccache. Either unset or disable ccache.")
+      message(FATAL_ERROR "CMAKE_CXX_COMPILER_LAUNCHER or CMAKE_C_COMPILER_LAUNCHER is already defined, refusing to override with ccache. Either unset or disable ccache.")
     endif()
   else()
     if("${USE_CCACHE}" STREQUAL "AUTO") # Auto mode
@@ -45,8 +45,8 @@ if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache
     endif()
     # Set the flag for ccache
     if(DEFINED PATH_TO_CCACHE)
-      set(CXX_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
-      set(C_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
+      set(CMAKE_CXX_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
+      set(CMAKE_C_COMPILER_LAUNCHER "${PATH_TO_CCACHE}")
     endif()
   endif()
 endif(USE_CCACHE)
diff --git a/cmake/utils/Summary.cmake b/cmake/utils/Summary.cmake
index e3ea925a9ae1..acb5703f600a 100644
--- a/cmake/utils/Summary.cmake
+++ b/cmake/utils/Summary.cmake
@@ -42,7 +42,7 @@ macro(print_summary)
   message(STATUS "  C++ compiler ID       : ${CMAKE_CXX_COMPILER_ID}")
   message(STATUS "  C++ compiler version  : ${CMAKE_CXX_COMPILER_VERSION}")
   message(STATUS "  CXX flags             : ${CMAKE_CXX_FLAGS}")
-  message(STATUS "  CXX launcher          : ${CXX_COMPILER_LAUNCHER}")
+  message(STATUS "  CXX launcher          : ${CMAKE_CXX_COMPILER_LAUNCHER}")
   message(STATUS "  Linker flags          : ${CMAKE_SHARED_LINKER_FLAGS}")
   message(STATUS "  Build type            : ${CMAKE_BUILD_TYPE}")
   get_directory_property(READABLE_COMPILE_DEFS DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)

From 4d816b772146175a54ed0a0b6f83f5a613ff72cc Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Wed, 4 Jan 2023 09:48:32 -0800
Subject: [PATCH 112/286] [microTVM]Add default value to unspecified project
 options in project API (#13610)

In current TVM, the project API ignores the default values that are set for
each project option in a project API server. This results in extra function
handlers just to supply the default value for those options at runtime.

This PR searches through the available ProjectOptions of a project API server
and fills in default values for any unspecified options before sending the
options to the API call function handler.
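In outline, the new behavior is the following (a condensed sketch of the
`add_unspecified_options` helper this PR adds to python/tvm/micro/project.py;
the example option list is illustrative):

def add_unspecified_options(options: dict, server_project_options: list) -> dict:
    """Fill in server-declared defaults for options the user did not pass."""
    options = dict(options or {})
    for option in server_project_options:
        # Each entry is a ProjectOption serialized by the API server,
        # e.g. {"name": "verbose", "default": False, ...}.
        if option["name"] not in options:
            options[option["name"]] = option["default"]
    return options

# Hypothetical server metadata and a partial user config:
server_opts = [{"name": "verbose", "default": False},
               {"name": "west_cmd", "default": "west"}]
print(add_unspecified_options({"west_cmd": "west-custom"}, server_opts))
# -> {'west_cmd': 'west-custom', 'verbose': False}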
--- .../template_project/microtvm_api_server.py | 46 +++--- .../template_project/microtvm_api_server.py | 137 +++++++----------- python/tvm/micro/project.py | 15 +- python/tvm/micro/project_api/server.py | 2 +- tests/micro/arduino/conftest.py | 13 -- .../arduino/test_arduino_error_detection.py | 6 +- .../micro/arduino/test_arduino_rpc_server.py | 48 ++---- tests/micro/arduino/test_arduino_workflow.py | 6 +- tests/micro/arduino/test_utils.py | 3 +- .../test_arduino_microtvm_api_server.py | 7 +- tests/micro/zephyr/test_zephyr.py | 1 - 11 files changed, 110 insertions(+), 174 deletions(-) diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py index 4975d924dac1..4e8ee32c3719 100644 --- a/apps/microtvm/arduino/template_project/microtvm_api_server.py +++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py @@ -56,15 +56,6 @@ raise FileNotFoundError(f"Board file {{{BOARDS}}} does not exist.") -def get_cmsis_path(cmsis_path: pathlib.Path) -> pathlib.Path: - """Returns CMSIS dependency path""" - if cmsis_path: - return pathlib.Path(cmsis_path) - if os.environ.get("CMSIS_PATH"): - return pathlib.Path(os.environ.get("CMSIS_PATH")) - assert False, "'cmsis_path' option not passed!" - - class BoardAutodetectFailed(Exception): """Raised when no attached hardware is found matching the requested board""" @@ -78,7 +69,11 @@ class BoardAutodetectFailed(Exception): ) + [ server.ProjectOption( "arduino_cli_cmd", - required=(["generate_project", "flash", "open_transport"] if not ARDUINO_CLI_CMD else None), + required=( + ["generate_project", "build", "flash", "open_transport"] + if not ARDUINO_CLI_CMD + else None + ), optional=( ["generate_project", "build", "flash", "open_transport"] if ARDUINO_CLI_CMD else None ), @@ -337,7 +332,7 @@ def _copy_cmsis(self, project_path: pathlib.Path, cmsis_path: str): However, the latest release does not include header files that are copied in this function. """ (project_path / "include" / "cmsis").mkdir() - cmsis_path = get_cmsis_path(cmsis_path) + cmsis_path = pathlib.Path(cmsis_path) for item in self.CMSIS_INCLUDE_HEADERS: shutil.copy2( cmsis_path / "CMSIS" / "NN" / "Include" / item, @@ -357,7 +352,7 @@ def _populate_makefile( flags = { "FQBN": self._get_fqbn(board), "VERBOSE_FLAG": "--verbose" if verbose else "", - "ARUINO_CLI_CMD": self._get_arduino_cli_cmd(arduino_cli_cmd), + "ARUINO_CLI_CMD": arduino_cli_cmd, "BOARD": board, "BUILD_EXTRA_FLAGS": build_extra_flags, } @@ -377,9 +372,10 @@ def _populate_makefile( def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): # List all used project options board = options["board"] - verbose = options.get("verbose") project_type = options["project_type"] - arduino_cli_cmd = options.get("arduino_cli_cmd") + arduino_cli_cmd = options["arduino_cli_cmd"] + verbose = options["verbose"] + cmsis_path = options.get("cmsis_path") compile_definitions = options.get("compile_definitions") extra_files_tar = options.get("extra_files_tar") @@ -455,12 +451,6 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec build_extra_flags, ) - def _get_arduino_cli_cmd(self, arduino_cli_cmd: str): - if not arduino_cli_cmd: - arduino_cli_cmd = ARDUINO_CLI_CMD - assert arduino_cli_cmd, "'arduino_cli_cmd' command not passed and not found by default!" 
- return arduino_cli_cmd - def _get_platform_version(self, arduino_cli_path: str) -> float: # sample output of this command: # 'arduino-cli alpha Version: 0.18.3 Commit: d710b642 Date: 2021-05-14T12:36:58Z\n' @@ -494,11 +484,10 @@ def _get_fqbn(self, board: str): def build(self, options): # List all used project options - arduino_cli_cmd = options.get("arduino_cli_cmd") + arduino_cli_cmd = options["arduino_cli_cmd"] warning_as_error = options.get("warning_as_error") - cli_command = self._get_arduino_cli_cmd(arduino_cli_cmd) - self._check_platform_version(cli_command, warning_as_error) + self._check_platform_version(arduino_cli_cmd, warning_as_error) compile_cmd = ["make", "build"] # Specify project to compile subprocess.run(compile_cmd, check=True, cwd=API_SERVER_DIR) @@ -539,7 +528,7 @@ def _parse_connected_boards(self, tabular_str): def _auto_detect_port(self, arduino_cli_cmd: str, board: str) -> str: # It is assumed only one board with this type is connected to this host machine. - list_cmd = [self._get_arduino_cli_cmd(arduino_cli_cmd), "board", "list"] + list_cmd = [arduino_cli_cmd, "board", "list"] list_cmd_output = subprocess.run( list_cmd, check=True, stdout=subprocess.PIPE ).stdout.decode("utf-8") @@ -599,7 +588,7 @@ def _get_board_from_makefile(self, makefile_path: pathlib.Path) -> str: def flash(self, options): # List all used project options - arduino_cli_cmd = options.get("arduino_cli_cmd") + arduino_cli_cmd = options["arduino_cli_cmd"] warning_as_error = options.get("warning_as_error") port = options.get("port") board = options.get("board") @@ -608,9 +597,8 @@ def flash(self, options): if not board: board = self._get_board_from_makefile(API_SERVER_DIR / MAKEFILE_FILENAME) - cli_command = self._get_arduino_cli_cmd(arduino_cli_cmd) - self._check_platform_version(cli_command, warning_as_error) - port = self._get_arduino_port(cli_command, board, port, serial_number) + self._check_platform_version(arduino_cli_cmd, warning_as_error) + port = self._get_arduino_port(arduino_cli_cmd, board, port, serial_number) upload_cmd = ["make", "flash", f"PORT={port}"] for _ in range(self.FLASH_MAX_RETRIES): @@ -639,7 +627,7 @@ def open_transport(self, options): import serial.tools.list_ports # List all used project options - arduino_cli_cmd = options.get("arduino_cli_cmd") + arduino_cli_cmd = options["arduino_cli_cmd"] port = options.get("port") board = options.get("board") serial_number = options.get("serial_number") diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index 9a8015d62571..b0cd21e4adb2 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -175,7 +175,7 @@ def _find_board_from_cmake_file(cmake_file: Union[str, pathlib.Path]) -> str: def _find_platform_from_cmake_file(cmake_file: Union[str, pathlib.Path]) -> str: emu_platform = None - with open(API_SERVER_DIR / CMAKELIST_FILENAME) as cmake_f: + with open(cmake_file) as cmake_f: for line in cmake_f: set_platform = re.match("set\(EMU_PLATFORM (.*)\)", line) if set_platform: @@ -184,14 +184,14 @@ def _find_platform_from_cmake_file(cmake_file: Union[str, pathlib.Path]) -> str: return emu_platform -def _get_device_args(options): +def _get_device_args(serial_number: str = None): flash_runner = _get_flash_runner() if flash_runner == "nrfjprog": - return _get_nrf_device_args(options) + return _get_nrf_device_args(serial_number) if flash_runner == "openocd": - return 
_get_openocd_device_args(options) + return _get_openocd_device_args(serial_number) raise BoardError( f"Don't know how to find serial terminal for board {_find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME)} with flash " @@ -199,14 +199,8 @@ def _get_device_args(options): ) -def _get_board_mem_size_bytes(options): - board_file_path = ( - pathlib.Path(get_zephyr_base(options)) - / "boards" - / "arm" - / options["board"] - / (options["board"] + ".yaml") - ) +def _get_board_mem_size_bytes(zephyr_base: str, board: str): + board_file_path = pathlib.Path(zephyr_base) / "boards" / "arm" / board / (board + ".yaml") try: with open(board_file_path) as f: board_data = yaml.load(f, Loader=yaml.FullLoader) @@ -219,14 +213,14 @@ def _get_board_mem_size_bytes(options): DEFAULT_HEAP_SIZE_BYTES = 216 * 1024 -def _get_recommended_heap_size_bytes(options): - prop = BOARD_PROPERTIES[options["board"]] +def _get_recommended_heap_size_bytes(board: str): + prop = BOARD_PROPERTIES[board] if "recommended_heap_size_bytes" in prop: return prop["recommended_heap_size_bytes"] return DEFAULT_HEAP_SIZE_BYTES -def generic_find_serial_port(serial_number=None): +def generic_find_serial_port(serial_number: str = None): """Find a USB serial port based on its serial number or its VID:PID. This method finds a USB serial port device path based on the port's serial number (if given) or @@ -264,12 +258,11 @@ def generic_find_serial_port(serial_number=None): return serial_ports[0].device -def _get_openocd_device_args(options): - serial_number = options.get("serial_number") +def _get_openocd_device_args(serial_number: str = None): return ["--serial", generic_find_serial_port(serial_number)] -def _get_nrf_device_args(serial_number: str): +def _get_nrf_device_args(serial_number: str = None): nrfjprog_args = ["nrfjprog", "--ids"] nrfjprog_ids = subprocess.check_output(nrfjprog_args, encoding="utf-8") if not nrfjprog_ids.strip("\n"): @@ -369,26 +362,6 @@ def _get_nrf_device_args(serial_number: str): ] -def get_zephyr_base(options: dict) -> str: - """Returns Zephyr base path""" - zephyr_base = options.get("zephyr_base", ZEPHYR_BASE) - assert zephyr_base, "'zephyr_base' option not passed and not found by default!" 
- return zephyr_base - - -def get_cmsis_path(options: dict) -> pathlib.Path: - """Returns CMSIS dependency path""" - cmsis_path = options.get("cmsis_path", os.environ.get("CMSIS_PATH", None)) - if cmsis_path: - return pathlib.Path(cmsis_path) - return None - - -def get_west_cmd(options: dict) -> str: - """Returns west command""" - return options.get("west_cmd", WEST_CMD) - - class Handler(server.ProjectAPIHandler): def __init__(self): super(Handler, self).__init__() @@ -546,23 +519,22 @@ def _generate_cmake_args( def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): zephyr_board = options["board"] project_type = options["project_type"] + zephyr_base = options["zephyr_base"] + west_cmd = options["west_cmd"] - zephyr_base = get_zephyr_base(options) warning_as_error = options.get("warning_as_error") use_fvp = options.get("use_fvp") - west_cmd = get_west_cmd(options) verbose = options.get("verbose") - recommended_heap_size = _get_recommended_heap_size_bytes(options) + recommended_heap_size = _get_recommended_heap_size_bytes(zephyr_board) heap_size_bytes = options.get("heap_size_bytes") or recommended_heap_size - board_mem_size = _get_board_mem_size_bytes(options) + board_mem_size = _get_board_mem_size_bytes(zephyr_base, zephyr_board) compile_definitions = options.get("compile_definitions") config_main_stack_size = options.get("config_main_stack_size") extra_files_tar = options.get("extra_files_tar") - - cmsis_path = get_cmsis_path(options) + cmsis_path = options.get("cmsis_path") # Check Zephyr version version = self._get_platform_version(zephyr_base) @@ -679,7 +651,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec tf.extractall(project_dir) def build(self, options): - verbose = options.get("verbose", None) + verbose = options.get("verbose") if BUILD_DIR.exists(): shutil.rmtree(BUILD_DIR) @@ -737,7 +709,7 @@ def _has_fpu(cls, zephyr_board): def flash(self, options): serial_number = options.get("serial_number") - west_cmd_list = get_west_cmd(options).split(" ") + west_cmd_list = options["west_cmd"].split(" ") if _find_platform_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME): return # NOTE: qemu requires no flash step--it is launched from open_transport. 
@@ -766,11 +738,16 @@ def open_transport(self, options): zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME) emu_platform = _find_platform_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME) if self._is_fvp(zephyr_board, emu_platform == "armfvp"): - transport = ZephyrFvpTransport(options) + arm_fvp_path = options["arm_fvp_path"] + verbose = options.get("verbose") + transport = ZephyrFvpTransport(arm_fvp_path, verbose) elif self._is_qemu(zephyr_board): - transport = ZephyrQemuTransport(options) + gdbserver_port = options.get("gdbserver_port") + transport = ZephyrQemuTransport(gdbserver_port) else: - transport = ZephyrSerialTransport(options) + zephyr_base = options["zephyr_base"] + serial_number = options.get("serial_number") + transport = ZephyrSerialTransport(zephyr_base, serial_number) to_return = transport.open() self._transport = transport @@ -811,14 +788,12 @@ class ZephyrSerialTransport: NRF5340_DK_BOARD_VCOM_BY_PRODUCT_ID = {0x1055: "VCOM2", 0x1051: "VCOM1"} @classmethod - def _lookup_baud_rate(cls, options): + def _lookup_baud_rate(cls, zephyr_base: str): # TODO(mehrdadh): remove this hack once dtlib.py is a standalone project # https://github.com/zephyrproject-rtos/zephyr/blob/v2.7-branch/scripts/dts/README.txt sys.path.insert( 0, - os.path.join( - get_zephyr_base(options), "scripts", "dts", "python-devicetree", "src", "devicetree" - ), + os.path.join(zephyr_base, "scripts", "dts", "python-devicetree", "src", "devicetree"), ) try: import dtlib # pylint: disable=import-outside-toplevel @@ -838,9 +813,9 @@ def _lookup_baud_rate(cls, options): return uart_baud @classmethod - def _find_nrf_serial_port(cls, options): + def _find_nrf_serial_port(cls, serial_number: str = None): com_ports = subprocess.check_output( - ["nrfjprog", "--com"] + _get_device_args(options), encoding="utf-8" + ["nrfjprog", "--com"] + _get_device_args(serial_number), encoding="utf-8" ) ports_by_vcom = {} for line in com_ports.split("\n")[:-1]: @@ -860,43 +835,43 @@ def _find_nrf_serial_port(cls, options): return ports_by_vcom[vcom_port] @classmethod - def _find_openocd_serial_port(cls, options): - serial_number = options.get("serial_number") + def _find_openocd_serial_port(cls, serial_number: str = None): return generic_find_serial_port(serial_number) @classmethod - def _find_jlink_serial_port(cls, options): - return generic_find_serial_port() + def _find_jlink_serial_port(cls, serial_number: str = None): + return generic_find_serial_port(serial_number) @classmethod - def _find_stm32cubeprogrammer_serial_port(cls, options): - return generic_find_serial_port() + def _find_stm32cubeprogrammer_serial_port(cls, serial_number: str = None): + return generic_find_serial_port(serial_number) @classmethod - def _find_serial_port(cls, options): + def _find_serial_port(cls, serial_number: str = None): flash_runner = _get_flash_runner() if flash_runner == "nrfjprog": - return cls._find_nrf_serial_port(options) + return cls._find_nrf_serial_port(serial_number) if flash_runner == "openocd": - return cls._find_openocd_serial_port(options) + return cls._find_openocd_serial_port(serial_number) if flash_runner == "jlink": - return cls._find_jlink_serial_port(options) + return cls._find_jlink_serial_port(serial_number) if flash_runner == "stm32cubeprogrammer": - return cls._find_stm32cubeprogrammer_serial_port(options) + return cls._find_stm32cubeprogrammer_serial_port(serial_number) raise RuntimeError(f"Don't know how to deduce serial port for flash runner {flash_runner}") - def __init__(self, options): 
- self._options = options + def __init__(self, zephyr_base: str, serial_number: str = None): + self._zephyr_base = zephyr_base + self._serial_number = serial_number self._port = None def open(self): - port_path = self._find_serial_port(self._options) - self._port = serial.Serial(port_path, baudrate=self._lookup_baud_rate(self._options)) + port_path = self._find_serial_port(self._serial_number) + self._port = serial.Serial(port_path, baudrate=self._lookup_baud_rate(self._zephyr_base)) return server.TransportTimeouts( session_start_retry_timeout_sec=2.0, session_start_timeout_sec=5.0, @@ -933,8 +908,8 @@ class ZephyrQemuMakeResult(enum.Enum): class ZephyrQemuTransport: """The user-facing Zephyr QEMU transport class.""" - def __init__(self, options): - self.options = options + def __init__(self, gdbserver_port: int = None): + self._gdbserver_port = gdbserver_port self.proc = None self.pipe_dir = None self.read_fd = None @@ -954,9 +929,9 @@ def open(self): os.mkfifo(self.read_pipe) env = None - if self.options.get("gdbserver_port"): + if self._gdbserver_port: env = os.environ.copy() - env["TVM_QEMU_GDBSERVER_PORT"] = self.options["gdbserver_port"] + env["TVM_QEMU_GDBSERVER_PORT"] = self._gdbserver_port self.proc = subprocess.Popen( ["ninja", "run"], @@ -1102,20 +1077,18 @@ def write(self, data): class ZephyrFvpTransport: """A transport class that communicates with the ARM FVP via Iris server.""" - def __init__(self, options): - self.options = options + def __init__(self, arm_fvp_path: str, verbose: bool = False): + self._arm_fvp_path = arm_fvp_path + self._verbose = verbose self.proc = None self._queue = queue.Queue() self._import_iris() def _import_iris(self): - assert "arm_fvp_path" in self.options, "arm_fvp_path is not defined." + assert self._arm_fvp_path, "arm_fvp_path is not defined." # Location as seen in the FVP_Corstone_SSE-300_11.15_24 tar. 
iris_lib_path = ( - pathlib.Path(self.options["arm_fvp_path"]).parent.parent.parent - / "Iris" - / "Python" - / "iris" + pathlib.Path(self._arm_fvp_path).parent.parent.parent / "Iris" / "Python" / "iris" ) sys.path.insert(0, str(iris_lib_path.parent)) @@ -1142,7 +1115,7 @@ def _convertStringToU64Array(strValue): def open(self): args = ["ninja"] - if self.options.get("verbose"): + if self._verbose: args.append("-v") args.append("run") env = dict(os.environ) diff --git a/python/tvm/micro/project.py b/python/tvm/micro/project.py index 9dd57123676b..32d2cbf4db71 100644 --- a/python/tvm/micro/project.py +++ b/python/tvm/micro/project.py @@ -28,6 +28,17 @@ from .transport import Transport, TransportTimeouts +def add_unspecified_options(options: dict, server_project_options: list) -> dict: + """Adds default value of project template options that are not specified by user.""" + if not options: + options = dict() + for option in server_project_options: + name = option["name"] + if name not in options.keys(): + options[name] = option["default"] + return options + + class ProjectTransport(Transport): """A Transport implementation that uses the Project API client.""" @@ -69,10 +80,10 @@ def from_directory(cls, project_dir: Union[pathlib.Path, str], options: dict): def __init__(self, api_client, options): self._api_client = api_client - self._options = options self._info = self._api_client.server_info_query(__version__) if self._info["is_template"]: raise TemplateProjectError() + self._options = add_unspecified_options(options, self._info["project_options"]) def build(self): self._api_client.build(self._options) @@ -124,6 +135,8 @@ def _check_project_options(self, options: dict): def generate_project_from_mlf(self, model_library_format_path, project_dir, options: dict): """Generate a project from MLF file.""" self._check_project_options(options) + options = add_unspecified_options(options, self._info["project_options"]) + self._api_client.generate_project( model_library_format_path=str(model_library_format_path), standalone_crt_dir=get_standalone_crt_dir(), diff --git a/python/tvm/micro/project_api/server.py b/python/tvm/micro/project_api/server.py index 5aed3a896241..2d5db09f4bbe 100644 --- a/python/tvm/micro/project_api/server.py +++ b/python/tvm/micro/project_api/server.py @@ -804,7 +804,7 @@ def default_project_options(**kw) -> typing.List[ProjectOption]: "cmsis_path", optional=["generate_project"], type="str", - default=None, + default=os.environ.get("CMSIS_PATH", None), help="Path to the CMSIS directory.", ), ProjectOption( diff --git a/tests/micro/arduino/conftest.py b/tests/micro/arduino/conftest.py index a5ce8127c0bb..ffa1376efe12 100644 --- a/tests/micro/arduino/conftest.py +++ b/tests/micro/arduino/conftest.py @@ -22,20 +22,7 @@ import pytest -def pytest_addoption(parser): - parser.addoption( - "--arduino-cli-cmd", - default="arduino-cli", - help="Path to `arduino-cli` command for flashing device.", - ) - - def pytest_configure(config): config.addinivalue_line( "markers", "requires_hardware: mark test to run only when an Arduino board is connected" ) - - -@pytest.fixture(scope="session") -def arduino_cli_cmd(request): - return request.config.getoption("--arduino-cli-cmd") diff --git a/tests/micro/arduino/test_arduino_error_detection.py b/tests/micro/arduino/test_arduino_error_detection.py index f1278094b484..75b97fa86ca3 100644 --- a/tests/micro/arduino/test_arduino_error_detection.py +++ b/tests/micro/arduino/test_arduino_error_detection.py @@ -24,10 +24,8 @@ @pytest.fixture -def 
project(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): - return test_utils.make_kws_project( - board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number - ) +def project(board, microtvm_debug, workspace_dir, serial_number): + return test_utils.make_kws_project(board, microtvm_debug, workspace_dir, serial_number) def test_blank_project_compiles(workspace_dir, project): diff --git a/tests/micro/arduino/test_arduino_rpc_server.py b/tests/micro/arduino/test_arduino_rpc_server.py index ae22fb9499b8..38f34de82beb 100644 --- a/tests/micro/arduino/test_arduino_rpc_server.py +++ b/tests/micro/arduino/test_arduino_rpc_server.py @@ -41,7 +41,6 @@ def _make_session( model, arduino_board, - arduino_cli_cmd, workspace_dir, mod, build_config, @@ -53,7 +52,6 @@ def _make_session( workspace_dir / "project", { "board": arduino_board, - "arduino_cli_cmd": arduino_cli_cmd, "project_type": "host_driven", "verbose": bool(build_config.get("debug")), "serial_number": serial_number, @@ -67,7 +65,6 @@ def _make_session( def _make_sess_from_op( model, arduino_board, - arduino_cli_cmd, workspace_dir, op_name, sched, @@ -80,14 +77,10 @@ def _make_sess_from_op( with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.build(sched, arg_bufs, target=target, runtime=runtime, name=op_name) - return _make_session( - model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number - ) + return _make_session(model, arduino_board, workspace_dir, mod, build_config, serial_number) -def _make_add_sess( - model, arduino_board, arduino_cli_cmd, workspace_dir, build_config, serial_number: str = None -): +def _make_add_sess(model, arduino_board, workspace_dir, build_config, serial_number: str = None): A = tvm.te.placeholder((2,), dtype="int8") B = tvm.te.placeholder((1,), dtype="int8") C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C") @@ -95,7 +88,6 @@ def _make_add_sess( return _make_sess_from_op( model, arduino_board, - arduino_cli_cmd, workspace_dir, "add", sched, @@ -108,7 +100,7 @@ def _make_add_sess( # The same test code can be executed on both the QEMU simulation and on real hardware. 
@tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_compile_runtime(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): +def test_compile_runtime(board, microtvm_debug, workspace_dir, serial_number): """Test compiling the on-device runtime.""" model = test_utils.ARDUINO_BOARDS[board] @@ -127,15 +119,13 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.numpy() == np.array([6, 7])).all() - with _make_add_sess( - model, board, arduino_cli_cmd, workspace_dir, build_config, serial_number - ) as sess: + with _make_add_sess(model, board, workspace_dir, build_config, serial_number) as sess: test_basic_add(sess) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_platform_timer(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): +def test_platform_timer(board, microtvm_debug, workspace_dir, serial_number): """Test compiling the on-device runtime.""" model = test_utils.ARDUINO_BOARDS[board] @@ -159,15 +149,13 @@ def test_basic_add(sess): assert result.mean > 0 assert len(result.results) == 3 - with _make_add_sess( - model, board, arduino_cli_cmd, workspace_dir, build_config, serial_number - ) as sess: + with _make_add_sess(model, board, workspace_dir, build_config, serial_number) as sess: test_basic_add(sess) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): +def test_relay(board, microtvm_debug, workspace_dir, serial_number): """Testing a simple relay graph""" model = test_utils.ARDUINO_BOARDS[board] build_config = {"debug": microtvm_debug} @@ -186,9 +174,7 @@ def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_num with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(func, target=target, runtime=runtime) - with _make_session( - model, board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number - ) as session: + with _make_session(model, board, workspace_dir, mod, build_config, serial_number) as session: graph_mod = tvm.micro.create_local_graph_executor( mod.get_graph_json(), session.get_system_lib(), session.device ) @@ -202,7 +188,7 @@ def test_relay(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_num @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_onnx(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): +def test_onnx(board, microtvm_debug, workspace_dir, serial_number): """Testing a simple ONNX model.""" model = test_utils.ARDUINO_BOARDS[board] build_config = {"debug": microtvm_debug} @@ -232,7 +218,7 @@ def test_onnx(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_numb graph = lowered.get_graph_json() with _make_session( - model, board, arduino_cli_cmd, workspace_dir, lowered, build_config, serial_number + model, board, workspace_dir, lowered, build_config, serial_number ) as session: graph_mod = tvm.micro.create_local_graph_executor( graph, session.get_system_lib(), session.device @@ -256,7 +242,6 @@ def check_result( relay_mod, model, arduino_board, - arduino_cli_cmd, workspace_dir, map_inputs, out_shape, @@ -272,7 +257,7 @@ def check_result( mod = tvm.relay.build(relay_mod, target=target, runtime=runtime) with _make_session( - model, arduino_board, arduino_cli_cmd, workspace_dir, mod, build_config, serial_number + model, arduino_board, workspace_dir, mod, build_config, serial_number ) as session: rt_mod = 
tvm.micro.create_local_graph_executor( mod.get_graph_json(), session.get_system_lib(), session.device @@ -294,7 +279,7 @@ def check_result( @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number): +def test_byoc_microtvm(board, microtvm_debug, workspace_dir, serial_number): """This is a simple test case to check BYOC capabilities of microTVM""" model = test_utils.ARDUINO_BOARDS[board] build_config = {"debug": microtvm_debug} @@ -352,7 +337,6 @@ def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir, se model=model, build_config=build_config, arduino_board=board, - arduino_cli_cmd=arduino_cli_cmd, workspace_dir=workspace_dir, serial_number=serial_number, ) @@ -361,7 +345,6 @@ def test_byoc_microtvm(board, arduino_cli_cmd, microtvm_debug, workspace_dir, se def _make_add_sess_with_shape( model, arduino_board, - arduino_cli_cmd, workspace_dir, shape, build_config, @@ -373,7 +356,6 @@ def _make_add_sess_with_shape( return _make_sess_from_op( model, arduino_board, - arduino_cli_cmd, workspace_dir, "add", sched, @@ -393,9 +375,7 @@ def _make_add_sess_with_shape( ) @tvm.testing.requires_micro @pytest.mark.requires_hardware -def test_rpc_large_array( - board, arduino_cli_cmd, microtvm_debug, workspace_dir, shape, serial_number -): +def test_rpc_large_array(board, microtvm_debug, workspace_dir, shape, serial_number): """Test large RPC array transfer.""" model = test_utils.ARDUINO_BOARDS[board] build_config = {"debug": microtvm_debug} @@ -410,7 +390,7 @@ def test_tensors(sess): assert (C_data.numpy() == np.zeros(shape)).all() with _make_add_sess_with_shape( - model, board, arduino_cli_cmd, workspace_dir, shape, build_config, serial_number + model, board, workspace_dir, shape, build_config, serial_number ) as sess: test_tensors(sess) diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py index 51898424aee5..42874ad6c349 100644 --- a/tests/micro/arduino/test_arduino_workflow.py +++ b/tests/micro/arduino/test_arduino_workflow.py @@ -54,12 +54,10 @@ def project_dir(workflow_workspace_dir): # We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced # too soon. We can't use the board fixture either for the reason mentioned above. 
@pytest.fixture(scope="module") -def project(request, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir): +def project(request, microtvm_debug, workflow_workspace_dir): board = request.config.getoption("--board") serial_number = request.config.getoption("--serial-number") - return test_utils.make_kws_project( - board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir, serial_number - ) + return test_utils.make_kws_project(board, microtvm_debug, workflow_workspace_dir, serial_number) def _get_directory_elements(directory): diff --git a/tests/micro/arduino/test_utils.py b/tests/micro/arduino/test_utils.py index d81edc845b98..1456e1f7591e 100644 --- a/tests/micro/arduino/test_utils.py +++ b/tests/micro/arduino/test_utils.py @@ -61,7 +61,7 @@ def make_workspace_dir(test_name, board): return t -def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir, serial_number: str): +def make_kws_project(board, microtvm_debug, workspace_dir, serial_number: str): this_dir = pathlib.Path(__file__).parent model = ARDUINO_BOARDS[board] build_config = {"debug": microtvm_debug} @@ -85,7 +85,6 @@ def make_kws_project(board, arduino_cli_cmd, microtvm_debug, workspace_dir, seri workspace_dir / "project", { "board": board, - "arduino_cli_cmd": arduino_cli_cmd, "project_type": "example_project", "verbose": bool(build_config.get("debug")), "serial_number": serial_number, diff --git a/tests/micro/project_api/test_arduino_microtvm_api_server.py b/tests/micro/project_api/test_arduino_microtvm_api_server.py index ad9bd4a56a2d..39e5780af6dc 100644 --- a/tests/micro/project_api/test_arduino_microtvm_api_server.py +++ b/tests/micro/project_api/test_arduino_microtvm_api_server.py @@ -136,14 +136,15 @@ def test_auto_detect_port(self, mock_run): arduino_cli_cmd = self.DEFAULT_OPTIONS.get("arduino_cli_cmd") warning_as_error = self.DEFAULT_OPTIONS.get("warning_as_error") - cli_command = handler._get_arduino_cli_cmd(arduino_cli_cmd) - handler._check_platform_version(cli_command=cli_command, warning_as_error=warning_as_error) + handler._check_platform_version( + cli_command=arduino_cli_cmd, warning_as_error=warning_as_error + ) assert handler._version == version.parse("0.21.1") handler = microtvm_api_server.Handler() mock_run.return_value.stdout = bytes(self.BAD_CLI_VERSION, "utf-8") with pytest.raises(server.ServerError) as error: - handler._check_platform_version(cli_command=cli_command, warning_as_error=True) + handler._check_platform_version(cli_command=arduino_cli_cmd, warning_as_error=True) mock_run.reset_mock() @mock.patch("subprocess.run") diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index a8fb26133970..6b49c043cc3d 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -605,7 +605,6 @@ def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_deb "project_type": "host_driven", "verbose": bool(build_config.get("debug")), "board": board, - "cmsis_path": os.getenv("CMSIS_PATH"), "use_fvp": bool(use_fvp), } From 77a39dafe38aff10235ce232c63b59c4f86bae27 Mon Sep 17 00:00:00 2001 From: zhaojinxi Date: Thu, 5 Jan 2023 04:26:31 +0800 Subject: [PATCH 113/286] Add header files for GraphExecutorDebug (#13694) --- .../debug/graph_executor_debug.cc | 527 ++++++++---------- .../debug/graph_executor_debug.h | 147 +++++ 2 files changed, 368 insertions(+), 306 deletions(-) create mode 100644 src/runtime/graph_executor/debug/graph_executor_debug.h diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc 
b/src/runtime/graph_executor/debug/graph_executor_debug.cc index e0b970a3ad88..5e6182ec279f 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -20,6 +20,8 @@ /*! * \file graph_executor_debug.cc */ +#include "./graph_executor_debug.h" + #include #include #include @@ -32,339 +34,158 @@ #include #include "../../rpc/rpc_session.h" -#include "../graph_executor.h" namespace tvm { namespace runtime { - -/*! - * \brief Graph executor with debug . - * - * This is the extension of GraphExecutor class used for debugging - * TVM runtime PackedFunc API. - */ -class GraphExecutorDebug : public GraphExecutor { - public: - /*! - * \brief Run each operation in the graph and get the time per op for all ops. - * \param number The number of times to run this function for taking average. - * \param repeat The number of times to repeat the measurement. - * In total, the function will be invoked (1 + number x repeat) times, - * where the first one is warmed up and will be discarded in case - * there is lazy initialization. - * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. - * By default, one `repeat` contains `number` runs. If this parameter is set, - * the parameters `number` will be dynamically adjusted to meet the - * minimum duration requirement of one `repeat`. - * \param limit_zero_time_iterations The maximum number of repeats when - * measured time is equal to 0. It helps to avoid hanging during - * measurements. - * \param cooldown_interval_ms The cooldown interval in milliseconds between the number of repeats - * defined by `repeats_to_cooldown`. - * \param repeats_to_cooldown The number of repeats before the - * cooldown is activated. - * \return Returns a string with an encoded byte array. Where the first 8 bytes are int64_t - * representing the number of layers. Next the encoded real numbers are float32_t in the number of - * repeat multiplied by the number of layers. - */ - std::string RunIndividual(int number, int repeat, int min_repeat_ms, - int limit_zero_time_iterations, int cooldown_interval_ms, - int repeats_to_cooldown) { - // warmup run - GraphExecutor::Run(); - std::string tkey = module_->type_key(); - std::vector> time_sec_per_op(op_execs_.size()); - if (tkey == "rpc") { - // RPC modules rely on remote timing which implements the logic from the else branch. - for (size_t index = 0; index < op_execs_.size(); ++index) { - time_sec_per_op[index] = - RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, - cooldown_interval_ms, repeats_to_cooldown); +std::string GraphExecutorDebug::RunIndividual(int number, int repeat, int min_repeat_ms, + int limit_zero_time_iterations, + int cooldown_interval_ms, int repeats_to_cooldown) { + // warmup run + GraphExecutor::Run(); + std::string tkey = module_->type_key(); + std::vector> time_sec_per_op(op_execs_.size()); + if (tkey == "rpc") { + // RPC modules rely on remote timing which implements the logic from the else branch. 
+ for (size_t index = 0; index < op_execs_.size(); ++index) { + time_sec_per_op[index] = + RunOpRPC(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); + } + } else { + int op = 0; + for (size_t index = 0; index < op_execs_.size(); ++index) { + std::string result_str = + RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, + cooldown_interval_ms, repeats_to_cooldown); + const double* blob_ptr = reinterpret_cast(result_str.data()); + for (int i = 0; i < repeat; ++i, ++blob_ptr) { + time_sec_per_op[index].push_back(*blob_ptr); } - } else { - int op = 0; - for (size_t index = 0; index < op_execs_.size(); ++index) { - std::string result_str = - RunIndividualNode(index, number, repeat, min_repeat_ms, limit_zero_time_iterations, - cooldown_interval_ms, repeats_to_cooldown); - const double* blob_ptr = reinterpret_cast(result_str.data()); - for (int i = 0; i < repeat; ++i, ++blob_ptr) { - time_sec_per_op[index].push_back(*blob_ptr); - } - if (op_execs_[index]) { - LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":"; - for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) { - const auto& data = time_sec_per_op[index][cur_repeat]; - LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter"; - } - ++op; + if (op_execs_[index]) { + LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":"; + for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) { + const auto& data = time_sec_per_op[index][cur_repeat]; + LOG(INFO) << "Iteration: " << cur_repeat << ": " << (data * 1e6) << " us/iter"; } + ++op; } } - - std::ostringstream os; - int64_t size = time_sec_per_op.size(); - os.write(reinterpret_cast(&size), sizeof(int64_t)); - for (size_t index = 0; index < time_sec_per_op.size(); ++index) { - for (auto& repeat_data : time_sec_per_op[index]) { - // To have good behavior when calculating total time, etc. - double data = std::isnan(repeat_data) ? 0 : repeat_data; - os.write(reinterpret_cast(&data), sizeof(double)); - } - } - return os.str(); } - std::string RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms, - int limit_zero_time_iterations, int cooldown_interval_ms, - int repeats_to_cooldown) { - std::string tkey = module_->type_key(); - - if (tkey == "rpc") { - LOG(FATAL) << "RPC measurements should not use RunIndividualNode!"; + std::ostringstream os; + int64_t size = time_sec_per_op.size(); + os.write(reinterpret_cast(&size), sizeof(int64_t)); + for (size_t index = 0; index < time_sec_per_op.size(); ++index) { + for (auto& repeat_data : time_sec_per_op[index]) { + // To have good behavior when calculating total time, etc. + double data = std::isnan(repeat_data) ? 0 : repeat_data; + os.write(reinterpret_cast(&data), sizeof(double)); } - - if (!op_execs_[node_index]) { - // don't return anything... 
- std::ostringstream os; - double zero = 0; - for (int i = 0; i < repeat; ++i) { - os.write(reinterpret_cast(&zero), sizeof(double)); - } - return os.str(); - } - - // assume host runs things which is first device - Device& d = devices_[0]; - PackedFunc time_evaluator = profiling::WrapTimeEvaluator( - TypedPackedFunc([this, node_index]() { this->RunOpHost(node_index); }), d, number, - repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown); - return time_evaluator(); } + return os.str(); +} - std::vector RunOpRPC(int index, int number, int repeat, int min_repeat_ms, - int limit_zero_time_iterations, int cooldown_interval_ms, - int repeats_to_cooldown) { - std::vector results(repeat, 0); - // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes - // which represent inputs/parameters to the graph. Other types may be supported in the - // future, but consideration would be needed as to how to do that over RPC before we support - // it here. - if (nodes_[index].op_type != "tvm_op") { - CHECK_EQ(nodes_[index].op_type, "null") - << "Don't know how to run op type " << nodes_[index].op_type - << " remotely over RPC right now"; - - // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or - // "null" and "null" is a placeholder node for a parameter or input. - return results; - } +std::string GraphExecutorDebug::RunIndividualNode(int node_index, int number, int repeat, + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, + int repeats_to_cooldown) { + std::string tkey = module_->type_key(); - const Device& dev = data_entry_[entry_id(index, 0)]->device; - TVMOpParam param = nodes_[index].param; - std::string name = param.func_name; - uint32_t num_inputs = param.num_inputs; - uint32_t num_outputs = param.num_outputs; - - PackedFunc time_eval = - runtime::Registry::Get("runtime.RPCTimeEvaluator") - -> - operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, - repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, - repeats_to_cooldown, ""); - - int num_flat_args = num_inputs + num_outputs; - auto values = std::make_unique(num_flat_args); - auto type_codes = std::make_unique(num_flat_args); - TVMArgsSetter setter(values.get(), type_codes.get()); - int offs = 0; - const auto& inode = nodes_[index]; - for (const auto& e : inode.inputs) { - uint32_t eid = this->entry_id(e); - DLTensor* arg = const_cast(data_entry_[eid].operator->()); - setter(offs, arg); - offs++; - } - for (uint32_t i = 0; i < num_outputs; ++i) { - uint32_t eid = this->entry_id(index, i); - DLTensor* arg = const_cast(data_entry_[eid].operator->()); - setter(offs, arg); - offs++; - } - TVMRetValue rv; - time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv); - std::string results_str = rv.operator std::string(); - const double* blob_ptr = reinterpret_cast(results_str.data()); - for (int i = 0; i < repeat; ++i, ++blob_ptr) { - results[i] = *blob_ptr; - } + if (tkey == "rpc") { + LOG(FATAL) << "RPC measurements should not use RunIndividualNode!"; + } + if (!op_execs_[node_index]) { + // don't return anything... 
std::ostringstream os; - for (auto& repeat_data : results) { - os << std::to_string(repeat_data) << ", "; + double zero = 0; + for (int i = 0; i < repeat; ++i) { + os.write(reinterpret_cast(&zero), sizeof(double)); } - LOG(INFO) << "Got op timing: " << os.str(); - return results; + return os.str(); } - Timer RunOpHost(int index) { - const Device& dev = data_entry_[entry_id(index, 0)]->device; - Timer t = Timer::Start(dev); - op_execs_[index](); - t->Stop(); - return t; - } + // assume host runs things which is first device + Device& d = devices_[0]; + PackedFunc time_evaluator = profiling::WrapTimeEvaluator( + TypedPackedFunc([this, node_index]() { this->RunOpHost(node_index); }), d, number, + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown); + return time_evaluator(); +} - /*! - * \brief GetFunction Get the function based on input. - * \param name The function which needs to be invoked. - * \param sptr_to_self Packed function pointer. - */ - PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); - - /*! - * \brief Get the node index given the name of node. - * \param name The name of the node. - * \return The index of node. - */ - int GetNodeIndex(const std::string& name) const { - for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) { - if (GetNodeName(nid) == name) { - return static_cast(nid); - } - } - LOG(FATAL) << "cannot find " << name << " among nodex"; +std::vector GraphExecutorDebug::RunOpRPC(int index, int number, int repeat, + int min_repeat_ms, int limit_zero_time_iterations, + int cooldown_interval_ms, + int repeats_to_cooldown) { + std::vector results(repeat, 0); + // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes + // which represent inputs/parameters to the graph. Other types may be supported in the + // future, but consideration would be needed as to how to do that over RPC before we support + // it here. + if (nodes_[index].op_type != "tvm_op") { + CHECK_EQ(nodes_[index].op_type, "null") + << "Don't know how to run op type " << nodes_[index].op_type + << " remotely over RPC right now"; + + // NOTE: GraphExecutorDebug expects graph nodes to have an "op" attribute of "tvm_op" or + // "null" and "null" is a placeholder node for a parameter or input. + return results; } - /*! - * \brief Execute index-th node in the network. - * - * This method will do a partial run of the graph - * up to index-th node. - * - * \param node: The index of the node. 
- */ - void ExecuteNode(int node) { - ICHECK_LT(static_cast(node), op_execs_.size()); - - int start_ind; - int end_ind; - if (node < last_executed_node_) { - start_ind = 0; - end_ind = node; - } else if (node > last_executed_node_) { - start_ind = last_executed_node_ + 1; - end_ind = node; - } else { - return; - } - - for (int i = start_ind; i <= end_ind; i++) { - if (op_execs_[i]) op_execs_[i](); - } - last_executed_node_ = end_ind; + const Device& dev = data_entry_[entry_id(index, 0)]->device; + TVMOpParam param = nodes_[index].param; + std::string name = param.func_name; + uint32_t num_inputs = param.num_inputs; + uint32_t num_outputs = param.num_outputs; + + PackedFunc time_eval = + runtime::Registry::Get("runtime.RPCTimeEvaluator") + -> + operator()(module_, name, static_cast(dev.device_type), dev.device_id, number, + repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms, + repeats_to_cooldown, ""); + + int num_flat_args = num_inputs + num_outputs; + auto values = std::make_unique(num_flat_args); + auto type_codes = std::make_unique(num_flat_args); + TVMArgsSetter setter(values.get(), type_codes.get()); + int offs = 0; + const auto& inode = nodes_[index]; + for (const auto& e : inode.inputs) { + uint32_t eid = this->entry_id(e); + DLTensor* arg = const_cast(data_entry_[eid].operator->()); + setter(offs, arg); + offs++; } - - /*! - * \brief Returns index-th output of node. - * - * This method will return index-th out_ind output - * of index-th node in the network. - * - * \param node: The index of the node. - * \param out_ind: The index of the output. - * \return Output array. - */ - NDArray GetNodeOutput(int node, int out_ind) { - ICHECK_EQ(node, last_executed_node_); - ICHECK_LT(entry_id(node, out_ind), data_entry_.size()); - return data_entry_[entry_id(node, out_ind)].CopyTo({kDLCPU, 0}); + for (uint32_t i = 0; i < num_outputs; ++i) { + uint32_t eid = this->entry_id(index, i); + DLTensor* arg = const_cast(data_entry_[eid].operator->()); + setter(offs, arg); + offs++; } - - /*! - * \brief Copy index-th node to data_out. - * - * This method will do a partial run of the graph - * from begining upto the index-th node and return output of index-th node. - * This is costly operation and suggest to use only for debug porpose. - * - * \param index: The index of the node. - * \param data_out the node data. - */ - void DebugGetNodeOutput(int index, DLTensor* data_out) { - ICHECK_LT(static_cast(index), op_execs_.size()); - uint32_t eid = index; - - for (size_t i = 0; i < op_execs_.size(); ++i) { - if (op_execs_[i]) op_execs_[i](); - if (static_cast(i) == index) break; - } - - data_entry_[eid].CopyTo(data_out); + TVMRetValue rv; + time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv); + std::string results_str = rv.operator std::string(); + const double* blob_ptr = reinterpret_cast(results_str.data()); + for (int i = 0; i < repeat; ++i, ++blob_ptr) { + results[i] = *blob_ptr; } - /*! - * \brief Profile execution time of the module. - * - * We run the entire module while recording overall and per-op timing - * information. The module may be run multiple times to ensure everything is - * warmed up. This function is a more correct reflection of actual runtime of - * the module compared to GraphRuntimeDebug::RunIndividual as it runs the - * entire graph in order. - * - * \param collectors Optional user defined `MetricCollector`s to use with this profiling run. - * - * \returns A table of per-op runtimes and total times. 
- */ - profiling::Report Profile(Array collectors) { - std::vector cs(collectors.begin(), collectors.end()); - profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}}); - - // warm up. 1 iteration does not seem enough. - for (int i = 0; i < 3; i++) { - GraphExecutor::Run(); - } - - prof.Start(); - for (size_t i = 0; i < op_execs_.size(); ++i) { - if (op_execs_[i]) { - // get argument shapes - std::vector shapes; - for (const auto& e : nodes_[i].inputs) { - uint32_t eid = entry_id(e); - shapes.push_back(data_entry_[eid]); - } - for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) { - uint32_t eid = entry_id(i, j); - shapes.push_back(data_entry_[eid]); - } - - uint32_t eid = entry_id(i, 0); - const Device& dev = data_entry_[eid]->device; - - std::unordered_map metrics; - for (auto p : nodes_[i].param.attrs) { - if (std::string(p.first).find("layout") != std::string::npos) { - metrics[p.first] = p.second; - } - } - if (nodes_[i].param.attrs.find("hash") != nodes_[i].param.attrs.end()) { - metrics["Hash"] = Downcast(nodes_[i].param.attrs.at("hash")); - } - metrics["Argument Shapes"] = profiling::ShapeString(shapes); - prof.StartCall(nodes_[i].param.func_name, dev, metrics); - op_execs_[i](); - prof.StopCall(); - } - } - prof.Stop(); - return prof.Report(); + std::ostringstream os; + for (auto& repeat_data : results) { + os << std::to_string(repeat_data) << ", "; } + LOG(INFO) << "Got op timing: " << os.str(); + return results; +} - private: - int last_executed_node_ = -1; -}; +Timer GraphExecutorDebug::RunOpHost(int index) { + const Device& dev = data_entry_[entry_id(index, 0)]->device; + Timer t = Timer::Start(dev); + op_execs_[index](); + t->Stop(); + return t; +} /*! * \brief GetFunction Get the function based on input. @@ -461,6 +282,100 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name, } } +int GraphExecutorDebug::GetNodeIndex(const std::string& name) const { + for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) { + if (GetNodeName(nid) == name) { + return static_cast(nid); + } + } + LOG(FATAL) << "cannot find " << name << " among nodex"; + return -1; +} + +void GraphExecutorDebug::ExecuteNode(int node) { + ICHECK_LT(static_cast(node), op_execs_.size()); + + int start_ind; + int end_ind; + if (node < last_executed_node_) { + start_ind = 0; + end_ind = node; + } else if (node > last_executed_node_) { + start_ind = last_executed_node_ + 1; + end_ind = node; + } else { + return; + } + + for (int i = start_ind; i <= end_ind; i++) { + if (op_execs_[i]) op_execs_[i](); + } + last_executed_node_ = end_ind; +} + +void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) { + ICHECK_LT(static_cast(index), op_execs_.size()); + uint32_t eid = index; + + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (op_execs_[i]) op_execs_[i](); + if (static_cast(i) == index) break; + } + + data_entry_[eid].CopyTo(data_out); +} + +NDArray GraphExecutorDebug::GetNodeOutput(int node, int out_ind) { + ICHECK_EQ(node, last_executed_node_); + ICHECK_LT(entry_id(node, out_ind), data_entry_.size()); + return data_entry_[entry_id(node, out_ind)].CopyTo({kDLCPU, 0}); +} + +profiling::Report GraphExecutorDebug::Profile(Array collectors) { + std::vector cs(collectors.begin(), collectors.end()); + profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}}); + + // warm up. 1 iteration does not seem enough. 
+ for (int i = 0; i < 3; i++) { + GraphExecutor::Run(); + } + + prof.Start(); + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (op_execs_[i]) { + // get argument shapes + std::vector shapes; + for (const auto& e : nodes_[i].inputs) { + uint32_t eid = entry_id(e); + shapes.push_back(data_entry_[eid]); + } + for (uint32_t j = 0; j < nodes_[i].param.num_outputs; ++j) { + uint32_t eid = entry_id(i, j); + shapes.push_back(data_entry_[eid]); + } + + uint32_t eid = entry_id(i, 0); + const Device& dev = data_entry_[eid]->device; + + std::unordered_map metrics; + for (auto p : nodes_[i].param.attrs) { + if (std::string(p.first).find("layout") != std::string::npos) { + metrics[p.first] = p.second; + } + } + if (nodes_[i].param.attrs.find("hash") != nodes_[i].param.attrs.end()) { + metrics["Hash"] = Downcast(nodes_[i].param.attrs.at("hash")); + } + metrics["Argument Shapes"] = profiling::ShapeString(shapes); + prof.StartCall(nodes_[i].param.func_name, dev, metrics); + op_execs_[i](); + prof.StopCall(); + } + } + prof.Stop(); + return prof.Report(); +} + /*! * \brief GraphExecutorDebugCreate Get the function based on input. * \param sym_json The graph symbol in json format. diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.h b/src/runtime/graph_executor/debug/graph_executor_debug.h new file mode 100644 index 000000000000..a53245c2e2e7 --- /dev/null +++ b/src/runtime/graph_executor/debug/graph_executor_debug.h @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_RUNTIME_GRAPH_EXECUTOR_DEBUG_GRAPH_EXECUTOR_DEBUG_H_ +#define TVM_RUNTIME_GRAPH_EXECUTOR_DEBUG_GRAPH_EXECUTOR_DEBUG_H_ + +#include + +#include +#include + +#include "../graph_executor.h" + +namespace tvm { +namespace runtime { + +/*! + * \brief Graph executor with debug . + * + * This is the extension of GraphExecutor class used for debugging + * TVM runtime PackedFunc API. + */ +class GraphExecutorDebug : public GraphExecutor { + public: + /*! + * \brief Run each operation in the graph and get the time per op for all ops. + * \param number The number of times to run this function for taking average. + * \param repeat The number of times to repeat the measurement. + * In total, the function will be invoked (1 + number x repeat) times, + * where the first one is warmed up and will be discarded in case + * there is lazy initialization. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + * By default, one `repeat` contains `number` runs. If this parameter is set, + * the parameters `number` will be dynamically adjusted to meet the + * minimum duration requirement of one `repeat`. + * \param limit_zero_time_iterations The maximum number of repeats when + * measured time is equal to 0. 
From e1ff1c411f3420b045cf982a51734a517245b6e4 Mon Sep 17 00:00:00 2001
From: padreofthegame <97688606+padreofthegame@users.noreply.github.com>
Date: Thu, 5 Jan 2023 00:40:38 +0100
Subject: [PATCH 114/286] [Relay][Docs] Fixed examples in relay/transform.py
 documentation (#13682)

[Relay][Docs] Fixed examples in relay/transform.py documentation.
Also fixed some typos in the file. --- python/tvm/relay/op/transform.py | 333 +++++++++++++++------------ tests/python/relay/test_op_level3.py | 4 +- 2 files changed, 187 insertions(+), 150 deletions(-) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 024da84cbfd8..c7234f340395 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -91,8 +91,8 @@ def cast(data, dtype): data : relay.Expr The input data to the operator. - dtype: str - The target data type + dtype : str + The target data type. Returns ------- @@ -112,7 +112,7 @@ def cast_like(data, dtype_like): data : relay.Expr The input data to the operator. - dtype_like: relay.Expr + dtype_like : relay.Expr The tensor to cast to. Returns @@ -133,8 +133,8 @@ def reinterpret(data, dtype): data : relay.Expr The input data to the operator. - dtype: str - The target data type + dtype : str + The target data type. Returns ------- @@ -160,7 +160,7 @@ def expand_dims(data, axis, num_newaxis=1): If `axis < 0`, it is the first axis inserted; If `axis >= 0`, it is the last axis inserted in Python's negative indexing. - num_newaxis : int + num_newaxis : int, optional Number of axes to be inserted. Should be >= 0. Returns @@ -340,7 +340,7 @@ def argwhere(condition): Returns ------- - out : relay.Expr + result : relay.Expr Tensor with the indices of elements that are non-zero. Examples @@ -354,7 +354,7 @@ def argwhere(condition): def scatter(data, indices, updates, axis): - """Update data at positions defined by indices with values in updates + """Update data at positions defined by indices with values in updates. Parameters ---------- @@ -368,7 +368,7 @@ def scatter(data, indices, updates, axis): The values to update. axis : int - The axis to scatter on + The axis to scatter on. Returns ------- @@ -379,7 +379,7 @@ def scatter(data, indices, updates, axis): def scatter_add(data, indices, updates, axis): - """Update data by adding values in updates at positions defined by indices + """Update data by adding values in updates at positions defined by indices. Parameters ---------- @@ -393,7 +393,7 @@ def scatter_add(data, indices, updates, axis): The values to add. axis : int - The axis to scatter_add on + The axis to scatter_add on. Returns ------- @@ -419,7 +419,7 @@ def scatter_nd(data, indices, updates, mode="update"): updates : relay.Expr The values to update. - mode : string + mode : string, optional The accumulation mode for scatter. "update" or "add" Returns @@ -489,14 +489,14 @@ def take(data, indices, axis=None, batch_dims=0, mode="clip"): data : relay.Expr The source array. - indices : rely.Expr + indices : relay.Expr The indices of the values to extract. axis : int, optional The axis over which to select values. By default, the flattened input array is used. - batch_dims : int + batch_dims : int, optional The number of batch dimensions. By default is 0. mode : str, optional @@ -521,7 +521,7 @@ def full(fill_value, shape=(), dtype=""): fill_value : relay.Expr The value to fill. Must be a scalar. - shape : tuple of int or relay.Expr + shape : tuple of int or relay.Expr, optional The shape of the target. dtype : data type, optional (defaults to data type of the fill value) @@ -566,7 +566,7 @@ def arange(start, stop=None, step=None, dtype="float32"): """Return evenly spaced values within a given interval. .. note:: - Similar to ``numpy.arange``, when only one argument is given, it is used + Similar to ``numpy.arange``. 
When only one argument is given, it is used as `stop` instead of `start` while `start` takes default value 0. Warning: Undefined behavior when dtype is incompatible with start/stop/step. @@ -574,14 +574,14 @@ def arange(start, stop=None, step=None, dtype="float32"): Parameters ---------- - start : tvm.Expr, optional + start : relay.Expr, optional Start of interval. The interval includes this value. The default start value is 0. - stop : tvm.Expr + stop : relay.Expr Stop of interval. The interval does not include this value. - step : tvm.Expr, optional + step : relay.Expr, optional Spacing between values. The default step size is 1. dtype : str, optional @@ -601,7 +601,7 @@ def arange(start, stop=None, step=None, dtype="float32"): relay.arange(1, 5, 1.5) = [1, 2.5, 4] """ if step is None: - step = const(1, dtype) + step = const(1, dtype=dtype) if stop is None: stop = start @@ -621,7 +621,7 @@ def meshgrid(data, indexing="ij"): data : Union(List[relay.Expr], Tuple[relay.Expr]) A list of tensors, which must be either scalars or 1-D vectors. - indexing : str + indexing : str, optional Indexing mode, either "ij" for matrix indexing or "xy" for Cartesian indexing. Returns @@ -655,6 +655,11 @@ def repeat(data, repeats, axis): """Repeats elements of an array. By default, repeat flattens the input array into 1-D and then repeats the elements. + Parameters + ---------- + data : relay.Expr + The input tensor. + repeats : int The number of repetitions for each element. @@ -762,10 +767,10 @@ def reverse_sequence(data, seq_lengths, seq_axis=1, batch_axis=0): The tensor to be reversed. seq_lengths : relay.Expr - A 1D Tensor with length a.dims[batch_axis] - Must be one of the following types: int32, int64 - if seq_lengths[i] > a.dims[seq_axis], it is rounded to a.dims[seq_axis] - if seq_lengths[i] < 1, it is rounded to 1 + A 1D Tensor with length a.dims[batch_axis]. + Must be one of the following types: int32, int64. + If seq_lengths[i] > a.dims[seq_axis], it is rounded to a.dims[seq_axis]. + If seq_lengths[i] < 1, it is rounded to 1. seq_axis : int, optional The axis along which the elements will be reversed. Default is 1. @@ -841,7 +846,7 @@ def where(condition, x, y): def broadcast_to(data, shape): - """Return a scalar value array with the same type, broadcast to + """Return a scalar value array with the same type, broadcasted to the provided shape. Parameters @@ -877,7 +882,7 @@ def broadcast_to_like(data, broadcast_type): The input tensor. broadcast_type : relay.Expr - Provide the type to broadcast to. + Provide the shape to broadcast to. Returns ------- @@ -896,7 +901,7 @@ def collapse_sum_like(data, collapse_type): The input tensor. collapse_type : relay.Expr - Provide the type to collapse to. + Provide the shape to collapse to. Returns ------- @@ -942,7 +947,7 @@ def split(data, indices_or_sections, axis=0): The source array. indices_or_sections : int or tuple of int - Indices or sections to split into. Accepts an int or a tuple + Indices or sections to split into. Accepts an int or a tuple. axis : int, optional The axis over which to split. @@ -974,7 +979,7 @@ def strided_slice(data, begin, end, strides=None, axes=None, slice_mode="end"): Indices indicating end of the slice. strides : relay.Expr, Tuple[int], or List[int], optional - Specifies the stride values, it can be negative in that case, + Specifies the stride values. It can be negative. In that case, the input tensor will be reversed in that particular axis. 
axes : Tuple[int] or List[int], optional @@ -986,7 +991,7 @@ def strided_slice(data, begin, end, strides=None, axes=None, slice_mode="end"): slice_mode : str, optional The slice mode [end, size]. end: The ending indices for the slice [default]. - size: The input strides will be ignored, input end in this mode indicates + size: The input strides will be ignored. Input end in this mode indicates the size of a slice starting at the location specified by begin. If end[i] is -1, all remaining elements in that dimension are included in the slice. @@ -1031,14 +1036,14 @@ def strided_set(data, v, begin, end, strides=None): v : relay.Expr The data to be set. - begin: relay.Expr, Tuple[int], or List[int] + begin : relay.Expr, Tuple[int], or List[int] The indices to begin with in the slicing. - end: relay.Expr, Tuple[int], or List[int] + end : relay.Expr, Tuple[int], or List[int] Indices indicating end of the slice. strides: relay.Expr, Tuple[int], or List[int], optional - Specifies the stride values, it can be negative in that case, + Specifies the stride values. It can be negative. In that case, the input tensor will be reversed in that particular axis. Returns @@ -1060,17 +1065,17 @@ def slice_like(data, shape_like, axes=None): """Slice the first input with respect to the second input. For an input array with shape ``(d1, d2, ..., dk)``, `slice_like` operation slices the - the input array corresponding size of second array. By default will slice on all axes. + input array corresponding to the size of the second array. By default will slice on all axes. Parameters ---------- - data : tvm.relay.Expr + data : relay.Expr The source array. - shape_like : tvm.relay.Expr - The new shape. + shape_like : relay.Expr + An array based on which shape, the result shape is computed. - axes : Optional[Tuple[int]] + axes : Tuple[int] or List[int], optional List of axes on which input data will be sliced according to the corresponding size of the second input. By default will slice on all axes. Negative axes mean counting in reverse. @@ -1083,17 +1088,17 @@ def slice_like(data, shape_like, axes=None): def layout_transform(data, src_layout, dst_layout): - """Transform the layout of a tensor + """Transform the layout of a tensor. Parameters ---------- data : relay.Expr - The source tensor to be transformed + The source tensor to be transformed. - src_layout: str + src_layout : str The source layout. (e.g NCHW) - dst_layout: str + dst_layout : str The destination layout. (e.g. NCHW16c) Returns @@ -1115,7 +1120,7 @@ def reverse_reshape(data, newshape): .. code-block:: python data.shape = (10,5,4), newshape = (-1,0), reshape results in (40,5) - data.shape = (10,5,4), newshape = (-1,0), reverse_reshape results in (40,5) + data.shape = (10,5,4), newshape = (-1,0), reverse_reshape results in (50,4) Parameters ---------- @@ -1146,18 +1151,18 @@ def gather(data, axis, indices): out[i][j][k] = data[i][indices[i][j][k]][k] # if axis == 1 out[i][j][k] = data[i][j][indices[i][j][k]] # if axis == 2 - ``indices`` must have same shape as ``data``, except at dimension ``axis`` - which must just be not null. Output will have same shape as ``indices``. + ``indices`` must have the same shape as ``data``, except at dimension ``axis`` + which must just be not null. Output will have the same shape as ``indices``. Parameters ---------- - data: relay.Expr + data : relay.Expr The input data to the operator. - axis: int - The axis along which to index. negative axis is supported. + axis : int + The axis along which to index. 
Negative axis is supported. - indices: relay.Expr + indices : relay.Expr The indices of values to gather. Examples @@ -1173,7 +1178,7 @@ def gather(data, axis, indices): def gather_nd(data, indices, batch_dims=0, index_rank=None): - """Gather elements or slices from data and store to a tensor whose shape is + """Gather elements or slices from data and store them to a tensor whose shape is defined by indices. Parameters @@ -1184,11 +1189,11 @@ def gather_nd(data, indices, batch_dims=0, index_rank=None): indices : relay.Expr The shape of output tensor. - batch_dims : int + batch_dims : int, optional The number of batch dimensions. index_rank : int, optional - The size of an indexing tuple, which is a fixed value and the same as indices.shape[0] + The size of an indexing tuple, which is a fixed value and the same as indices.shape[0]. Only needed when other dimensions of indices are dynamic. Returns @@ -1208,9 +1213,9 @@ def gather_nd(data, indices, batch_dims=0, index_rank=None): indices = [[0, 1], [1, 0]] relay.gather_nd(data, indices) = [[3, 4], [5, 6]] - data = [[[0,1],[2,3]],[[4,5],[6,7]]] + data = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] indices = [[1, 0]] - relay.gather_nd(data, indices, batch_dims=1) = [[2,3],[4,5]] + relay.gather_nd(data, indices, batch_dims=1) = [[2, 3],[4, 5]] """ return _make.gather_nd(data, indices, batch_dims, index_rank) @@ -1229,10 +1234,10 @@ def sequence_mask(data, valid_length, mask_value=0, axis=0): valid_length : relay.Expr The expected (valid) length of each sequence in the tensor. - mask_value : float + mask_value : float, optional The masking value. - axis : int + axis : int, optional The axis of the length dimension. Returns @@ -1262,9 +1267,8 @@ def sequence_mask(data, valid_length, mask_value=0, axis=0): def one_hot(indices, on_value, off_value, depth, axis, dtype): - """ - Returns a one-hot tensor where the locations repsented by indices take value on_value, - other locations take value off_value. + """Returns a one-hot tensor where the locations represented by indices take value on_value, + and other locations take value off_value. Final dimension is x depth x . Parameters @@ -1313,9 +1317,6 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype): def unravel_index(indices, shape): """Convert a flat index or array of flat indices into a tuple of coordinate arrays. - Example:: - - unravel_index([22, 41, 37], [7, 6]) = [[3, 6, 6],[4, 5, 1]] - Parameters ---------- indices : relay.Expr @@ -1328,17 +1329,21 @@ def unravel_index(indices, shape): ------- result : relay.Expr The tuple of coordinate arrays. - """ + Examples + ------- + .. code-block:: python + + relay.unravel_index([22, 41, 37], [7, 6]) = + [[3, 6, 6], + [4, 5, 1]] + """ return _make.unravel_index(indices, shape) def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0): """Converts a sparse representation into a dense tensor. - Example:: - - sparse_to_dense([[0, 0], [1, 1]], [2, 2], [3, 3], 0) = [[3, 0], [0, 3]] - Parameters ---------- sparse_indices : relay.Expr @@ -1350,7 +1355,7 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 sparse_values : relay.Expr A 0-D or 1-D tensor containing the sparse values for the sparse indices. - default_value : relay.Expr + default_value : relay.Expr, optional A 0-D tensor containing the default value for the remaining locations. Defaults to 0. 
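As a complement to the parameter descriptions above, a hedged end-to-end sketch of the call. It assumes the interpreter-style "debug" executor with the default LLVM host target is available; `output_shape` is passed as a plain list, and the expected result matches the docstring example.

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import relay

    indices = relay.const(np.array([[0, 0], [1, 1]], dtype="int64"))
    values = relay.const(np.array([3, 3], dtype="int32"))
    out = relay.sparse_to_dense(indices, [2, 2], values)  # default_value=0

    mod = tvm.IRModule.from_expr(relay.Function([], out))
    result = relay.create_executor("debug", mod=mod).evaluate()()
    # result.numpy() -> [[3, 0], [0, 3]]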
@@ -1358,8 +1363,15 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 ------- result : relay.Expr Dense tensor of shape output_shape. Has the same type as sparse_values. - """ + Examples + ------- + .. code-block:: python + + relay.sparse_to_dense([[0, 0], [1, 1]], [2, 2], [3, 3], 0) = + [[3, 0], + [0, 3]] + """ if default_value == 0: default_value = const(0) if isinstance(output_shape, Expr): @@ -1368,19 +1380,19 @@ def sparse_to_dense(sparse_indices, output_shape, sparse_values, default_value=0 def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): - """ - Returns a tensor with the diagonals of input tensor replaced with the provided diagonal values. + """Returns a tensor with the diagonals of input tensor replaced with the provided + diagonal values. Parameters ---------- data : relay.Expr - Input Tensor. + Input tensor. diagonal : relay.Expr Values to be filled in the diagonal. k : int or tuple of int, optional - Diagonal Offset(s). The diagonal or range of diagonals to set. (0 by default) + Diagonal offset(s). The diagonal or range of diagonals to set. (0 by default) Positive value means superdiagonal, 0 refers to the main diagonal, and negative value means subdiagonals. k can be a single integer (for a single diagonal) or a pair of integers specifying the low and high ends of a matrix band. @@ -1440,41 +1452,39 @@ def matrix_set_diag(data, diagonal, k=0, align="RIGHT_LEFT"): def adv_index(inputs): - """ - Numpy style advanced indexing. Index with a list of tensors. + """Numpy style advanced indexing. Index with a list of tensors. Parameters ---------- inputs : Union(List[relay.Expr], Tuple[relay.Expr]) Input tensor and indices. - The first tensor is input data and rests are indices. + The first tensor is the input data and the rest are the indices. Returns ------- - result: relay.Expr + result : relay.Expr Output tensor. """ return _make.adv_index(Tuple(inputs)) def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value): - """ - Fill rows in a sparse matrix that do no contain any values. Values are placed in the first + """Fill rows in a sparse matrix that do not contain any values. Values are placed in the first column of empty rows. The sparse array is in COO format. - It returns a TupleWrapper with 3 outputs + It returns a TupleWrapper with 3 outputs. Parameters ---------- sparse_indices : relay.Expr - A 2-D tensor[N, ndims] of integers containing location of sparse values, where N is + A 2-D tensor[N, ndims] of integers containing the locations of sparse values, where N is the number of sparse values and n_dim is the number of dimensions of the dense_shape. - The first column of this relay parameter must be sorted in ascending order. + The first column of this parameter must be sorted in ascending order. sparse_values : relay.Expr A 1-D tensor[N] containing the sparse values for the sparse indices. dense_shape : relay.Expr - A 1-D tensor[ndims] which contains shape of the dense output tensor. + A 1-D tensor[ndims] which contains the shape of the dense output tensor. default_value : relay.Expr A 1-D tensor[1] containing the default value for the remaining locations. @@ -1490,7 +1500,7 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v empty_row_indicator : relay.Expr A 1-D tensor[dense_shape[0]] filled with zeros and ones - indicating whether the particular row is empty or full respectively + indicating whether the particular row is empty or full respectively. 
Note ---- @@ -1508,22 +1518,29 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v [0, 3], [2, 0], [3, 1]] + sparse_values = [1, 2, 3, 4] + default_value = [10] + dense_shape = [5, 6] - new_sparse_indices, empty_row_indicator, new_sparse_values, slice_element_index = + + new_sparse_indices, empty_row_indicator, new_sparse_values = relay.sparse_fill_empty_rows( sparse_indices, sparse_values, default_value, dense_shape) + new_sparse_indices = [[0, 1], - [0, 3], - [1, 0], - [2, 0], - [3, 1], - [4, 0]] + [0, 3], + [1, 0], + [2, 0], + [3, 1], + [4, 0]] + empty_row_indicator = [False, True, False, False, True] + new_sparse_values = [1, 2, 10, 3, 4, 10] """ new_sparse_indices, new_sparse_values, empty_row_indicator = TupleWrapper( @@ -1537,36 +1554,42 @@ def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_v def sparse_reshape(sparse_indices, prev_shape, new_shape): - """ - Reshape a Sparse Tensor. The sparse array is in COO format. + """Reshape a sparse tensor. The sparse array is in COO format. Parameters ---------- sparse_indices : relay.Expr A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the - number of sparse values and n_dim is the number of dimensions of the dense_shape + number of sparse values and n_dim is the number of dimensions of the dense_shape. + prev_shape : relay.Expr - A 1-D tensor containing the previous shape of the dense tensor + A 1-D tensor containing the previous shape of the dense tensor. + new_shape : relay.Expr - A 1-D tensor containing the new shape of the dense tensor + A 1-D tensor containing the new shape of the dense tensor. + Returns ------- result: relay.Expr Output tensor. + Examples -------- .. code-block:: python sparse_indices = [[0, 0, 0], - [0, 0, 1], - [0, 1, 0], - [1, 0, 0], - [1, 2, 3]] - prev_shape = [2, 3, 4] + [0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [1, 2, 3]] + + prev_shape = [2, 3, 6] + new_shape = [9, -1] + new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices, - prev_shape, - new_shape) + prev_shape, + new_shape) new_sparse_indices = [[0, 0], [0, 1], [1, 2], @@ -1578,8 +1601,7 @@ def sparse_reshape(sparse_indices, prev_shape, new_shape): def segment_sum(data, segment_ids, num_segments=None): - """ - Computes the sum along segment_ids along axis 0. If multiple segment_ids reference the same + """Computes the sum along segment_ids along axis 0. If multiple segment_ids reference the same location their contributions add up. result[index, j, k, ...] = Σi... data[i, j, k,..] where index = segment_ids[i] This op is much better understood with visualization articulated in the following links and @@ -1591,20 +1613,24 @@ def segment_sum(data, segment_ids, num_segments=None): Parameters ---------- data : relay.Expr - Input Tensor. It can be of any type and multi-dimensional + Input tensor. It can be of any type and multi-dimensional. + segment_ids : relay.Expr A 1-D int32/int64 tensor containing the segment_ids of the rows to calculate the output sum upon. It defines a mapping from the zeroth dimension of data onto segment_ids. The segment_ids tensor should be the size of the first dimension, d0, with consecutive IDs in the range 0 to k, where k [ 1, 3, 6, 10, 15, 21] @@ -1736,7 +1768,7 @@ def cumprod(data, axis=None, dtype=None, exclusive=None): -------- .. code-block:: python - a = [[1,2,3], [4,5,6]] + a = [[1, 2, 3], [4, 5, 6]] cumprod(a) # if axis is not provided, cumprod is done over the flattened input. 
-> [ 1, 2, 6, 24, 120, 720] @@ -1760,8 +1792,7 @@ def cumprod(data, axis=None, dtype=None, exclusive=None): def unique(data, is_sorted=True, return_counts=False): - """ - Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to + """Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to have the same length of `data` and element with index >= num_unique[0] has undefined value. Parameters @@ -1769,10 +1800,10 @@ def unique(data, is_sorted=True, return_counts=False): data : relay.Expr A 1-D tensor of integers. - is_sorted : bool + is_sorted : bool, optional Whether to sort the unique elements in ascending order before returning as output. - return_counts : bool + return_counts : bool, optional Whether to return the count of each unique element. Returns @@ -1781,7 +1812,8 @@ def unique(data, is_sorted=True, return_counts=False): A 1-D tensor containing the unique elements of the input data tensor. indices : relay.Expr - A 1-D tensor containing the index of each data element in the output tensor. + A 1-D tensor containing the indeces of the first occurence of each unique value + in the input tensor. inverse_indices : relay.Expr A 1-D tensor. For each entry in data, it contains the index of that data element in the @@ -1790,28 +1822,35 @@ def unique(data, is_sorted=True, return_counts=False): num_unique : relay.Expr A 1-D tensor with size=1 containing the number of unique elements in the input data tensor. - counts (optional) : relay.Expr + counts : relay.Expr, optional A 1-D tensor containing the count of each unique element in the output. Examples -------- .. code-block:: python - [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) - output = [4, 5, 1, 2, 3, _, _, _] - indices = [0, 1, 2, 3, 4, 4, 0, 1] - num_unique = [5] - - [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True) - output = [4, 5, 1, 2, 3, _, _, _] - indices = [0, 1, 2, 3, 4, 4, 0, 1] - num_unique = [5] - counts = [2, 2, 1, 1, 2, _, _, _] - - [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) - output = [1, 2, 3, 4, 5, _, _, _] - indices = [3, 4, 0, 1, 2, 2, 3, 4] - num_unique = [5] + [output, indices, inverse_indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], + False, + False) + output = [4, 5, 1, 2, 3, _, _, _] + indices = [0, 1, 2, 3, 4, _, _, _] + inverse_indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + + [output, indices, inverse_indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], + False, + True) + output = [4, 5, 1, 2, 3, _, _, _] + indices = [0, 1, 2, 3, 4, _, _, _] + inverse_indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + counts = [2, 2, 1, 1, 2, _, _, _] + + [output, indices, inverse_indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) + output = [1, 2, 3, 4, 5, _, _, _] + indices = [2, 3, 4, 0, 1, _, _, _] + inverse_indices = [3, 4, 0, 1, 2, 2, 3, 4] + num_unique = [5] """ if return_counts: return TupleWrapper(_make.unique(data, is_sorted, return_counts), 5) @@ -1830,12 +1869,12 @@ def invert_permutation(data): Parameters ---------- data : relay.Expr - The source data to be invert permuated. + The source data to be invert permuted. Returns ------- ret : relay.Expr - Invert permuated data. Has the same type as data. + Invert permuted data. Has the same type as data. 
Examples -------- @@ -1850,8 +1889,7 @@ def invert_permutation(data): def stft( data, n_fft, hop_length=None, win_length=None, window=None, normalized=False, onesided=True ): - """ - The STFT computes the Fourier transform of short overlapping windows of the input. + """The STFT computes the Fourier transform of short overlapping windows of the input. This gives frequency components of the signal as they change over time. Parameters @@ -1860,7 +1898,7 @@ def stft( Either a 1-D tensor or a 2-D batch tensor. n_fft : int - The size of Fourier transform + The size of Fourier transform. hop_length : int, optional The distance between neighboring sliding window frames. If is None, @@ -1892,8 +1930,9 @@ def stft( data = [1, 2, 3, 4, 5, 6] window = [4, 3, 2] [n_fft, hop_length, win_length, normalized, onesided] = [3, 3, 3, False, True] + relay.stft(data, n_fft, hop_length, win_length, window, normalized, onesided) - -> [[[15.0000, 0.0000], [34.0000, 0.0000]], [[ 4.5000, 0.8660], [ 1.0000, -1.7321]]] + -> [[[16.0000, 0.0000], [43.0000, 0.0000]], [[ -2.0000, 0.0000], [ 2.5000, -2.5981]]] """ if hop_length is None: hop_length = n_fft // 4 @@ -1908,17 +1947,16 @@ def stft( def trilu(data, k, upper=True): - """ - Given a 2-D matrix or batches of 2-D matrices, returns the + """Given a 2-D matrix or batches of 2-D matrices, returns the upper or lower triangular part of the tensor. Parameters ---------- - data: relay.Expr + data : relay.Expr The tensor that trilu will be applied to. Must be either a 2D matrix or a tensor of batches of 2D matrices. - k: int + k : int The number of diagonals above or below the main diagonal to exclude or include. @@ -1926,7 +1964,6 @@ def trilu(data, k, upper=True): If True, only upper triangular values of input are kept, if False, the lower triangular values are kept. - Returns ------- ret : relay.Expr diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index c96bc940f920..efd37f2ecd22 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -595,7 +595,7 @@ def test_full_infer_type(): # change the shape and dtype x = relay.var("x", relay.TensorType((), "float32")) y = relay.full(x, (1, 2), "int8") - "shape=" in y.astext() + assert "shape=" in y.astext() yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((1, 2), "int8") @@ -1580,7 +1580,7 @@ def ref_res( new_shape_np: np.ndarray, ): """ - This function calculates the expected output of sparseshape operator given the inputs. + This function calculates the expected output of sparse_reshape operator given the inputs. 
""" new_sparse_indices = np.ones( From 38bf0ed14ac8a5bded87ec9c60822e60785adf0f Mon Sep 17 00:00:00 2001 From: Adam Straw Date: Wed, 4 Jan 2023 15:45:14 -0800 Subject: [PATCH 115/286] [Hexagon] Denote DMA cache bypass as experimental feature (#13699) --- src/driver/driver_api.cc | 8 +++++++- src/tir/transforms/lower_async_dma.cc | 3 ++- .../contrib/test_hexagon/test_async_dma_pipeline.py | 2 +- .../contrib/test_hexagon/test_software_pipeline_async.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 10d9e8023a61..92769d1cef45 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -53,9 +53,15 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("tir.dma_bypass_cache", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.vtcm_capacity", Integer); +// WARNING: May cause coherency issues resulting data miscompares +// Experimental feature that, when enabled by the runtime, bypasses the cache when using DMA. When +// bypassing the cache TVM must manage cache coherency in software. Software managed cache coherency +// can be tricky e.g. it is yet to be proven out in the Hexagon runtime. Hence the warning above and +// the "experimental" notation for this feature. +TVM_REGISTER_PASS_CONFIG_OPTION("tir.experimental_dma_bypass_cache", Bool); + using tvm::Array; using tvm::transform::Pass; diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 94769dae0899..5abdc5da84d7 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -211,7 +211,8 @@ namespace transform { Pass LowerAsyncDMA() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { auto fptr = f.CopyOnWrite(); - bool dma_bypass_cache = ctx->GetConfig("tir.dma_bypass_cache", Bool(false)).value(); + bool dma_bypass_cache = + ctx->GetConfig("tir.experimental_dma_bypass_cache", Bool(false)).value(); fptr->body = AsyncDMALowerer(dma_bypass_cache)(std::move(fptr->body)); return f; }; diff --git a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py index 914a26c51180..2b6bca008e05 100644 --- a/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py +++ b/tests/python/contrib/test_hexagon/test_async_dma_pipeline.py @@ -275,7 +275,7 @@ def evaluate( with tvm.transform.PassContext( config={ "tir.use_async_copy": use_async_copy, - "tir.dma_bypass_cache": 1, + "tir.experimental_dma_bypass_cache": 1, "tir.merge_async_commit_queue_scope": merge_async_commit_queue_scope, } ): diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py index 387d0f20c4c2..c831472a521d 100644 --- a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -180,7 +180,7 @@ def test_async_software_pipeline( with tvm.transform.PassContext( config={ "tir.use_async_copy": 1, - "tir.dma_bypass_cache": 1, + "tir.experimental_dma_bypass_cache": 1, "tir.merge_async_commit_queue_scope": False, } ): From 38e2d2d4ad8da1848f4f3087d35dd14df62ef9c9 Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 5 Jan 2023 05:20:15 +0530 
Subject: [PATCH 116/286] [BENCHMARKS][CLML] Adreno benchmarks with CLML BYOC path added (#13696) * [BENCHMARKS][CLML] Adreno benchmarks with CLML BYOC path added Various benchmarks enabled for CLML BYOC backend for Adreno GPU Networks resnet-18, resnet-34, resnet-50, densenet-121, inception_v3, mobilenetv1, squeezenet_v1.0, squeezenet_v1.1 are added with FP16 and FP32 dtypes. * * lint error --- apps/benchmark/README.md | 15 +- .../benchmark/adreno/adreno_gpu_bench_clml.py | 282 ++++++++++++++++++ apps/benchmark/adreno/bench.sh | 5 + python/tvm/relay/op/contrib/clml.py | 32 ++ src/runtime/contrib/clml/clml_runtime.cc | 47 ++- tests/scripts/ci.py | 14 + 6 files changed, 390 insertions(+), 5 deletions(-) create mode 100755 apps/benchmark/adreno/adreno_gpu_bench_clml.py diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md index ccac79df47d8..44c54b1cf297 100644 --- a/apps/benchmark/README.md +++ b/apps/benchmark/README.md @@ -134,10 +134,23 @@ python3 gpu_imagenet_bench.py --model gfx900 --target rocm Adreno benchmarks are automated over the docker - [ci_adreno](https://github.com/apache/tvm/blob/main/docker/Dockerfile.ci_adreno). Adreno docker share the Android devices from host. It is adviced to have host adb version same as docker, which is ```1.0.41``` -Below command runs all the benchmarks over given Android device. +Below command runs all (OpenCL native, CLML SDK) the benchmarks over given Android device. ```bash export ANDROID_SERIAL= ./tests/scripts/ci.py adreno -b ``` +Below command runs all OpenCL native benchmarks over given Android device. +```bash +export ANDROID_SERIAL= +./tests/scripts/ci.py adreno -n +``` +CLML SDK benchmarks require CLML SDK path to be exported and the SDK version should match with target device's SDK version. + +Below command runs all CLML SDK benchmarks over given Android device. +```bash +export ADRENO_OPENCL= +export ANDROID_SERIAL= +./tests/scripts/ci.py adreno -c +``` Note: Tuning cache is implicite through tophub repo for all the benchmarks and is tuned over Snapdragon Gen 1. diff --git a/apps/benchmark/adreno/adreno_gpu_bench_clml.py b/apps/benchmark/adreno/adreno_gpu_bench_clml.py new file mode 100755 index 000000000000..17c483fe2c76 --- /dev/null +++ b/apps/benchmark/adreno/adreno_gpu_bench_clml.py @@ -0,0 +1,282 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Benchmark script for various models on Adreno GPU. 
+""" +import argparse + +import numpy as np + +import os +import sys +import tvm +from tvm import te +from tvm.relay import testing +from tvm.contrib.utils import tempdir +from tvm.relay.op.contrib import clml +import tvm.contrib.graph_executor as runtime +from tvm import relay +from tvm import autotvm +from tvm.contrib import utils, ndk + + +def get_network(name, batch_size, dtype="float32"): + """Get the symbol definition and random weight of a network + + Parameters + ---------- + name: str + The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ... + batch_size: int + batch size + dtype: str + Data type + + Returns + ------- + net: tvm.IRModule + The relay function of network definition + params: dict + The random parameters for benchmark + input_shape: tuple + The shape of input tensor + output_shape: tuple + The shape of output tensor + """ + input_shape = (batch_size, 3, 224, 224) + output_shape = (batch_size, 1000) + + if name == "mobilenet": + net, params = testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) + net, params = testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif "resnet" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.resnet.get_workload( + num_layers=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "vgg" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.vgg.get_workload( + num_layers=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "densenet" in name: + n_layer = int(name.split("-")[1]) + net, params = testing.densenet.get_workload( + densenet_size=n_layer, batch_size=batch_size, dtype=dtype + ) + elif "squeezenet" in name: + version = name.split("_v")[1] + net, params = testing.squeezenet.get_workload( + batch_size=batch_size, version=version, dtype=dtype + ) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + block = get_model("resnet18_v1", pretrained=True) + net, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = net["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + net = tvm.IRModule.from_expr(net) + else: + raise ValueError("Unsupported network: " + name) + + return net, params, input_shape, output_shape + + +def print_progress(msg): + """print progress message + + Parameters + ---------- + msg: str + The message to print + """ + sys.stdout.write(msg + "\r") + sys.stdout.flush() + + +def tune_tasks( + tasks, + measure_option, + n_trial=1024, + early_stopping=None, + log_filename="tuning.log", +): + from tvm.autotvm.tuner import XGBTuner + + tmp_log_file = log_filename + ".tmp" + + for i, tsk in enumerate(reversed(tasks)): + print("Task: ", tsk) + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + tuner_obj = XGBTuner(tsk, loss_type="rank") + + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune( + n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + autotvm.callback.log_to_file(tmp_log_file), + ], + ) + + autotvm.record.pick_best(tmp_log_file, log_filename) + + +def evaluate_network(network, target, target_host, dtype, repeat): + print_progress(network) + net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype) + + # Auto Tuning + tune_log = 
"adreno-" + network + "-" + dtype + ".log" + tuning_options = { + "log_filename": tune_log, + "early_stopping": None, + "measure_option": autotvm.measure_option( + builder=autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15), + runner=autotvm.RPCRunner( + args.rpc_key, + host=args.host, + port=args.port, + number=3, + timeout=600, + ), + ), + } + if args.tune: + tasks = autotvm.task.extract_from_program( + net, target=target, target_host=target_host, params=params + ) + tune_tasks(tasks, **tuning_options) + + print_progress("%-20s building..." % network) + + # Build the tuning log + if os.path.exists(tune_log): + with autotvm.apply_history_best(tune_log): + with tvm.transform.PassContext(opt_level=3): + net = clml.partition_for_clml(net, params) + lib = relay.build( + net, target=tvm.target.Target(target, host=target_host), params=params + ) + else: + with tvm.transform.PassContext(opt_level=3): + net = clml.partition_for_clml(net, params) + + lib = relay.build( + net, target=tvm.target.Target(target, host=target_host), params=params + ) + + tmp = tempdir() + + filename = "%s.so" % network + lib.export_library(tmp.relpath(filename), ndk.create_shared) + + # upload library and params + print_progress("%-20s uploading..." % network) + + # connect to remote device + tracker = tvm.rpc.connect_tracker(args.host, args.port) + remote = tracker.request(args.rpc_key) + + dev = remote.device(str(target), 0) + remote.upload(tmp.relpath(filename)) + + rlib = remote.load_module(filename) + module = runtime.GraphModule(rlib["default"](dev)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # evaluate + print_progress("%-20s evaluating..." % network) + ftimer = module.module.time_evaluator("run", dev, number=1, repeat=repeat) + prof_res = np.array(ftimer().results) * 1000 # multiply 1000 for converting to millisecond + print( + "%-20s %-19s (%s)" + % (network + "-" + dtype, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)) + ) + return (np.mean(prof_res), np.std(prof_res)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--network", + type=str, + choices=[ + "resnet-18", + "resnet-34", + "resnet-50", + "vgg-16", + "vgg-19", + "densenet-121", + "inception_v3", + "mobilenet", + "squeezenet_v1.0", + "squeezenet_v1.1", + ], + help="The name of neural network", + ) + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=9190) + parser.add_argument("--rpc-key", type=str, default="android") + parser.add_argument("--repeat", type=int, default=30) + parser.add_argument("--tune", type=bool, default=False) + args = parser.parse_args() + + if args.network is None: + networks = [ + "resnet-18", + "resnet-34", + "resnet-50", + # "vgg-16", + # "vgg-19", + "densenet-121", + "inception_v3", + "mobilenet", + "squeezenet_v1.0", + "squeezenet_v1.1", + ] + else: + networks = [args.network] + + target = "opencl" + target_host = "llvm -mtriple=arm64-linux-android" + + print("--------------------------------------------------") + print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)")) + print("--------------------------------------------------") + + results = {} + + for network in networks: + ftime = evaluate_network(network, target, target_host, "float32", args.repeat) + results[network + "-float32"] = ftime + ftime = evaluate_network(network, target, target_host, "float16", args.repeat) + results[network + "-float16"] 
= ftime + + print("----------------------------------------------------------------------") + print("%-30s %-30s" % ("Network Name", "Mean Inference Time (std dev)")) + print("----------------------------------------------------------------------") + for key, val in results.items(): + print("%-30s %-30s (%s)" % (key, "%.2f ms" % val[0], "%.2f ms" % val[1])) diff --git a/apps/benchmark/adreno/bench.sh b/apps/benchmark/adreno/bench.sh index 7d46685b8654..7f9adeea5251 100755 --- a/apps/benchmark/adreno/bench.sh +++ b/apps/benchmark/adreno/bench.sh @@ -55,5 +55,10 @@ if [ "texture" == $1 ] ; then python3 apps/benchmark/adreno/adreno_gpu_bench_texture.py --host ${TVM_TRACKER_HOST} --port ${TVM_TRACKER_PORT} --rpc-key ${RPC_DEVICE_KEY} fi +if [ "clml" == $1 ] ; then + python3 apps/benchmark/adreno/adreno_gpu_bench_clml.py --host ${TVM_TRACKER_HOST} --port ${TVM_TRACKER_PORT} --rpc-key ${RPC_DEVICE_KEY} +fi + + kill ${TRACKER_PID} kill ${DEVICE_PID} diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 02e4f62bed24..77882917b1ad 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -19,9 +19,12 @@ import tvm from tvm import relay +from tvm.ir import Op from tvm._ffi import register_func from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name +from tvm.relay.expr_functor import ExprMutator +from tvm.relay.expr import Call, TupleGetItem from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple from .register import register_pattern_table @@ -48,6 +51,33 @@ def is_clml_runtime_enabled(): return False +class RemoveDropout(ExprMutator): + """ + Removes all nn.dropout from an expr. + """ + + def visit_tuple_getitem(self, op: TupleGetItem) -> relay.expr.Expr: + visit = super().visit_tuple_getitem(op) + if visit.index != 0: + return visit + if ( + isinstance(visit.tuple_value, Call) + and isinstance(visit.tuple_value.op, Op) + and visit.tuple_value.op.name == "nn.dropout" + and visit.index == 0 + ): + return visit.tuple_value.args[0] + return visit + + +@transform.function_pass(opt_level=0) +class RemoveDropoutPass: + def transform_function( + self, func: relay.function.Function, mod: tvm.IRModule, _: tvm.transform.PassContext + ) -> relay.function.Function: + return RemoveDropout().visit(func) + + def partition_for_clml(mod, params=None): """Partition the graph greedily offloading supported operators to CLML Library. 
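For context, a short usage sketch for this partitioning helper, with hypothetical names: `mod` and `params` are assumed to come from any Relay frontend import, and the build mirrors the target setup used by the benchmark script above. Dropout is stripped by the RemoveDropoutPass added above before pattern matching.

.. code-block:: python

    import tvm
    from tvm import relay
    from tvm.relay.op.contrib import clml

    # mod, params: a Relay module/params pair from some frontend (assumed).
    mod = clml.partition_for_clml(mod, params)
    with tvm.transform.PassContext(opt_level=3):
        target = tvm.target.Target("opencl", host="llvm -mtriple=arm64-linux-android")
        lib = relay.build(mod, target=target, params=params)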
@@ -70,6 +100,7 @@ def partition_for_clml(mod, params=None): seq = tvm.transform.Sequential( [ transform.InferType(), + RemoveDropoutPass(), transform.FoldConstant(), transform.MergeComposite(clml_pattern_table()), transform.AnnotateTarget("clml", False), @@ -289,6 +320,7 @@ def check_default_op(extract): ("clml.global_max_pool2d", is_op("nn.global_max_pool2d")(wildcard()), check_default_op), ("clml.relu", is_op("nn.relu")(wildcard()), check_default_op), ("clml.clip", is_op("clip")(wildcard()), check_default_op), + ("clml.batch_flatten", is_op("nn.batch_flatten")(wildcard()), check_default_op), ] diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 6396fce4858b..1fb694a91201 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -259,9 +259,14 @@ class CLMLRuntime : public JSONRuntimeBase { layer_.in_placeholder[i]->memory = static_cast( ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); cl_event cpy_evt = NULL; + cl_event* evt = &cpy_evt; + if (workspace->IsProfiling(tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM( queue, layer_.in_placeholder[i]->tensor, layer_.in_placeholder[i]->memory, - layer_.inputs[i]->tensor, layer_.inputs[i]->memory, 0, NULL, &cpy_evt); + layer_.inputs[i]->tensor, layer_.inputs[i]->memory, 0, NULL, evt); ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result; } else { DLDataType tvm_dtype = const_cast(data_entry_[eid])->dtype; @@ -277,7 +282,8 @@ class CLMLRuntime : public JSONRuntimeBase { } for (size_t i = 0; i < this->layer_.function.size(); ++i) { - if (getenv("CLML_PROFILING")) { + // Make CLML subgraphs accounted by OpenCLTimerNode. + if (getenv("CLML_PROFILING") || workspace->IsProfiling(tentry->device)) { evts.resize(evts.size() + 1); cl_event* evt = &(evts.back()); result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], @@ -317,10 +323,14 @@ class CLMLRuntime : public JSONRuntimeBase { layer_.out_placeholder[i]->memory = static_cast( ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); cl_event cpy_evt = NULL; + cl_event* evt = &cpy_evt; + if (workspace->IsProfiling(tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM( queue, layer_.outputs[i]->tensor, layer_.outputs[i]->memory, - layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, NULL, - &cpy_evt); + layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, NULL, evt); ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result; } else { DLDataType tvm_dtype = const_cast(data_entry_[eid])->dtype; @@ -407,6 +417,10 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreatePadLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("nn.batch_flatten" == op_name) { + auto out = CreateBatchFlattenLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else if ("clip" == op_name) { auto out = CreateClipLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); @@ -1070,6 +1084,31 @@ class CLMLRuntime : public JSONRuntimeBase { return output; } + /*! + * \brief Create a Batch Flatten layer. + * + * \param layer The CLML layer to build. 
Containing inputs, outputs and the CLML output. + * \param node The JSON representation of the operator. + */ + std::shared_ptr CreateBatchFlattenLayer( + CachedLayer* layer, const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + + result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor, + output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result; + + layer_.func_ins.push_back(input); + layer->function.push_back(op); + return output; + } + /*! * \brief Create a Reshape layer. * diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 756b269d0e50..700febd353d0 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -728,12 +728,26 @@ def add_subparser( ], ), "benchmarks": ( + "run Adreno Benchmarks (Native OpenCL, CLML SDK)", + [ + "./apps/benchmark/adreno/bench.sh texture " + + os.environ.get("ANDROID_SERIAL", ""), + "./apps/benchmark/adreno/bench.sh clml " + os.environ.get("ANDROID_SERIAL", ""), + ], + ), + "nativebenchmarks": ( "run Adreno Texture Benchmarks", [ "./apps/benchmark/adreno/bench.sh texture " + os.environ.get("ANDROID_SERIAL", ""), ], ), + "clmlbenchmarks": ( + "run Adreno CLML SDK Benchmarks", + [ + "./apps/benchmark/adreno/bench.sh clml " + os.environ.get("ANDROID_SERIAL", ""), + ], + ), }, ), ] From 9f82ee6cee9f34746b4d4bb80a405a1fae278fc6 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 4 Jan 2023 18:30:09 -0600 Subject: [PATCH 117/286] [VTA] Provide zero-initialization for VTAGenericInsn (#13698) Previously, this line caused a warning for `-Wmaybe-uninitialized` when compiling in g++ 11.3.0. --- vta/runtime/runtime.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index b139fbda6819..c3d37a13133b 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -915,7 +915,7 @@ class InsnQueue : public BaseQueue { protected: /*! \return Add new instruction to the buffer. */ VTAGenericInsn* NextInsn() { - VTAGenericInsn insn; + VTAGenericInsn insn = {}; dram_buffer_.push_back(insn); return &dram_buffer_.back(); } From 0ad67e8b220668a1f1116f6dd49d37dcd63676b4 Mon Sep 17 00:00:00 2001 From: Qianshui Date: Thu, 5 Jan 2023 13:34:13 +0800 Subject: [PATCH 118/286] [Tensorize][runtime] Add support for AMX(Advanced Matrix Extensions) through Tensor intrinsics (#13642) * add AMX config functions and building option. * amx tensor intrinsics and u8s8s32 matmul testcase * add int8 dense kernel use amx tensorize * add int8 dense kernel use amx tensorize * add amx init() and config() for dense test case * correct the amx config * fix lint. * fix dense schedule * remove operation of signal stack * fix nit * unified amx and vnni compute, remove dup one * fix lint * adopt to x86 int8 dense compute method; * Revert "adopt to x86 int8 dense compute method;" This reverts commit 5718a059c69972cf71ea082a3303b5c29fa2d21f. * restore schedule ruls specially for ms dense_vnni * add vnni ms target attributes * remove the misoperations * Revert "restore schedule ruls specially for ms dense_vnni" This reverts commit 2bda03e0ed67d86a90511ee9eb7afaa2215bad17. 
* add vnni ms target attributes and remove misops * Revert "add vnni ms target attributes" This reverts commit c2e9f26fd9d84ce75e9a0c1474df1b7e0b9ff4f3. * remove the misops --- CMakeLists.txt | 2 + cmake/config.cmake | 3 + cmake/modules/LibInfo.cmake | 1 + cmake/modules/contrib/AMX.cmake | 23 +++ python/tvm/relay/op/strategy/x86.py | 9 +- python/tvm/topi/x86/dense.py | 167 ++++++++++++++++--- python/tvm/topi/x86/dense_alter_op.py | 10 +- python/tvm/topi/x86/tensor_intrin.py | 226 +++++++++++++++++++++++++- python/tvm/topi/x86/utils.py | 7 + src/runtime/contrib/amx/amx_config.cc | 135 +++++++++++++++ src/runtime/thread_storage_scope.h | 5 + src/support/libinfo.cc | 5 + tests/python/contrib/test_amx.py | 126 ++++++++++++++ tests/python/relay/test_op_level1.py | 47 ++++++ 14 files changed, 732 insertions(+), 34 deletions(-) create mode 100644 cmake/modules/contrib/AMX.cmake create mode 100644 src/runtime/contrib/amx/amx_config.cc create mode 100644 tests/python/contrib/test_amx.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d1785f3ffa1..bd69c9d7f120 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,6 +87,7 @@ tvm_option(PICOJSON_PATH "Path to PicoJSON" "3rdparty/picojson") # Contrib library options tvm_option(USE_BYODT_POSIT "Build with BYODT software emulated posit custom datatype" OFF) tvm_option(USE_BLAS "The blas library to be linked" none) +tvm_option(USE_AMX "Enable Intel AMX" OFF) tvm_option(USE_MKL "MKL root path when use MKL blas" OFF) tvm_option(USE_DNNL "Enable DNNL codegen" OFF) tvm_option(USE_CUDNN "Build with cuDNN" OFF) @@ -498,6 +499,7 @@ include(cmake/modules/contrib/Gemmini.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/CODEGENC.cmake) include(cmake/modules/contrib/DNNL.cmake) +include(cmake/modules/contrib/AMX.cmake) include(cmake/modules/contrib/CUTLASS.cmake) include(cmake/modules/contrib/ExampleTargetHooks.cmake) include(cmake/modules/contrib/Random.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 952c4a9cc814..fba7fafe9316 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -179,6 +179,9 @@ set(USE_MKL OFF) # - OFF: Disable DNNL set(USE_DNNL OFF) +# Whether use Intel AMX instructions. +set(USE_AMX OFF) + # Whether use OpenMP thread pool, choices: gnu, intel # Note: "gnu" uses gomp library, "intel" uses iomp5 library set(USE_OPENMP none) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 779e10d01246..f73a294bd969 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -65,6 +65,7 @@ function(add_lib_info src_file) TVM_INFO_USE_CUDNN="${USE_CUDNN}" TVM_INFO_USE_CUSTOM_LOGGING="${USE_CUSTOM_LOGGING}" TVM_INFO_USE_CUTLASS="${USE_CUTLASS}" + TVM_INFO_USE_AMX="${USE_AMX}" TVM_INFO_USE_DNNL="${USE_DNNL}" TVM_INFO_USE_ETHOSN="${USE_ETHOSN}" TVM_INFO_USE_FALLBACK_STL_MAP="${USE_FALLBACK_STL_MAP}" diff --git a/cmake/modules/contrib/AMX.cmake b/cmake/modules/contrib/AMX.cmake new file mode 100644 index 000000000000..ac349c4336a2 --- /dev/null +++ b/cmake/modules/contrib/AMX.cmake @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_AMX) + file(GLOB AMX_RUNTIME_CONFIG src/runtime/contrib/amx/amx_config.cc) + list(APPEND COMPILER_SRCS ${AMX_RUNTIME_CONFIG}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") + message(STATUS "Build with Intel AMX support...") +endif() diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 7ff4dbc0ad1b..4585809f63e1 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -591,7 +591,6 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): def dense_pack_strategy_cpu(attrs, inputs, out_type, target): """dense_pack x86 strategy""" strategy = _op.OpStrategy() - if ( inputs[0].dtype == "uint8" and inputs[1].dtype == "int8" @@ -599,10 +598,10 @@ def dense_pack_strategy_cpu(attrs, inputs, out_type, target): and attrs["weight_layout"] == "NC16n4c" ): strategy.add_implementation( - wrap_compute_dense(topi.x86.dense_vnni), - wrap_topi_schedule(topi.x86.schedule_dense_vnni), - name="dense_vnni.x86", - plevel=12, + wrap_compute_dense(topi.x86.dense_int8), + wrap_topi_schedule(topi.x86.schedule_dense_int8), + name="dense_int8.x86", + plevel=13, ) else: strategy.add_implementation( diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 65a803781a57..ada19d598cdf 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -14,8 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,too-many-locals,unused-variable -# pylint: disable=no-value-for-parameter +# pylint: disable=invalid-name,too-many-locals,unused-argument +# pylint: disable=no-value-for-parameter,unused-variable """x86 dense operators""" from __future__ import absolute_import as _abs @@ -27,7 +27,9 @@ from .. 
import generic, tag from ..utils import get_const_tuple, traverse_inline from .tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake -from .utils import get_simd_32bit_lanes +from .tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids +from .tensor_intrin import acc_32x32_int32_sapphirerapids +from .utils import get_simd_32bit_lanes, target_has_vnni, target_has_amx def _schedule_dense_pack_template(cfg, s, C, O): @@ -278,11 +280,45 @@ def _callback(op): return s -def dense_vnni_compute(cfg, X, packed_w, bias=None): +@autotvm.register_topi_compute("dense_int8.x86") +def dense_int8(cfg, data, weight, bias=None, out_dtype=None): + """Compute for uint8 x int8 -> int32 dense""" + if out_dtype is None: + out_dtype = data.dtype + assert len(weight.shape) == 4 + assert data.dtype == "uint8" and weight.dtype == "int8" + _, _, n_inner, k_inner = get_const_tuple(weight.shape) # out_dim + assert n_inner == 16 and k_inner == 4 + return dense_int8_compute(cfg, data, weight, bias) + + +@autotvm.register_topi_schedule("dense_int8.x86") +def schedule_dense_int8(cfg, outs): + """Create a schedule for dense__int8""" + s = te.create_schedule([x.op for x in outs]) + mcpu = tvm.target.Target.current().mcpu + + def _callback(op): + if "dense_int8" in op.tag: + if target_has_amx(mcpu): + dense_amx_int8_schedule(cfg, s, op.output(0), outs[0]) + elif target_has_vnni(mcpu): + dense_vnni_schedule(cfg, s, op.output(0), outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def dense_int8_compute(cfg, X, packed_w, bias=None): """Compute for uint8 x int8 -> int32 dense""" m, k = X.shape n_o, _, n_i, _ = packed_w.shape ak = te.reduce_axis((0, k), name="k") + mcpu = tvm.target.Target.current().mcpu + if target_has_vnni(mcpu): + target_attr = {"schedule_rule": "meta_schedule.x86.dense_vnni"} + else: + target_attr = None C = te.compute( (m, n_o * n_i), @@ -293,16 +329,13 @@ def dense_vnni_compute(cfg, X, packed_w, bias=None): ), axis=ak, ), - tag="dense_vnni", - attrs={"schedule_rule": "dense_vnni"}, + tag="dense_int8", + attrs=target_attr, ) if bias is not None: C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j], tag=tag.BROADCAST) - a_y, _ = C.op.axis - cfg.define_split("tile_y", a_y, num_outputs=2) - return C @@ -317,6 +350,7 @@ def split_y(out): if cfg.is_fallback: return s[out].split(a_y, factor=default_y_split_factor) + cfg.define_split("tile_y", a_y, num_outputs=2) return cfg["tile_y"].apply(s, out, a_y) (a_k,) = C.op.reduce_axis @@ -348,26 +382,111 @@ def split_y(out): return s, fused -@autotvm.register_topi_compute("dense_vnni.x86") -def dense_vnni(cfg, data, weight, bias=None, out_dtype=None): - """Compute for uint8 x int8 -> int32 dense""" - if out_dtype is None: - out_dtype = data.dtype - assert len(weight.shape) == 4 - assert data.dtype == "uint8" and weight.dtype == "int8" - _, _, n_inner, k_inner = get_const_tuple(weight.shape) # out_dim - assert n_inner == 16 and k_inner == 4 - return dense_vnni_compute(cfg, data, weight, bias) +def dense_amx_int8_schedule(cfg, s, C, O, do_parallel=True): + """Schedule dense compute using AMX TMUL instruction""" + # C: The output of GEMM + # O: The output of the fused op + def split_x(out): + default_x_split_factor1 = 32 + default_x_split_factor2 = 2 + default_x_split_factor3 = 2 + default_x_split_factor4 = 2 + a_x = s[out].op.axis[-2] + + if cfg.is_fallback: + a_xo, a_xi = s[out].split(a_x, factor=default_x_split_factor1) + a_xo2, a_xo1 = s[out].split(a_xo, factor=default_x_split_factor2) + a_xo3, a_xo2 = s[out].split(a_xo2, 
factor=default_x_split_factor3) + a_xo4, a_xo3 = s[out].split(a_xo3, factor=default_x_split_factor4) + return [a_xo4, a_xo3, a_xo2, a_xo1, a_xi] + + cfg.define_split("tile_x", a_x, num_outputs=5, filter=lambda x: x.size[-1] == 32) + return cfg["tile_x"].apply(s, out, a_x) + + def split_y(out): + default_y_split_factor1 = 32 + default_y_split_factor2 = 4 + default_y_split_factor3 = 4 + default_y_split_factor4 = 4 + a_y = s[out].op.axis[-1] + + if cfg.is_fallback: + a_yo1, a_yo = s[out].split(a_y, factor=default_y_split_factor1) + a_yo2, a_yo1 = s[out].split(a_yo1, factor=default_y_split_factor2) + a_yo3, a_yo2 = s[out].split(a_yo2, factor=default_y_split_factor3) + a_yo4, a_yo3 = s[out].split(a_yo3, factor=default_y_split_factor4) + return [a_yo4, a_yo3, a_yo2, a_yo1, a_yo] + + cfg.define_split("tile_y", a_y, num_outputs=5, filter=lambda y: y.size[-1] == 32) + return cfg["tile_y"].apply(s, out, a_y) + + def split_k(out, rd_axis): + default_k_split_factor1 = 128 + default_k_split_factor2 = 2 + default_k_split_factor3 = 2 + default_k_split_factor4 = 2 + + if cfg.is_fallback: + a_ko, a_ki = s[out].split(rd_axis, factor=default_k_split_factor1) + a_ko2, a_ko1 = s[out].split(a_ko, factor=default_k_split_factor2) + a_ko3, a_ko2 = s[out].split(a_ko2, factor=default_k_split_factor3) + a_ko4, a_ko3 = s[out].split(a_ko3, factor=default_k_split_factor4) + return [a_ko4, a_ko3, a_ko2, a_ko1, a_ki] + + cfg.define_split("tile_k", rd_axis, num_outputs=5, filter=lambda y: y.size[-1] == 128) + return cfg["tile_k"].apply(s, out, rd_axis) + + a_x, a_y = C.op.axis + (a_k,) = C.op.reduce_axis + CF = s.cache_write(C, "amx.tmm") + + a_x3, a_x2, a_x1, a_xo, a_xi = split_x(C) + a_y3, a_y2, a_y1, a_yo, a_yi = split_y(C) + s[C].reorder(a_x3, a_y3, a_x2, a_y2, a_x1, a_y1, a_xo, a_yo, a_xi, a_yi) + + s[CF].compute_at(s[C], a_yo) + + (a_k_f,) = CF.op.reduce_axis + a_x_f, a_y_f = CF.op.axis + + a_xo_f, a_xi_f = s[CF].split(a_x_f, factor=32) + + a_yo_f, a_yi_f = s[CF].split(a_y_f, factor=32) + a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_ki_f = split_k(CF, a_k_f) + s[CF].reorder(a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_xo_f, a_yo_f, a_ki_f, a_xi_f, a_yi_f) + + (m, k) = CF.op.input_tensors[0].shape + (n, c, n_i, c_i) = CF.op.input_tensors[1].shape + n = n * n_i + + s[CF].tensorize(a_ki_f, dot_32x128x32_u8s8s32_sapphirerapids(LDA=int(k))) + s[C].tensorize(a_xi, acc_32x32_int32_sapphirerapids(LDC=int(n))) + + if C == O: + fused = s[O].fuse(a_x3, a_y3) + else: + a_y3, a_y2, a_y1, a_yr, a_yi = split_y(O) + a_x3, a_x2, a_x1, a_xr, a_xi = split_x(O) + + s[O].reorder(a_y3, a_x3, a_y2, a_x2, a_y1, a_x1, a_yr, a_xr, a_yi, a_xi) + s[O].vectorize(a_xi) + + fused = s[O].fuse(a_x3, a_y3) + + if do_parallel: + s[O].parallel(fused) + + return s, fused -@autotvm.register_topi_schedule("dense_vnni.x86") -def schedule_dense_vnni(cfg, outs): - """Create a schedule for dense_vnni""" +@autotvm.register_topi_schedule("dense_amx_int8.x86") +def schedule_dense_amx_int8(cfg, outs): + """Create a schedule for dense_amx_int8""" s = te.create_schedule([x.op for x in outs]) def _callback(op): - if "dense_vnni" in op.tag: - dense_vnni_schedule(cfg, s, op.output(0), outs[0]) + if "dense_amx_int8" in op.tag: + dense_amx_int8_schedule(cfg, s, op.output(0), outs[0]) traverse_inline(s, outs[0].op, _callback) return s diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index fd2b184a87d2..2cb46b8291fb 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -25,13 +25,15 @@ from ..utils import 
get_const_tuple
 from ..nn import dense_alter_layout
 from .utils import target_has_vnni
+from .utils import target_has_amx
 from .. import nn


-def check_vnni_applicable(x, y, allow_padding=False):
+def check_inst_applicable(x, y, allow_padding=False):
     mcpu = tvm.target.Target.current().mcpu
+    simd_avai = target_has_vnni(mcpu) or target_has_amx(mcpu)
     return (
-        target_has_vnni(mcpu)
+        simd_avai
         and "int8" in x.dtype
         and "int8" in y.dtype
         and (allow_padding or (y.shape[-2] % 16 == 0 and y.shape[-1] % 4 == 0))
@@ -47,7 +49,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type):
     M, K = get_const_tuple(data_tensor.shape)
     N, _ = get_const_tuple(weight_tensor.shape)

-    if check_vnni_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8":
+    if check_inst_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8":
         weight_layout = "NC16n4c"
         return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype)

@@ -87,7 +89,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type):
 def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False):
     """Legalizes s8, s8 -> s32 GEMM op for VNNI."""
     if (
-        check_vnni_applicable(arg_types[0], arg_types[1], allow_padding=True)
+        check_inst_applicable(arg_types[0], arg_types[1], allow_padding=True)
         and arg_types[0].dtype == "int8"
     ):
         x, y = inputs
diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py
index 9e91e32b20e5..3b83fecbf552 100644
--- a/python/tvm/topi/x86/tensor_intrin.py
+++ b/python/tvm/topi/x86/tensor_intrin.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 """Core kernel of dot product of 4 Int8 operations"""
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name,unused-variable
 import tvm
 from tvm import te
 import tvm.target.codegen
@@ -348,3 +348,227 @@ def _instr(index):
         binds={data: a_buffer, kernel: b_buffer},
         default_buffer_params=buffer_params,
     )
+
+
+def dot_32x128x32_u8s8s32_sapphirerapids(LDA):
+    """
+    Int8 dot product over each 16x64-element block using AMX-TMUL Sapphire Rapids
+    instructions. The tdpxxd instruction takes two tiles of uint8 and int8 data --
+    data[16][64] and kernel[1][16][16][4] -- and computes a dot product of
+    data[16][16] in int32 datatype.
+
+    (Physically, to efficiently leverage the tile registers, we construct a
+    2x2-tile matmul which performs 32x128x32 in total.)
+
+    The pseudo code is as follows:
+        for(k=0; k<2; k++){
+            for(n=0; n<2; n++){
+                tileload64(tmm_b, B)
+                for(m=0; m<2; m++){
+                    if(n==0)
+                        tileload64(tmm_a, A)
+                    tdpbusd(tmm_c, tmm_a, tmm_b)
+                }
+            }
+        }
+
+    Args:
+        LDA (int): the stride of the uint8 matrix A, used to determine the
+        memory strides of the macro reduction axis.
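+
+    Inside the intrinsic body, LDA is converted into two strides:
+    `_strides_A = LDA` (A is uint8, so element offsets equal byte offsets)
+    and `_strides_B_tile = LDA / 128` (packed-B tile groups per 128-element
+    reduction step).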
+ + Returns + ------- + intrin : TensorIntrin + The Sapphire Rapids AMX-TMUL int8 tdpbusd TensorIntrin that can be used in tensorizing + schedule + """ + A = te.placeholder((32, 128), name="A", dtype="uint8") + B = te.placeholder((2, 32, 16, 4), name="B", dtype="int8") + k = te.reduce_axis((0, 128), name="k") + + C = te.compute( + (32, 32), + lambda i, j: te.sum( + A[i, k].astype("int32") + * B[tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(k, 4), j % 16, k % 4].astype("int32"), + axis=k, + ), + name="C", + ) + + BA = tvm.tir.decl_buffer( + A.shape, A.dtype, offset_factor=1, strides=[te.var("ldw"), 1], name="BA" + ) + BB = tvm.tir.decl_buffer( + B.shape, + B.dtype, + offset_factor=1, + strides=[te.var("ldw"), te.var("ldw"), te.var("ldw"), 1], + name="BB", + ) + BC = tvm.tir.decl_buffer( + C.shape, C.dtype, offset_factor=1, strides=[te.var("ldw"), 1], name="BC", scope="amx.tmm" + ) + + def intrin_func(ins, outs): # pylint: disable=unused-variable + bufA = ins[0] + bufB = ins[1] + bufC = outs[0] + + assert LDA + _strides_A = tvm.tir.const(LDA, dtype="uint64") + _strides_B_tile = tvm.tir.const(LDA / 128, dtype="uint64") + + def init(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tilezero", + tvm.tir.const(1, "uint8"), + tvm.tir.const(0, dtype="uint8"), + ) + ) # tile C 0 + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tilezero", + tvm.tir.const(1, "uint8"), + tvm.tir.const(1, dtype="uint8"), + ) + ) # tile C 1 + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tilezero", + tvm.tir.const(1, "uint8"), + tvm.tir.const(2, dtype="uint8"), + ) + ) # tile C 2 + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tilezero", + tvm.tir.const(1, "uint8"), + tvm.tir.const(3, dtype="uint8"), + ) + ) # tile C 3 + + return ib.get() + + def body(): # load A, load B, dpbusd, store C + ib = tvm.tir.ir_builder.create() + + for k_tile in range(2): # reduced data blocks + for n_acc in range(2): # broadcast data blocks + tmm_B_ = tvm.tir.const(n_acc + 6, dtype="uint8") + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tileloaddt164", # load B: tmm6, tmm7 + tvm.tir.const(3, "uint8"), + tmm_B_, + bufB.access_ptr( + "r", offset=64 * 16 * (n_acc * 2 * _strides_B_tile + k_tile) + ), + tvm.tir.const(64, dtype="uint64"), + ) + ) + + for m_acc in range(2): # loaded data blocks + tmm_A_ = tvm.tir.const(m_acc + 4, dtype="uint8") + if n_acc == 0: + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tileloaddt164", # load A: , tmm4, tmm5 + tvm.tir.const(3, "uint8"), + tmm_A_, + bufA.access_ptr( + "r", offset=m_acc * 16 * _strides_A + k_tile * 64 + ), + _strides_A, + ) + ) + + tmm_C_ = tvm.tir.const(m_acc * 2 + n_acc, dtype="uint8") + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tdpbusd", + tvm.tir.const(3, "uint8"), + tmm_C_, + tmm_A_, + tmm_B_, + ) + ) # tdpxxd + + return ib.get() + + # body, reset, store + return ( + body(), + init(), + body(), + ) + + return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) + + +def acc_32x32_int32_sapphirerapids(LDC): + """ + Store the accumulated tile register in scope amx.tmm to global memory. + (tmm0, tmm1, tmm2, tmm3 --> global 4 tiles) + + Args: + LDC (int): the stride of the matrix C, which is int32 type and use it to + determine memory strides. 
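+
+        Since C holds 4-byte int32 elements, the byte stride handed to
+        tilestored64 is `_strides_C = 4 * LDC`; each 16x16 sub-tile is stored
+        at element offset `n_acc * 16 + m_acc * 16 * _strides_C / 4`.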
+ + Returns + ------- + intrin : TensorIntrin + The Sapphirerapids AMX-TMUL int8 tilestored64 TensorIntrin that can be used + in tensorizing schedule + """ + A = te.placeholder((32, 32), name="A", dtype="int32") + bufA = tvm.tir.decl_buffer( + A.shape, + A.dtype, + scope="amx.tmm", + name="a_buffer", + offset_factor=1, + strides=[te.var("ldw"), 1], + ) + + C = te.compute((32, 32), lambda i, j: A[i, j], name="C") + bufC = tvm.tir.decl_buffer( + C.shape, + C.dtype, + scope="global", + name="c_buffer", + offset_factor=1, + strides=[te.var("ldw"), 1], + ) + + assert LDC + _strides_C = tvm.tir.const(4 * LDC, dtype="uint64") + + def intrin_func(ins, outs): # pylint: disable=unused-variable + ib = tvm.tir.ir_builder.create() + bufA = ins[0] + bufC = outs[0] + for n_acc in range(2): # broadcast data blocks + for m_acc in range(2): # loaded data blocks + ib.emit( + tvm.tir.call_llvm_intrin( + "int32", + "llvm.x86.tilestored64", + tvm.tir.const(3, "uint8"), + tvm.tir.const(m_acc * 2 + n_acc, dtype="uint8"), + bufC.access_ptr("w", offset=n_acc * 16 + m_acc * 16 * _strides_C / 4), + _strides_C, + ) + ) + + return ib.get() + + return te.decl_tensor_intrin(C.op, intrin_func, binds={A: bufA, C: bufC}) diff --git a/python/tvm/topi/x86/utils.py b/python/tvm/topi/x86/utils.py index c364027022da..efe5913269a1 100644 --- a/python/tvm/topi/x86/utils.py +++ b/python/tvm/topi/x86/utils.py @@ -123,6 +123,13 @@ def target_has_vnni(target): } +@tvm._ffi.register_func("tvm.topi.x86.utils.target_has_amx") +def target_has_amx(target): + return target in { + "sapphirerapids", + } + + @tvm._ffi.register_func("tvm.topi.x86.utils.get_simd_32bit_lanes") def get_simd_32bit_lanes(): mcpu = tvm.target.Target.current().mcpu diff --git a/src/runtime/contrib/amx/amx_config.cc b/src/runtime/contrib/amx/amx_config.cc new file mode 100644 index 000000000000..2e034bd478b5 --- /dev/null +++ b/src/runtime/contrib/amx/amx_config.cc @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+/*
+ * \file src/runtime/contrib/amx/amx_config.cc
+ * \brief extraction of AMX configuration on x86 platforms
+ */
+#include <tvm/runtime/logging.h>
+#include <tvm/runtime/registry.h>
+
+namespace tvm {
+namespace runtime {
+
+#ifdef __linux__
+#include <errno.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#define XFEATURE_XTILECFG 17
+#define XFEATURE_XTILEDATA 18
+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+
+typedef struct __tile_config {
+  uint8_t palette_id;
+  uint8_t start_row;
+  uint8_t reserved_0[14];
+  uint16_t colsb[8]; /* Column size of each tmm register in bytes */
+  uint16_t reserved_1[8];
+  uint8_t rows[8]; /* Row count of each tmm register */
+  uint8_t reserved_2[8];
+} __tilecfg;
+
+typedef union __union_tile_config {
+  __tilecfg s;
+  uint8_t a[64];
+} __tilecfg_u;
+
+void init_tile_config(__tilecfg_u* dst, uint16_t cols, uint8_t rows) {
+  dst->s.palette_id = 1;
+  dst->s.start_row = 0;
+
+  for (int i = 0; i < 14; i++) dst->s.reserved_0[i] = 0;
+
+  for (int i = 0; i < 8; i++) {
+    dst->s.colsb[i] = cols;
+    dst->s.rows[i] = rows;
+    dst->s.reserved_1[i] = 0;
+    dst->s.reserved_2[i] = 0;
+  }
+
+  _tile_loadconfig(dst->a);
+}
+
+TVM_REGISTER_GLOBAL("runtime.amx_tileconfig").set_body([](TVMArgs args, TVMRetValue* rv) {
+  int rows = args[0];
+  int cols = args[1];
+  LOG(INFO) << "rows: " << rows << ", cols:" << cols;
+  // -----------Config for the AMX tile register----------------------
+  __tilecfg_u cfg;
+  init_tile_config(&cfg, cols, rows);
+
+  *rv = 1;
+  return;
+});
+
+// register a global packed function in C++, to init the system for AMX config
+TVM_REGISTER_GLOBAL("runtime.amx_init").set_body([](TVMArgs args, TVMRetValue* rv) {
+  // -----------Detect and request AMX control----------------------
+  uint64_t bitmask = 0;
+  int64_t status = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
+  if (0 != status) {
+    *rv = 0;
+    LOG(FATAL) << "errno:" << errno << ", " << strerror(errno);
+    LOG(FATAL) << "status[0]: " << status << ", bitmask: " << bitmask
+               << ", XFEATURE_XTILEDATA setup failed, TMUL feature is not allowed.";
+    return;
+  }
+  if (bitmask & XFEATURE_MASK_XTILEDATA) {
+    *rv = 1;
+    return;
+  }  // XFEATURE_XTILEDATA permission already granted
+
+  status = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+  // if XFEATURE_XTILEDATA setup fails, TMUL usage is not allowed
+  if (0 != status) {
+    *rv = 0;
+    LOG(FATAL) << "errno:" << errno << ", " << strerror(errno);
+    LOG(FATAL) << "status[1]: " << status << ", bitmask: " << bitmask
+               << ", XFEATURE_XTILEDATA setup failed, TMUL usage is not allowed.";
+    return;
+  }
+
+  status = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
+  // if XFEATURE_XTILEDATA setup failed, we can't use TMUL
+  if (0 != status || !(bitmask & XFEATURE_MASK_XTILEDATA)) {
+    *rv = 0;
+    LOG(FATAL) << "errno:" << errno << ", " << strerror(errno);
+    LOG(FATAL) << "status[2]: " << status << ", bitmask: " << bitmask
+               << ", XFEATURE_XTILEDATA setup failed, can't use TMUL.";
+    return;
+  }
+
+  // XFEATURE_XTILEDATA set successfully, TMUL usage is allowed
+  *rv = 1;
+  return;
+});
+
+#endif
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h
index 83477312dcc5..51dba038b6ac 100644
--- a/src/runtime/thread_storage_scope.h
+++ b/src/runtime/thread_storage_scope.h
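The hunk below registers the new `amx.tmm` storage scope alongside `texture`.
As a rough, self-contained sketch of the prefix-match idiom being extended
(hypothetical names, not the actual TVM `StorageScope::Create` API):

#include <string>

struct ScopeSketch {
  int rank;         // 7 = kTexture, 8 = the new kAMXTMM (see the hunk below)
  std::string tag;  // whatever follows the 7-character prefix, e.g. ""
};

inline ScopeSketch ParseScopeSketch(const std::string& s) {
  // Both "texture" and "amx.tmm" happen to be 7 characters long,
  // hence compare(0, 7, ...) followed by substr(7) for the tag.
  if (s.compare(0, 7, "texture") == 0) return {7, s.substr(7)};
  if (s.compare(0, 7, "amx.tmm") == 0) return {8, s.substr(7)};
  return {-1, ""};  // the real code hits LOG(FATAL) "unknown storage scope"
}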
@@ -62,6 +62,8 @@ enum class StorageRank {
   kWMMAAccumulator = 6,
   /*! \brief global scope texture memory */
   kTexture = 7,
+  /*! \brief global scope amx tmm memory */
+  kAMXTMM = 8,
 };

 /*!
@@ -149,6 +151,9 @@ struct StorageScope {
     } else if (s.compare(0, 7, "texture") == 0) {
       r.rank = StorageRank::kTexture;
       r.tag = s.substr(7, std::string::npos);
+    } else if (s.compare(0, 7, "amx.tmm") == 0) {
+      r.rank = StorageRank::kAMXTMM;
+      r.tag = s.substr(7, std::string::npos);
     } else {
       LOG(FATAL) << "unknown storage scope " << s;
     }
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 8d1332bee406..2c5a2e7a5a39 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -151,6 +151,10 @@
 #define TVM_INFO_USE_MKL "NOT-FOUND"
 #endif

+#ifndef TVM_INFO_USE_AMX
+#define TVM_INFO_USE_AMX "NOT-FOUND"
+#endif
+
 #ifndef TVM_INFO_USE_DNNL
 #define TVM_INFO_USE_DNNL "NOT-FOUND"
 #endif
@@ -274,6 +278,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
     {"USE_CUDNN", TVM_INFO_USE_CUDNN},
     {"USE_CUSTOM_LOGGING", TVM_INFO_USE_CUSTOM_LOGGING},
     {"USE_CUTLASS", TVM_INFO_USE_CUTLASS},
+    {"USE_AMX", TVM_INFO_USE_AMX},
     {"USE_DNNL", TVM_INFO_USE_DNNL},
     {"USE_ETHOSN", TVM_INFO_USE_ETHOSN},
     {"USE_FALLBACK_STL_MAP", TVM_INFO_USE_FALLBACK_STL_MAP},
diff --git a/tests/python/contrib/test_amx.py b/tests/python/contrib/test_amx.py
new file mode 100644
index 000000000000..30da7e56fb8d
--- /dev/null
+++ b/tests/python/contrib/test_amx.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
+
+import tvm
+from tvm import relay
+
+from tvm import te
+import tvm.testing
+from tvm.topi.x86.tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids
+from tvm.topi.x86.tensor_intrin import acc_32x32_int32_sapphirerapids
+import numpy as np
+import pytest
+
+
+@tvm.testing.requires_llvm
+@pytest.mark.skip("skip due to AMX feature not available yet")
+def test_amx_u8s8s32_matmul_tensorize():
+    m = 1024
+    k = 1024
+    n = 1024
+
+    # --------------------------Config---------------------------
+    # Skip this test if "-mcpu=sapphirerapids" is not supported by LLVM (version < 12.0)
+    target = "llvm -mcpu=sapphirerapids"
+    dev = tvm.device(target, 0)
+    if not tvm.testing.device_enabled(target):
+        print("skip because %s is not enabled..." % target)
+        return
+
+    amx_init = tvm.get_global_func("runtime.amx_init")
+    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
+    assert amx_init()
+    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
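+    # An AMX tile holds at most 16 rows x 64 bytes, so (16, 64) requests
+    # full-size tiles: one A tile covers 16x64 uint8 values and one B tile
+    # 16x16x4 int8 values, which is why the macro kernel below reduces k in
+    # chunks of 128 = 2 x 64 elements.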
+    # --------------------------Compute--------------------------
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    ak = te.reduce_axis((0, k), name="k")
+    packedW = te.placeholder((n // 16, k // 4, 16, 4), name="packedW", dtype="int8")
+
+    C = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4), j % 16, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="F",
+    )
+
+    # --------------------------Schedule--------------------------
+    s = te.create_schedule(C.op)
+    a_x, a_y = C.op.axis
+    (a_k,) = C.op.reduce_axis
+
+    CF = s.cache_write(C, "amx.tmm")
+    a_xo, a_xi = s[C].split(a_x, factor=32)
+    a_yo, a_yi = s[C].split(a_y, factor=32)
+    s[C].reorder(a_xo, a_yo, a_xi, a_yi)
+
+    s[CF].compute_at(s[C], a_yo)
+    (a_k_f,) = CF.op.reduce_axis
+    a_x_f, a_y_f = CF.op.axis
+
+    a_xo_f, a_xi_f = s[CF].split(a_x_f, factor=32)
+    a_yo_f, a_yi_f = s[CF].split(a_y_f, factor=32)
+    a_ko_f, a_ki_f = s[CF].split(a_k_f, factor=128)
+    s[CF].reorder(a_ko_f, a_xo_f, a_yo_f, a_ki_f, a_xi_f, a_yi_f)
+
+    s[CF].tensorize(a_ki_f, dot_32x128x32_u8s8s32_sapphirerapids(LDA=k))
+    s[C].tensorize(a_xi, acc_32x32_int32_sapphirerapids(LDC=n))
+
+    lib = tvm.build(s, [X, packedW, C], target, name="intrinsic")
+    asm = lib.get_source("asm")
+    assert "tilezero" in asm
+    assert "tileloaddt1" in asm
+    assert "tdpbusd" in asm
+    assert "tilestored" in asm
+
+    # ----------------------- verify correctness --------------------------------
+    # generate the plain data
+    a = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
+    b = np.random.uniform(1, 10, size=(n, k)).astype("int8")
+    packW = np.random.uniform(1, 10, size=(n // 16, k // 4, 16, 4)).astype("int8")
+
+    # This packing should occur in the pre_pack (constant folding) stage,
+    # converting plain data to blocked data (NC16n4c)
+    for i_n in range(n):
+        for i_k in range(k):
+            packW[i_n // 16][i_k // 4][i_n % 16][i_k % 4] = b[i_n][i_k]
+
+    x = tvm.nd.array(a, dev)
+    w = tvm.nd.array(packW, dev)
+    y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev)
+    t_evaluator = lib.time_evaluator(lib.entry_name, dev, number=100)
+    result = t_evaluator(x, w, y)
+    print(result)
+    tvm.testing.assert_allclose(y.numpy(), np.dot(a.astype("int32"), b.T.astype("int32")), rtol=0)
+
+
+@tvm.testing.requires_llvm
+@pytest.mark.skip("skip due to AMX feature not available yet")
+def test_amx_check_support():
+    amx_init = tvm.get_global_func("runtime.amx_init")
+    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
+    assert amx_init()
+    assert amx_tileconfig(16, 64)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
index bd4e1b72c3cd..9f31acfa6d7f 100644
--- a/tests/python/relay/test_op_level1.py
+++ b/tests/python/relay/test_op_level1.py
@@ -799,6 +799,53 @@ def test_dense_vnni(m, n, k):
     np.testing.assert_equal(out, ref)


+@tvm.testing.requires_llvm
+@pytest.mark.skip("skip due to AMX feature not available yet")
+def test_dense_amx_int8():
+    data_shape = (32, 128)
+    weight_shape = (32, 128)
+
+    amx_init = tvm.get_global_func("runtime.amx_init")
+    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
+    assert amx_init()
+    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
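+    # The dense alter-op pass repacks the (N, K) int8 weight into NC16n4c,
+    # i.e. shape (N // 16, K // 4, 16, 4), roughly:
+    #     packed[n // 16][k // 4][n % 16][k % 4] = weight[n][k]
+    # (see the explicit packing loop in tests/python/contrib/test_amx.py).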
+ + for data_dtype in ["uint8", "int8"]: + data = relay.var("data", shape=data_shape, dtype=data_dtype) + weight = relay.var("weight", shape=weight_shape, dtype="int8") + bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32") + dense = relay.nn.dense(data, weight, out_dtype="int32") + out = relay.nn.bias_add(dense, bias) + mod = tvm.IRModule.from_expr(out) + + target = "llvm -mcpu=sapphirerapids" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + + asm = lib.lib.get_source("asm") + assert "tilezero" in asm + assert "tileloaddt1" in asm + assert "tdpbusd" in asm + assert "tilestored" in asm + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) + b = np.random.uniform(1, 10, size=weight_shape).astype("int8") + c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32") + + runtime.set_input("data", a) + runtime.set_input("weight", b) + runtime.set_input("bias", c) + runtime.run() + + out = runtime.get_output(0).numpy() + ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c + + np.testing.assert_equal(out, ref) + + @pytest.mark.skip("Requires GFX10 AMDGPU") def test_dense_rocm_sdot4(): data_shape = (32, 96) From dd2525f95da717d9357752c48558adf34da1c655 Mon Sep 17 00:00:00 2001 From: lightzhan <1126207509@qq.com> Date: Thu, 5 Jan 2023 21:03:40 +0800 Subject: [PATCH 119/286] [BugFix][TVMScript] Fix the roundtripability of intrinsic pow (#13692) * Fix the roundtripability of pow intrinsic. * fix the lint. * Fix the lint. * add tir.pow to make it consistent. Co-authored-by: lightzhan-intellif --- python/tvm/script/ir_builder/tir/ir.py | 4 ++-- python/tvm/tir/__init__.py | 2 +- python/tvm/tir/op.py | 22 +++++++++++++++++++ .../unittest/test_tvmscript_roundtrip.py | 9 ++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 842e21378fd1..ac1e990a96e2 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -1532,7 +1532,7 @@ def wrapped(*args, **kwargs): nearbyint = _op_wrapper(_tir_op.nearbyint) nextafter = _op_wrapper(_tir_op.nextafter) popcount = _op_wrapper(_tir_op.popcount) -power = _op_wrapper(_tir_op.power) +pow = _op_wrapper(_tir_op.pow) # pylint: disable=redefined-builtin q_multiply_shift = _op_wrapper(_tir_op.q_multiply_shift) q_multiply_shift_per_axis = _op_wrapper(_tir_op.q_multiply_shift_per_axis) ret = _op_wrapper(_tir_op.ret) @@ -1713,7 +1713,7 @@ def f(): "nearbyint", "nextafter", "popcount", - "power", + "pow", "q_multiply_shift", "q_multiply_shift_per_axis", "ret", diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index a2e341d82354..9522181432f2 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -69,7 +69,7 @@ from .op import cos, cosh, acos, acosh from .op import tan, tanh, atan, atan2, atanh from .op import erf, sigmoid, sqrt, rsqrt, floor, ceil, hypot -from .op import trunc, abs, round, nextafter, nearbyint, power, popcount, fmod, if_then_else +from .op import trunc, abs, round, nextafter, nearbyint, power, pow, popcount, fmod, if_then_else from .op import likely, isnan, isnullptr, isfinite, isinf, copysign from .op import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, ceildiv from .op import comm_reducer, min, max, sum diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index 
e1adc0a6bbd7..131e91de876e 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -2236,6 +2236,28 @@ def power(x, y, span=None): return _ffi_api._OpPow(convert(x), convert(y), span) # type: ignore +def pow(x, y, span=None): + """x power y + + Parameters + ---------- + x : PrimExpr + Input argument. + + y : PrimExpr + The exponent + + span : Optional[Span] + The location of this operator in the source code. + + Returns + ------- + z : PrimExpr + The result. + """ + return _ffi_api._OpPow(convert(x), convert(y), span) # type: ignore + + def popcount(x): """Count the number of set bits in input x. diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index c0174a0671c0..0e9be0463943 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3550,6 +3550,14 @@ def func(A: T.Buffer[128, "float32"], C: T.Buffer[128, "float32"]): return mod["main"] +def intrinsic_pow(): + @T.prim_func + def func(): + T.pow(T.float32(1), T.float32(1)) + + return func + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3607,6 +3615,7 @@ def func(A: T.Buffer[128, "float32"], C: T.Buffer[128, "float32"]): elif_chain_with_else, *nested_boolean_expressions(), multi_env_threads, + intrinsic_pow, ) From 3db59460a5d077c0a7664521a9fbf48087466ff3 Mon Sep 17 00:00:00 2001 From: zhaojinxi Date: Thu, 5 Jan 2023 21:14:17 +0800 Subject: [PATCH 120/286] [BugFix][Runtime] Fix Incorrect node information (#13693) * [BugFix][Runtime] Fix Incorrect node information * 1 * 1 --- python/tvm/contrib/debugger/debug_result.py | 24 ++++++++++------- .../unittest/test_runtime_graph_debug.py | 26 ++++++++++++++++++- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 006edd345802..8a1089f843cd 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -73,21 +73,25 @@ def _update_graph_json(self): """update the nodes_list with name, shape and data type, for temporarily storing the output. 
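         Entry ids advance by each node's num_outputs (the eid counter in the
         hunk below), so shape and dtype lookups stay aligned for multi-output
         ops.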
""" - nodes_len = len(self._nodes_list) - for i in range(nodes_len): - node = self._nodes_list[i] + eid = 0 + for node in self._nodes_list: input_list = [] - for input_node in node["inputs"]: - input_list.append(self._nodes_list[input_node[0]]["name"]) - node["inputs"] = input_list - dtype = str("type: " + self._dtype_list[1][i]) - if "attrs" not in node: + if node["op"] == "null": node["attrs"] = {} node["op"] = "param" - else: + num_outputs = 1 + elif node["op"] == "tvm_op": + for input_node in node["inputs"]: + input_list.append(self._nodes_list[input_node[0]]["name"]) node["op"] = node["attrs"]["func_name"] + num_outputs = int(node["attrs"]["num_outputs"]) + else: + raise ValueError("") + node["inputs"] = input_list + dtype = str("type: " + self._dtype_list[1][eid]) node["attrs"].update({"T": dtype}) - node["shape"] = self._shapes_list[1][i] + node["shape"] = self._shapes_list[1][eid] + eid += num_outputs def _cleanup_tensors(self): """Remove the tensor dump file (graph wont be removed)""" diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index bc0e96f50b45..9111ed38db33 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -29,7 +29,7 @@ from tvm._ffi.base import TVMError from tvm.contrib import utils from tvm.contrib.debugger import debug_executor - +from tvm import relay # Constants for creating simple graphs, fixtures to avoid free globals @pytest.fixture @@ -275,5 +275,29 @@ def test_run_single_node(graph, n, A, myadd): mod.run_individual_node(2) +@tvm.testing.requires_llvm +def test_multiple_output(): + x = relay.var("x", shape=(1, 3, 48, 16), dtype="float32") + t = relay.split(x, [12, 16, 32], 2).astuple() + x0 = relay.TupleGetItem(t, 0) + x1 = relay.TupleGetItem(t, 1) + x2 = relay.TupleGetItem(t, 2) + x3 = relay.TupleGetItem(t, 3) + p0 = relay.const(np.random.uniform(-1, 1, (3, 3, 1, 1)).astype("float32")) + y = relay.nn.conv2d(x2, p0, kernel_size=(1, 1), kernel_layout="OIHW", out_dtype="float32") + x3 + + func = relay.Function([x], relay.Tuple([x0, x1, y])) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + target = tvm.target.Target("llvm") + device = tvm.cpu() + lib = relay.build(mod, target=target) + m = debug_executor.GraphModuleDebug( + lib["debug_create"]("default", device), [device], lib.get_graph_json(), None + ) + nodes = m.debug_datum.get_graph_nodes() + assert nodes[2]["shape"] == [3, 3, 1, 1] + + if __name__ == "__main__": tvm.testing.main() From 50d2154f1468fb8019f0916dd85c60cbad040b22 Mon Sep 17 00:00:00 2001 From: Alan MacDonald Date: Thu, 5 Jan 2023 11:01:49 -0800 Subject: [PATCH 121/286] [microTVM] Build standalone_crt with cmake instead of makefile (#13600) Build standalone_crt with cmake instead of makefile to allow for better portability of microTVM code to other build environments. fixes #13533 --- cmake/modules/StandaloneCrt.cmake | 254 +++++++++++++++++------------- include/tvm/runtime/crt/crt.h | 0 2 files changed, 141 insertions(+), 113 deletions(-) mode change 100755 => 100644 include/tvm/runtime/crt/crt.h diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 8c25cf48df27..306e4af13c0c 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -23,6 +23,9 @@ if(MSVC) # When building for Windows, use standard CMake for compatibility with # Visual Studio build tools and not require Make to be on the system. 
+ # TODO: test building with this MSVC conditional code removed + # when USE_MICRO is enabled + set(CRT_CONFIG, "src/runtime/micro/crt_config.h") add_library(host_standalone_crt @@ -40,129 +43,154 @@ if(MSVC) else() - message(STATUS "Build standalone CRT for microTVM") + function(create_crt_library CRT_LIBRARY) - function(tvm_crt_define_targets) - # Build an isolated build directory, separate from the TVM tree. - list(APPEND CRT_FILE_COPY_JOBS - "3rdparty/libcrc/include *.h -> include" - "3rdparty/libcrc/src crcccitt.c -> src/runtime/crt/microtvm_rpc_common" - "3rdparty/libcrc/tab gentab_ccitt.inc -> src/runtime/crt/tab" - "3rdparty/dlpack/include *.h -> include" - "3rdparty/dmlc-core/include *.h -> include" - "include/tvm/runtime c_*_api.h -> include/tvm/runtime" - "include/tvm/runtime metadata_types.h -> include/tvm/runtime" - "include/tvm/runtime/crt *.h -> include/tvm/runtime/crt" - "src/runtime/crt Makefile -> ." - "src/runtime/crt/include *.h -> include" - "src/runtime/crt/aot_executor *.c -> src/runtime/crt/aot_executor" - "src/runtime/crt/aot_executor_module *.c -> src/runtime/crt/aot_executor_module" - "src/runtime/crt/common *.c -> src/runtime/crt/common" - "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" - "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" - "src/runtime/crt/host *.cc -> template/host" - "src/runtime/crt/host *.py -> template/host" - "src/runtime/crt/host Makefile.template -> template/host" - "src/runtime/crt/memory *.c -> src/runtime/crt/memory" - "src/runtime/crt/microtvm_rpc_common *.cc -> src/runtime/crt/microtvm_rpc_common" - "src/runtime/crt/microtvm_rpc_server *.cc -> src/runtime/crt/microtvm_rpc_server" - "src/runtime/minrpc *.h -> src/runtime/minrpc" - "src/support generic_arena.h -> src/support" - "src/support ssize.h -> src/support" - "src/runtime/crt crt_config-template.h -> template" - ) - - set(standalone_crt_base "${CMAKE_CURRENT_BINARY_DIR}/standalone_crt") - - foreach(job_spec IN LISTS CRT_FILE_COPY_JOBS) - string(REPLACE " " ";" job_spec "${job_spec}") - list(LENGTH job_spec job_spec_length) - math(EXPR job_spec_length_mod "${job_spec_length} % 3") - if(NOT "${job_spec_length_mod}" EQUAL 1) - message(FATAL_ERROR "CRT copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}") - endif() - math(EXPR job_spec_stop "${job_spec_length} - 3") - - list(GET job_spec 0 job_src_base) - set(job_src_base "${CMAKE_CURRENT_SOURCE_DIR}/${job_src_base}") - foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) - list(GET job_spec ${copy_pattern_index} copy_pattern) - math(EXPR copy_dest_index "${copy_pattern_index} + 2") - list(GET job_spec ${copy_dest_index} copy_dest) - - tvm_file_glob(GLOB_RECURSE copy_files - RELATIVE "${job_src_base}" - "${job_src_base}/${copy_pattern}") - list(LENGTH copy_files copy_files_length) - if("${copy_files_length}" EQUAL 0) - message(FATAL_ERROR "CRT copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}") - endif() - foreach(copy_src IN LISTS copy_files) - get_filename_component(dest_path "${standalone_crt_base}/${copy_dest}/${copy_src}" ABSOLUTE) - tvm_micro_add_copy_file(host_isolated_build_deps ${job_src_base}/${copy_src} ${dest_path}) - endforeach() - endforeach() - endforeach() + set(CRT_LIBRARY_NAME host_standalone_crt_${CRT_LIBRARY}) + set(CRT_LIBRARY_SOURCES "") - add_custom_target(standalone_crt DEPENDS ${host_isolated_build_deps}) + foreach(FILE_NAME IN LISTS ARGN) + list(APPEND CRT_LIBRARY_SOURCES ${FILE_NAME}) + 
endforeach() - get_filename_component(host_build_dir_abspath "${CMAKE_CURRENT_BINARY_DIR}/host_standalone_crt" ABSOLUTE) + add_library(${CRT_LIBRARY_NAME} + STATIC + ${CRT_LIBRARY_SOURCES}) - if(${VERBOSE}) - set(make_quiet QUIET=) - else(${VERBOSE}) - set(make_quiet ) - endif(${VERBOSE}) + # add this library to the list of CRT libraries + set(CRT_LIBRARIES ${CRT_LIBRARIES} ${CRT_LIBRARY_NAME} PARENT_SCOPE) - list(APPEND crt_libraries memory graph_executor microtvm_rpc_server microtvm_rpc_common common) # NOTE: listed in link order. - foreach(crt_lib_name IN LISTS crt_libraries) - list(APPEND crt_library_paths "host_standalone_crt/lib${crt_lib_name}.a") - endforeach() + target_include_directories(${CRT_LIBRARY_NAME} + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/micro/ + ${STANDALONE_CRT_BASE}/include) - set(make_common_args - "CRT_CONFIG=${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/micro/crt_config.h" - "BUILD_DIR=${host_build_dir_abspath}" - "EXTRA_CFLAGS=-fPIC" - "EXTRA_CXXFLAGS=-fPIC" - "EXTRA_LDFLAGS=-fPIC" - "${make_quiet}") - - add_custom_command( - OUTPUT ${crt_library_paths} - COMMAND make ARGS ${make_common_args} clean - COMMAND make ARGS ${make_common_args} all - WORKING_DIRECTORY "${standalone_crt_base}" - DEPENDS standalone_crt ${host_isolated_build_deps}) - - add_custom_target(host_standalone_crt DEPENDS ${crt_library_paths}) - - foreach(crt_lib IN LISTS crt_libraries) - set(cmake_crt_lib_name host_standalone_crt_${crt_lib}) - list(APPEND cmake_crt_libraries ${cmake_crt_lib_name}) - add_library(${cmake_crt_lib_name} STATIC IMPORTED GLOBAL) - set(cmake_crt_lib_path "${CMAKE_CURRENT_BINARY_DIR}/host_standalone_crt/lib${crt_lib}.a") - add_dependencies(${cmake_crt_lib_name} host_standalone_crt "${cmake_crt_lib_path}") - set_target_properties(${cmake_crt_lib_name} PROPERTIES - IMPORTED_LOCATION "${cmake_crt_lib_path}" - IMPORTED_OBJECTS "${cmake_crt_lib_path}" - PUBLIC_HEADER "${crt_headers}") - endforeach() + set_target_properties(${CRT_LIBRARY_NAME} + PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/host_standalone_crt + POSITION_INDEPENDENT_CODE ON) - # Create the `crttest` target if we can find GTest. If not, we create dummy - # targets that give the user an informative error message. - if(GTEST_FOUND) - tvm_file_glob(GLOB TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tests/crt/*.cc) - add_executable(crttest ${TEST_SRCS}) - target_include_directories(crttest SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/micro) - target_link_libraries(crttest PRIVATE ${cmake_crt_libraries} GTest::GTest GTest::Main pthread dl) - set_target_properties(crttest PROPERTIES EXCLUDE_FROM_ALL 1) - set_target_properties(crttest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) - gtest_discover_tests(crttest) - endif() + # make these libraries dependent on standalone_crt which depends on host_isolated_build_deps to avoid + # race with the file copy jobs + add_dependencies(${CRT_LIBRARY_NAME} standalone_crt) endfunction() - tvm_crt_define_targets() + message(STATUS "Build standalone CRT for microTVM") + + # Build an isolated build directory, separate from the TVM tree. 
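+  # Each job spec below is "<src base> <glob> -> <dest>", optionally followed
+  # by further "<glob> -> <dest>" pairs; the parsing loop below tokenizes on
+  # spaces and checks that the token count is 1 modulo 3 (base + triples).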
+ list(APPEND CRT_FILE_COPY_JOBS + "3rdparty/libcrc/include *.h -> include" + "3rdparty/libcrc/src crcccitt.c -> src/runtime/crt/microtvm_rpc_common" + "3rdparty/libcrc/tab gentab_ccitt.inc -> src/runtime/crt/tab" + "3rdparty/dlpack/include *.h -> include" + "3rdparty/dmlc-core/include *.h -> include" + "include/tvm/runtime c_*_api.h -> include/tvm/runtime" + "include/tvm/runtime metadata_types.h -> include/tvm/runtime" + "include/tvm/runtime/crt *.h -> include/tvm/runtime/crt" + "src/runtime/crt Makefile -> ." + "src/runtime/crt/include *.h -> include" + "src/runtime/crt/aot_executor *.c -> src/runtime/crt/aot_executor" + "src/runtime/crt/aot_executor_module *.c -> src/runtime/crt/aot_executor_module" + "src/runtime/crt/common *.c -> src/runtime/crt/common" + "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" + "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" + "src/runtime/crt/host *.cc -> template/host" + "src/runtime/crt/host *.py -> template/host" + "src/runtime/crt/host Makefile.template -> template/host" + "src/runtime/crt/memory *.c -> src/runtime/crt/memory" + "src/runtime/crt/microtvm_rpc_common *.cc -> src/runtime/crt/microtvm_rpc_common" + "src/runtime/crt/microtvm_rpc_server *.cc -> src/runtime/crt/microtvm_rpc_server" + "src/runtime/minrpc *.h -> src/runtime/minrpc" + "src/support generic_arena.h -> src/support" + "src/support ssize.h -> src/support" + "src/runtime/crt crt_config-template.h -> template" + ) + + set(STANDALONE_CRT_BASE ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt) + + foreach(job_spec IN LISTS CRT_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message(FATAL_ERROR "CRT copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}") + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_CURRENT_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + tvm_file_glob(GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message(FATAL_ERROR "CRT copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}") + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component(dest_path "${STANDALONE_CRT_BASE}/${copy_dest}/${copy_src}" ABSOLUTE) + tvm_micro_add_copy_file(host_isolated_build_deps ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(standalone_crt DEPENDS ${host_isolated_build_deps}) + + set(CRT_LIBRARIES "") + set(RUNTIME_CRT_SOURCE_DIR ${STANDALONE_CRT_BASE}/src/runtime/crt) + + # these create_crt_library() targets are in link order and the common library needs to be last + create_crt_library(aot_executor + ${RUNTIME_CRT_SOURCE_DIR}/aot_executor/aot_executor.c) + + create_crt_library(aot_executor_module + ${RUNTIME_CRT_SOURCE_DIR}/aot_executor_module/aot_executor_module.c) + + create_crt_library(graph_executor + ${RUNTIME_CRT_SOURCE_DIR}/graph_executor/graph_executor.c + ${RUNTIME_CRT_SOURCE_DIR}/graph_executor/load_json.c) + + 
create_crt_library(graph_executor_module + ${RUNTIME_CRT_SOURCE_DIR}/graph_executor_module/graph_executor_module.c) + + create_crt_library(memory + ${RUNTIME_CRT_SOURCE_DIR}/memory/page_allocator.c + ${RUNTIME_CRT_SOURCE_DIR}/memory/stack_allocator.c) + + create_crt_library(microtvm_rpc_common + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_common/crcccitt.c + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_common/frame_buffer.cc + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_common/framing.cc + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_common/session.cc + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_common/write_stream.cc) + + create_crt_library(microtvm_rpc_server + ${RUNTIME_CRT_SOURCE_DIR}/microtvm_rpc_server/rpc_server.cc) + + create_crt_library(common + ${RUNTIME_CRT_SOURCE_DIR}/common/crt_backend_api.c + ${RUNTIME_CRT_SOURCE_DIR}/common/crt_runtime_api.c + ${RUNTIME_CRT_SOURCE_DIR}/common/func_registry.c + ${RUNTIME_CRT_SOURCE_DIR}/common/ndarray.c + ${RUNTIME_CRT_SOURCE_DIR}/common/packed_func.c) + + add_custom_target(host_standalone_crt DEPENDS ${CRT_LIBRARIES} standalone_crt) + + # Create the `crttest` target if we can find GTest. If not, we create dummy + # targets that give the user an informative error message. + if(GTEST_FOUND) + tvm_file_glob(GLOB TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tests/crt/*.cc) + add_executable(crttest ${TEST_SRCS}) + target_include_directories(crttest SYSTEM PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/micro) + target_link_libraries(crttest PRIVATE ${CRT_LIBRARIES} GTest::GTest GTest::Main pthread dl) + set_target_properties(crttest PROPERTIES EXCLUDE_FROM_ALL 1) + set_target_properties(crttest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) + gtest_discover_tests(crttest) + endif() set(TVM_CRT_LINKER_LIB host_standalone_crt_microtvm_rpc_common) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") diff --git a/include/tvm/runtime/crt/crt.h b/include/tvm/runtime/crt/crt.h old mode 100755 new mode 100644 From f2bbb7e06bfb03e957ab46d95542cf53fe750f3e Mon Sep 17 00:00:00 2001 From: Alan MacDonald Date: Thu, 5 Jan 2023 19:40:58 -0800 Subject: [PATCH 122/286] [microTVM] Fix MacOS build with USE_MICRO=ON (#13711) fix MacOS build with USE_MICRO=ON --- cmake/modules/StandaloneCrt.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 306e4af13c0c..2ca37f53d9f5 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -196,7 +196,7 @@ else() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") list(APPEND TVM_RUNTIME_LINKER_LIBS -Wl,--whole-archive ${TVM_CRT_LINKER_LIB} -Wl,--no-whole-archive) elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang") - list(APPEND TVM_RUNTIME_LINKER_LIBS -Wl,-force_load $) + list(APPEND TVM_RUNTIME_LINKER_LIBS -Wl,-force_load ${TVM_CRT_LINKER_LIB}) else() list(APPEND TVM_RUNTIME_LINKER_LIBS ${TVM_CRT_LINKER_LIB}) endif() From 391b6591cad1468b396101da45d6ff987f23101b Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 6 Jan 2023 14:03:08 -0800 Subject: [PATCH 123/286] [tir] Add line level debug info (#13012) * TIR debug info * Fix location emission * Comments 1/N (docs, cleanups) * Remove leaky macro usage * Add unit test * Remove dead code * Add accuracy test Co-authored-by: driazati --- .gitignore | 6 + include/tvm/tir/transform.h | 7 + python/tvm/tir/transform/transform.py | 12 ++ src/driver/driver_api.cc | 8 ++ src/ir/transform.cc | 1 + src/printer/text_printer.h 
| 40 +++--- src/printer/tir_text_printer.cc | 53 ++++---- src/printer/tir_text_printer_debug.cc | 97 ++++++++++++++ src/printer/tir_text_printer_debug.h | 70 ++++++++++ src/target/llvm/codegen_cpu.cc | 84 +++++++----- src/target/llvm/codegen_cpu.h | 2 + src/target/llvm/codegen_llvm.cc | 55 ++++++-- src/target/llvm/codegen_llvm.h | 10 +- src/tir/transforms/install_debug_spans.cc | 150 ++++++++++++++++++++++ src/tir/transforms/install_debug_spans.h | 132 +++++++++++++++++++ tests/python/tir/test_debug_info.py | 124 ++++++++++++++++++ 16 files changed, 762 insertions(+), 89 deletions(-) create mode 100644 src/printer/tir_text_printer_debug.cc create mode 100644 src/printer/tir_text_printer_debug.h create mode 100644 src/tir/transforms/install_debug_spans.cc create mode 100644 src/tir/transforms/install_debug_spans.h create mode 100644 tests/python/tir/test_debug_info.py diff --git a/.gitignore b/.gitignore index 03c0a0bc6af9..851552d95976 100644 --- a/.gitignore +++ b/.gitignore @@ -271,3 +271,9 @@ gallery/how_to/work_with_microtvm/micro_tvmc.py # Used in CI to communicate between Python and Jenkins .docker-image-names/ + +# Printed TIR code on disk +*.tir + +# GDB history file +.gdb_history diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 48372565469b..829594d61b98 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -496,6 +496,13 @@ TVM_DLL Pass LowerAsyncDMA(); */ TVM_DLL Pass CommonSubexprElimTIR(bool enable_cse_tir = true, bool identify_equiv_terms = false); +/*! + * \brief Add TIR-printer output as debug information to all ops in the module + * \return The pass. + */ + +TVM_DLL Pass InstallDebugSpans(); + /*! * \brief Unify all the thread bindings for "blockIdx.x/y/z", "threadIdx.x/y/z", and * "vthread.x/y/z". Before the unification, two vars that are bound to a thread axis (e.g., diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index c0aa371b4d3d..82b162ef7df0 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -1039,3 +1039,15 @@ def InstrumentProfileIntrinsics(): The result pass """ return _ffi_api.InstrumentProfileIntrinsics() # type: ignore + + +def InstallDebugSpans(): + """Add line information from the TIR printer as spans on each statement and + expression. 
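+
+    Illustrative usage, via the ``tir.enable_debug`` PassContext option that
+    this patch adds to the driver::
+
+        with tvm.transform.PassContext(config={"tir.enable_debug": True}):
+            lib = tvm.build(mod, target="llvm")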
+ + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.InstallDebugSpans() # type: ignore diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 92769d1cef45..288ac7b92a2c 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -45,6 +45,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_bound_checkers", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_assert", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_vectorize", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_cse_tir", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.enable_debug", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.enable_equiv_terms_in_cse_tir", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); @@ -603,6 +604,9 @@ TVM_REGISTER_GLOBAL("driver.mixed_mod_passes") }); transform::Sequential HostModulePassManager(IRModule mixed_mod, Target target_host) { + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool enable_debug = pass_ctx->GetConfig("tir.enable_debug", Bool(false)).value(); + Array host_pass_list; runtime::TypedPackedFunc fcond = [](const tir::PrimFunc& f) { @@ -621,6 +625,10 @@ transform::Sequential HostModulePassManager(IRModule mixed_mod, Target target_ho host_pass_list.push_back(tir::transform::LowerDeviceStorageAccessInfo()); host_pass_list.push_back(tir::transform::CombineContextCall()); + if (enable_debug) { + host_pass_list.push_back(tir::transform::InstallDebugSpans()); + } + return transform::Sequential(host_pass_list); } diff --git a/src/ir/transform.cc b/src/ir/transform.cc index 77ea942a0bb9..e0f08d28fb18 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -440,6 +440,7 @@ Pass GetPass(const String& pass_name) { // ordering problem needs to be handled in the future. IRModule SequentialNode::operator()(IRModule mod, const PassContext& pass_ctx) const { for (const Pass& pass : passes) { + VLOG(0) << "Running pass " << pass->Info()->name; ICHECK(pass.defined()) << "Found undefined pass for optimization."; const PassInfo& pass_info = pass->Info(); if (!pass_ctx.PassEnabled(pass_info)) { diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 2dc0997f82ec..afc76112879e 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -280,6 +280,9 @@ class TIRTextPrinter : public StmtFunctor, explicit TIRTextPrinter(bool show_meta, TextMetaDataContext* meta) : show_meta_(show_meta), meta_(meta), meta_collector_(meta) {} + /*! \brief Output a newline */ + virtual Doc NewLine(); + /*! \brief Print the node */ Doc Print(const ObjectRef& node); @@ -290,24 +293,7 @@ class TIRTextPrinter : public StmtFunctor, */ bool GetVarName(::tvm::tir::Var v, std::string* s); - private: - /*! \brief whether show meta data */ - bool show_meta_; - /*! \brief meta data context */ - TextMetaDataContext* meta_; - /*! \brief meta collector */ - MetaCollector meta_collector_; - /*! \brief Map from Var to Doc */ - std::unordered_map memo_var_; - /*! \brief Map from Buffer to Doc */ - std::unordered_map memo_buf_; - /*! \brief Map from Buffer to Doc */ - std::unordered_map memo_producer_; - /*! 
-  /*! \brief name allocation map */
-  std::unordered_map<std::string, int> name_alloc_map_;
-
-  friend class tvm::TextPrinter;
-
+ protected:
   Doc VisitExpr_(const IntImmNode* op) override;
   Doc VisitExpr_(const FloatImmNode* op) override;
   Doc VisitExpr_(const StringImmNode* op) override;
@@ -363,6 +349,24 @@ class TIRTextPrinter : public StmtFunctor<Doc(const Stmt&)>,
   Doc VisitStmt_(const BlockRealizeNode* op) override;
   Doc VisitStmtDefault_(const Object* op) override;

+ private:
+  /*! \brief whether show meta data */
+  bool show_meta_;
+  /*! \brief meta data context */
+  TextMetaDataContext* meta_;
+  /*! \brief meta collector */
+  MetaCollector meta_collector_;
+  /*! \brief Map from Var to Doc */
+  std::unordered_map<tir::Var, Doc, ObjectPtrHash, ObjectPtrEqual> memo_var_;
+  /*! \brief Map from Buffer to Doc */
+  std::unordered_map<tir::Buffer, Doc, ObjectPtrHash, ObjectPtrEqual> memo_buf_;
+  /*! \brief Map from Buffer to Doc */
+  std::unordered_map<tir::DataProducer, Doc, ObjectPtrHash, ObjectPtrEqual> memo_producer_;
+  /*! \brief name allocation map */
+  std::unordered_map<std::string, int> name_alloc_map_;
+
+  friend class tvm::TextPrinter;
+
   Doc VisitType_(const PrimTypeNode* node) override;
   Doc VisitType_(const PointerTypeNode* node) override;
   Doc VisitType_(const TupleTypeNode* node) override;
diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc
index fc3f49d76fae..4d74cc6d5a48 100644
--- a/src/printer/tir_text_printer.cc
+++ b/src/printer/tir_text_printer.cc
@@ -124,7 +124,7 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) {
     for (const auto& it : op->attrs->dict) {
       attr_docs.push_back(Doc::StrLiteral(it.first) << ": " << Print(it.second));
     }
-    attr_doc << Doc::NewLine() << "attr = {" << PrintSep(attr_docs, Doc::Text(", ")) << "}";
+    attr_doc << NewLine() << "attr = {" << PrintSep(attr_docs, Doc::Text(", ")) << "}";
     doc << Doc::Indent(2, attr_doc);
   }

@@ -136,8 +136,8 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) {
       const Buffer buf = op->buffer_map[v];
       buffer_docs.push_back(BufferNode2Doc(buf.get(), Print(buf)));
     }
-    buffer_doc << Doc::NewLine() << "buffers = {";
-    buffer_doc << PrintSep(buffer_docs, Doc::Indent(11, Doc::Text(",") << Doc::NewLine()));
+    buffer_doc << NewLine() << "buffers = {";
+    buffer_doc << PrintSep(buffer_docs, Doc::Indent(11, Doc::Text(",") << NewLine()));
     doc << Doc::Indent(2, buffer_doc) << "}";
   }

@@ -149,26 +149,28 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) {
       buffer_map_doc.push_back(Print(v) << ": " << Print(buf));
     }
     doc << Doc::Indent(
-        2, Doc::NewLine() << "buffer_map = {" << PrintSep(buffer_map_doc, Doc::Text(", ")) << "}");
+        2, NewLine() << "buffer_map = {" << PrintSep(buffer_map_doc, Doc::Text(", ")) << "}");
   }

   doc << PrintBody(op->body);
   return doc;
 }

+Doc TIRTextPrinter::NewLine() { return Doc::NewLine(); }
+
 Doc TIRTextPrinter::PrintIRModule(const IRModule& module) {
   const auto* op = module.operator->();
   Doc doc;

   Doc body;
-  body << Doc::NewLine();
+  body << NewLine();
   std::vector<Doc> functions;
   for (auto it = op->functions.begin(); it != op->functions.end(); ++it) {
     if ((*it).second.as<PrimFuncNode>()) {
       functions.push_back(Print((*it).second));
     }
   }
-  body << TIRTextPrinter::PrintSep(functions, Doc::NewLine() << Doc::NewLine());
+  body << TIRTextPrinter::PrintSep(functions, NewLine() << NewLine());
   doc << Doc::Indent(0, body);
   return doc;
 }
@@ -451,7 +453,7 @@ Doc TIRTextPrinter::VisitExpr_(const ReduceNode* op) {
 Doc TIRTextPrinter::VisitStmt_(const LetStmtNode* op) {
   Doc doc;
-  doc << "let " << Print(op->var) << " = " << Print(op->value) << Doc::NewLine() << Print(op->body);
+  doc << "let " << Print(op->var) << " = " << Print(op->value) << NewLine() << Print(op->body);
   return doc;
 }

@@
-463,14 +465,14 @@ Doc TIRTextPrinter::VisitStmt_(const AttrStmtNode* op) { if (op->body->IsInstance()) { doc << PrintBody(op->body); } else { - doc << ";" << Doc::NewLine() << Print(op->body); + doc << ";" << NewLine() << Print(op->body); } return doc; } Doc TIRTextPrinter::VisitStmt_(const AssertStmtNode* op) { Doc doc; - doc << "assert(" << Print(op->condition) << ", " << Print(op->message) << ")" << Doc::NewLine() + doc << "assert(" << Print(op->condition) << ", " << Print(op->message) << ")" << NewLine() << Print(op->body); return doc; } @@ -529,7 +531,7 @@ Doc TIRTextPrinter::VisitStmt_(const AllocateNode* op) { if (op->body->IsInstance()) { doc << PrintBody(op->body); } else { - doc << ";" << Doc::NewLine() << Print(op->body); + doc << ";" << NewLine() << Print(op->body); } return doc; } @@ -542,7 +544,7 @@ Doc TIRTextPrinter::VisitStmt_(const AllocateConstNode* op) { if (op->body->IsInstance()) { doc << PrintBody(op->body); } else { - doc << ";" << Doc::NewLine() << Print(op->body); + doc << ";" << NewLine() << Print(op->body); } return doc; } @@ -550,11 +552,11 @@ Doc TIRTextPrinter::VisitStmt_(const AllocateConstNode* op) { Doc TIRTextPrinter::VisitStmt_(const DeclBufferNode* op) { Doc doc; doc << AllocBuf(op->buffer) << " = decl_buffer(" << Print(op->buffer->data) << ", " - << PrintDType(op->buffer->dtype) << ", " << Print(op->buffer->shape) << ")" << Doc::NewLine(); + << PrintDType(op->buffer->dtype) << ", " << Print(op->buffer->shape) << ")" << NewLine(); if (op->body->IsInstance()) { doc << PrintBody(op->body); } else { - doc << ";" << Doc::NewLine() << Print(op->body); + doc << ";" << NewLine() << Print(op->body); } return doc; } @@ -572,9 +574,9 @@ Doc TIRTextPrinter::VisitStmt_(const SeqStmtNode* op) { std::vector stmts; Doc seq_doc, doc; for (Stmt stmt : op->seq) { - seq_doc << Doc::NewLine() << Print(stmt); + seq_doc << NewLine() << Print(stmt); } - doc << " {" << Doc::Indent(2, seq_doc) << Doc::NewLine() << "}"; + doc << " {" << Doc::Indent(2, seq_doc) << NewLine() << "}"; return doc; } @@ -657,37 +659,36 @@ Doc TIRTextPrinter::VisitStmt_(const BlockRealizeNode* op) { Doc block_attr_doc; // print predicate, binding, read/write tensor region, annotations if (!is_one(op->predicate)) { - block_attr_doc << Doc::NewLine() << "where(" << Print(op->predicate) << ")"; + block_attr_doc << NewLine() << "where(" << Print(op->predicate) << ")"; } for (size_t i = 0; i < block_op->iter_vars.size(); ++i) - block_attr_doc << Doc::NewLine() << "bind(" << Print(block_op->iter_vars[i]->var) << ", " + block_attr_doc << NewLine() << "bind(" << Print(block_op->iter_vars[i]->var) << ", " << Print(op->iter_values[i]) << ")"; - block_attr_doc << Doc::NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; - block_attr_doc << Doc::NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; + block_attr_doc << NewLine() << "tir.reads(" << Print(block_op->reads) << ")"; + block_attr_doc << NewLine() << "tir.writes(" << Print(block_op->writes) << ")"; if (!block_op->annotations.empty()) { std::vector attr_docs; for (const auto& it : block_op->annotations) { attr_docs.push_back(Doc::StrLiteral(it.first) << ": " << Print(it.second)); } - block_attr_doc << Doc::NewLine() << "tir.attrs({" << PrintSep(attr_docs, Doc::Text(", ")) - << "})"; + block_attr_doc << NewLine() << "tir.attrs({" << PrintSep(attr_docs, Doc::Text(", ")) << "})"; } // print body Doc body; - body << Doc::NewLine(); + body << NewLine(); for (const auto& alloc_buf : block_op->alloc_buffers) { body << AllocBuf(alloc_buf) << " = 
alloc_buffer(" << PrintDType(alloc_buf->dtype)
-         << Print(alloc_buf->shape) << ")" << Doc::NewLine();
+         << Print(alloc_buf->shape) << ")" << NewLine();
   }
   for (const auto& match_buf : block_op->match_buffers) {
     body << AllocBuf(match_buf->buffer) << " = match_buffer(" << Print(match_buf->source) << ")"
-         << Doc::NewLine();
+         << NewLine();
   }
   if (block_op->init.defined()) {
     Doc init_block;
     init_block << "with init()";
     init_block << PrintBody(block_op->init.value());
-    body << init_block << Doc::NewLine();
+    body << init_block << NewLine();
   }
   body << Print(block_op->body);
   doc << Doc::Indent(2, block_attr_doc << body);
@@ -826,7 +827,7 @@ Doc TIRTextPrinter::PrintSep(const std::vector<Doc>& vec, const Doc& sep) {
 Doc TIRTextPrinter::PrintBody(const Stmt& body, bool indent) {
   Doc doc;
   if (body->IsInstance<SeqStmtNode>()) return Print(body);
-  doc << " {" << Doc::Indent(2, Doc::NewLine() << Print(body)) << Doc::NewLine() << "}";
+  doc << " {" << Doc::Indent(2, NewLine() << Print(body)) << NewLine() << "}";
   return doc;
 }

diff --git a/src/printer/tir_text_printer_debug.cc b/src/printer/tir_text_printer_debug.cc
new file mode 100644
index 000000000000..6c29558f722c
--- /dev/null
+++ b/src/printer/tir_text_printer_debug.cc
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir_text_printer_debug.cc
+ * \brief Printer to print out the IR text format
+ *        that can be parsed by a parser.
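+ *
+ * When spans are enabled, every output line is suffixed with the spans of the
+ * nodes printed on it, e.g. (a hand-written illustration, not real output):
+ *
+ *   B_1[vi] = (A_1[vi] + 1f) [main.tir:4:8, missing, ]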
+ */
+
+#include "tir_text_printer_debug.h"
+
+#include <optional>
+#include <string>
+
+namespace tvm {
+namespace tir {
+
+std::optional<std::string> span_text(const Span& span) {
+  if (!span.defined()) {
+    return std::nullopt;
+  }
+
+  std::string source("main.tir");
+  if (span->source_name.defined() && span->source_name->name.get()) {
+    source = span->source_name->name;
+  }
+  return source + ":" + std::to_string(span->line) + ":" + std::to_string(span->column);
+}
+
+template <typename T>
+void add_all_relevant_lines(const std::vector<std::tuple<const T*, size_t>>& data,
+                            size_t current_line, Doc* output) {
+  ICHECK(output) << "output must be a valid Doc";
+  for (const auto& item : data) {
+    if (std::get<1>(item) != current_line - 1) {
+      // Item is not relevant for this line, skip it
+      continue;
+    }
+
+    // Print out the item's span info if present
+    auto text = span_text(std::get<0>(item)->span);
+    if (text.has_value()) {
+      *output << *text;
+    } else {
+      *output << "missing";
+    }
+    *output << ", ";
+  }
+}
+
+Doc TIRTextPrinterDebug::NewLine() {
+  current_line_ += 1;
+
+  if (!show_spans_) {
+    return TIRTextPrinter::NewLine();
+  }
+
+  Doc output;
+
+  output << " [";
+
+  add_all_relevant_lines(exprs_by_line_, current_line_, &output);
+  add_all_relevant_lines(stmts_by_line_, current_line_, &output);
+
+  output << "]" << TIRTextPrinter::NewLine();
+
+  return output;
+}
+
+Doc TIRTextPrinterDebug::VisitStmt(const tvm::tir::Stmt& n) {
+  stmts_by_line_.push_back(std::make_tuple(n.get(), current_line_));
+  return TIRTextPrinter::VisitStmt(n);
+}
+
+Doc TIRTextPrinterDebug::VisitExpr(const PrimExpr& e) {
+  exprs_by_line_.push_back(std::make_tuple(e.get(), current_line_));
+  return TIRTextPrinter::VisitExpr(e);
+}
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/printer/tir_text_printer_debug.h b/src/printer/tir_text_printer_debug.h
new file mode 100644
index 000000000000..d0046034cfbf
--- /dev/null
+++ b/src/printer/tir_text_printer_debug.h
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tir_text_printer_debug.h
+ * \brief A TIR text printer that records the output line on which each
+ *        statement and expression is printed.
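+ *
+ * A rough sketch of how this printer is consumed (illustrative; see
+ * src/tir/transforms/install_debug_spans.cc for the real driver):
+ *
+ *   TIRTextPrinterDebug printer(false);          // span suffixes disabled
+ *   std::string text = printer.Print(stmt).str();
+ *   auto stmts = printer.GetStmtsByLine();       // (node, output line) records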
+ */
+
+#ifndef TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_
+#define TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_
+
+#include <tuple>
+#include <vector>
+
+#include "text_printer.h"
+
+namespace tvm {
+namespace tir {
+
+class TIRTextPrinterDebug : public TIRTextPrinter {
+ public:
+  explicit TIRTextPrinterDebug(bool show_spans)
+      : TIRTextPrinter(false, &meta_), current_line_(1), show_spans_(show_spans) {}
+
+  std::vector<std::tuple<const PrimExprNode*, size_t>> GetExprsByLine() const {
+    return exprs_by_line_;
+  }
+
+  std::vector<std::tuple<const StmtNode*, size_t>> GetStmtsByLine() const { return stmts_by_line_; }
+
+ private:
+  Doc NewLine() override;
+
+  Doc VisitStmt(const tvm::tir::Stmt& n) override;
+  Doc VisitExpr(const PrimExpr& e) override;
+
+  TextMetaDataContext meta_;
+
+  // Line that the printer is currently printing
+  size_t current_line_;
+
+  // Whether to include spans relevant to each line before a newline or not
+  bool show_spans_;
+
+  // Record of all stmts and exprs and their corresponding line
+  std::vector<std::tuple<const StmtNode*, size_t>> stmts_by_line_;
+  std::vector<std::tuple<const PrimExprNode*, size_t>> exprs_by_line_;
+};
+
+}  // namespace tir
+}  // namespace tvm
+
+#endif  // TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index facb49660078..21d2c6ebe0a5 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -183,57 +183,63 @@ void CodeGenCPU::Init(const std::string& module_name, LLVMTarget* llvm_target, bool system_lib, bool dynamic_lookup, bool target_c_runtime) {
   InitGlobalContext(dynamic_lookup);
 }

-void CodeGenCPU::AddFunction(const PrimFunc& f) {
-  CodeGenLLVM::AddFunction(f);
-  if (f_tvm_register_system_symbol_ != nullptr) {
-    auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol);
-    ICHECK(global_symbol.defined())
-        << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute";
-    export_system_symbols_.emplace_back(
-        std::make_pair(global_symbol.value().operator std::string(), function_));
-  }
-  AddDebugInformation(f, function_);
-}
-
-// Following Glow |DebugInfo::generateFunctionDebugInfo|, https://git.io/fjadv
-void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) {
+llvm::DISubprogram* CodeGenCPU::CreateDebugFunction(const PrimFunc& f) {
 #if TVM_LLVM_VERSION >= 50
-  ICHECK(!f_llvm->getSubprogram());
   llvm::SmallVector<llvm::Metadata*, 4> paramTys;
-  // Functions in TIR can only return void or an int.
-  ICHECK(f_llvm->getReturnType() == t_void_ || f_llvm->getReturnType() == t_int_)
-      << "Unexpected return type";
-  auto ret_type_tir = f_llvm->getReturnType() == t_int_ ?
DataType::Int(32) : DataType::Void(); - llvm::DIType* returnTy = - GetDebugType(GetTypeFromRuntimeDataType(ret_type_tir), f_llvm->getReturnType()); - paramTys.push_back(returnTy); - for (size_t i = 0; i < f_llvm->arg_size(); ++i) { - paramTys.push_back( - GetDebugType(GetType(f_tir->params[i]), f_llvm->getFunctionType()->getParamType(i))); + + paramTys.push_back(GetDebugType(f->ret_type)); + for (const auto& param : f->params) { + paramTys.push_back(GetDebugType(GetType(param))); } + auto* DIFunctionTy = dbg_info_->di_builder_->createSubroutineType( dbg_info_->di_builder_->getOrCreateTypeArray(paramTys)); - bool local_to_unit = llvm::GlobalValue::isLocalLinkage(f_llvm->getLinkage()); + bool local_to_unit = llvm::GlobalVariable::isLocalLinkage(llvm::GlobalValue::InternalLinkage); + // TODO(driazati): determine the IRModule name instead of hardcoding 'main.tir' #if TVM_LLVM_VERSION >= 80 - auto SPFlags = - llvm::DISubprogram::toSPFlags(local_to_unit, /*IsDefinition=*/true, /*IsOptimized=*/true); + auto SPFlags = llvm::DISubprogram::toSPFlags(local_to_unit, /*IsDefinition=*/true, + /*IsOptimized=*/true); auto* DIFunction = dbg_info_->di_builder_->createFunction( - /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", + /*Scope=*/dbg_info_->file_, /*Name=*/"main.tir", /*LinkageName=*/"", /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, /*ScopeLine=*/0, /*Flags=*/llvm::DINode::FlagZero, /*SPFlags=*/SPFlags); #else auto* DIFunction = dbg_info_->di_builder_->createFunction( - /*Scope=*/dbg_info_->file_, /*Name=*/f_llvm->getName(), /*LinkageName=*/"", + /*Scope=*/dbg_info_->file_, /*Name=*/"main.tir", /*LinkageName=*/"", /*File=*/dbg_info_->file_, /*LineNo=*/0, /*Ty=*/DIFunctionTy, /*isLocalToUnit=*/local_to_unit, /*isDefinition=*/true, /*ScopeLine=*/0, /*Flags=*/llvm::DINode::FlagPrototyped, /*isOptimized=*/true); #endif + return DIFunction; +#else + return nullptr; +#endif +} + +void CodeGenCPU::AddFunction(const PrimFunc& f) { +#if TVM_LLVM_VERSION >= 50 + di_subprogram_ = CreateDebugFunction(f); +#endif + EmitDebugLocation(f->span); + CodeGenLLVM::AddFunction(f); + if (f_tvm_register_system_symbol_ != nullptr) { + auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); + ICHECK(global_symbol.defined()) + << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; + export_system_symbols_.emplace_back( + std::make_pair(global_symbol.value().operator std::string(), function_)); + } + AddDebugInformation(f, function_); +} - ICHECK(DIFunction); - f_llvm->setSubprogram(DIFunction); - ICHECK_EQ(f_llvm->getSubprogram(), DIFunction); +// Following Glow |DebugInfo::generateFunctionDebugInfo|, https://git.io/fjadv +void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { +#if TVM_LLVM_VERSION >= 50 + ICHECK(di_subprogram_); + f_llvm->setSubprogram(di_subprogram_); + ICHECK_EQ(f_llvm->getSubprogram(), di_subprogram_); IRBuilder builder(&f_llvm->getEntryBlock()); if (!f_llvm->getEntryBlock().empty()) { @@ -246,11 +252,11 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { auto* paramAlloca = builder.CreateAlloca(f_llvm->getFunctionType()->getParamType(i)); std::string paramName = "arg" + std::to_string(i + 1); auto param = dbg_info_->di_builder_->createParameterVariable( - DIFunction, paramName, i + 1, dbg_info_->file_, 0, + di_subprogram_, paramName, i + 1, dbg_info_->file_, 0, GetDebugType(GetType(f_tir->params[i]), f_llvm->getFunctionType()->getParamType(i)), /*alwaysPreserve=*/true); auto* store = 
builder.CreateStore(f_llvm->arg_begin() + i, paramAlloca); - auto* di_loc = llvm::DILocation::get(*ctx, 0, 0, DIFunction); + auto* di_loc = llvm::DILocation::get(*ctx, 0, 0, di_subprogram_); dbg_info_->di_builder_->insertDeclare(paramAlloca, param, dbg_info_->di_builder_->createExpression(), llvm::DebugLoc(di_loc), store); @@ -260,6 +266,7 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { if (!scope) { return; } + for (auto& BB : *f_llvm) { for (auto& I : BB) { if (I.getDebugLoc()) { @@ -272,6 +279,9 @@ void CodeGenCPU::AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm) { #endif } +llvm::DIType* CodeGenCPU::GetDebugType(const Type& ty_tir) { + return GetDebugType(ty_tir, GetLLVMType(ty_tir)); +} llvm::DIType* CodeGenCPU::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm) { if (ty_llvm == t_void_) { return nullptr; @@ -541,6 +551,7 @@ llvm::BasicBlock* CodeGenCPU::CheckCallSuccess(llvm::Value* retcode) { } void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { + EmitDebugLocation(op); /*! \brief maintain states that should be guarded when step into compute scope */ struct ComputeScopeStates { explicit ComputeScopeStates(CodeGenCPU* parent) : parent_(parent) {} @@ -1447,6 +1458,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) { } void CodeGenCPU::VisitStmt_(const AssertStmtNode* op) { + EmitDebugLocation(op); llvm::Value* cond = MakeValue(op->condition); std::ostringstream os; os << "Assert fail: " << op->condition; @@ -1475,6 +1487,7 @@ void CodeGenCPU::VisitStmt_(const AssertStmtNode* op) { } void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { + EmitDebugLocation(op); if (op->attr_key == tir::attr::coproc_uop_scope) { const StringImmNode* value = op->value.as(); ICHECK(value != nullptr); @@ -1517,6 +1530,7 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { } void CodeGenCPU::VisitStmt_(const ForNode* op) { + EmitDebugLocation(op); ICHECK(is_zero(op->min)); if (op->kind == ForKind::kSerial || op->kind == ForKind::kUnrolled) { CodeGenLLVM::VisitStmt_(op); diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h index e0716ac8be2d..afbd49e14348 100644 --- a/src/target/llvm/codegen_cpu.h +++ b/src/target/llvm/codegen_cpu.h @@ -164,6 +164,7 @@ class CodeGenCPU : public CodeGenLLVM { // if not directly finalize function and pass on return code. // return the end block after the check llvm::BasicBlock* CheckCallSuccess(llvm::Value* retcode); + llvm::DISubprogram* CreateDebugFunction(const PrimFunc& f); // Context for injection lookup llvm::GlobalVariable* gv_mod_ctx_{nullptr}; llvm::GlobalVariable* gv_tvm_func_call_{nullptr}; @@ -194,6 +195,7 @@ class CodeGenCPU : public CodeGenLLVM { // Get the DWARF type corresponding to the LLVM type |ty|. The current API in practice only // generates |int32|, and |int8*|. + llvm::DIType* GetDebugType(const Type& ty_tir); llvm::DIType* GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm); // Adds the DWARF debug information for |function| to |dbg_info_|. 
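+  // Assumes AddFunction() has already called CreateDebugFunction() so that
+  // |di_subprogram_| is set before the function body is emitted; the ICHECK in
+  // the .cc enforces this ordering.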
void AddDebugInformation(PrimFunc f_tir, llvm::Function* f_llvm); diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 526bcf0fb26e..2182ecfa51ce 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -298,6 +298,7 @@ void CodeGenLLVM::AddFunctionInternal(const PrimFunc& f, bool ret_void) { } #endif + EmitDebugLocation(f->span); if (ret_void) { builder_->CreateRetVoid(); } else { @@ -556,6 +557,7 @@ llvm::Type* CodeGenLLVM::GetLLVMType(const PrimExpr& expr) const { // void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index, DataType access_dtype) { + EmitDebugLocation(index->span); if (alias_var_set_.count(buffer_var) != 0) { // Mark all possibly aliased pointer as same type. llvm::MDNode* meta = md_tbaa_alias_set_; @@ -663,12 +665,13 @@ std::unique_ptr CodeGenLLVM::CreateDebugInfo(llvm::Modul debug_info->di_builder_ = llvm::make_unique(*module); #endif // TODO(tulloch): pass this information through relay::Span classes to the IRModule instance? - debug_info->file_ = debug_info->di_builder_->createFile("model.tvm", "/tmp/"); + debug_info->file_ = debug_info->di_builder_->createFile("main.tir", "."); + const int runtime_version = 0; + const bool is_optimized = false; + const char* compiler_flags = ""; debug_info->compilation_unit_ = debug_info->di_builder_->createCompileUnit( - llvm::dwarf::DW_LANG_C, debug_info->file_, "TVM", 0, "", 0, "", - llvm::DICompileUnit::DebugEmissionKind::FullDebug, - /* SplitDebugInlining */ true, - /* DebugInfoForProfiling */ true); + /*Lang=*/llvm::dwarf::DW_LANG_C, /*File=*/debug_info->file_, /*Producer=*/"TVM", is_optimized, + compiler_flags, runtime_version); return debug_info; } @@ -789,6 +792,7 @@ llvm::Value* CodeGenLLVM::CreateVecConcat(std::vector vecs) { void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Value* stride, const Var& loop_var, const Stmt& body) { + EmitDebugLocation(body->span); llvm::BasicBlock* pre_block = builder_->GetInsertBlock(); std::string loop_var_name = loop_var->name_hint; llvm::LLVMContext* ctx = llvm_target_->GetContext(); @@ -802,8 +806,8 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va loop_value->addIncoming(begin, pre_block); ICHECK(!var_map_.count(loop_var.get())); var_map_[loop_var.get()] = loop_value; - builder_->CreateCondBr(CreateLT(loop_var.dtype(), loop_value, end), for_body, for_end, - md_very_likely_branch_); + auto lt = CreateLT(loop_var.dtype(), loop_value, end); + builder_->CreateCondBr(lt, for_body, for_end, md_very_likely_branch_); builder_->SetInsertPoint(for_body); this->VisitStmt(body); var_map_.erase(loop_var.get()); @@ -916,6 +920,7 @@ llvm::Value* CodeGenLLVM::GetVarValue(const VarNode* v) const { void CodeGenLLVM::CreatePrintf(const std::string& format, llvm::ArrayRef format_args) { + EmitDebugLocation(); llvm::Function* func_printf = module_->getFunction("printf"); if (func_printf == nullptr) { llvm::FunctionType* ftype = llvm::FunctionType::get(t_int32_, true); @@ -946,6 +951,7 @@ void CodeGenLLVM::CreatePrintf(const std::string& format, } llvm::Value* CodeGenLLVM::CreateLookupReturnAddress(unsigned int level) { + EmitDebugLocation(); llvm::Value* level_val = llvm::ConstantInt::get(t_int32_, level); llvm::Function* builtin = llvm::Intrinsic::getDeclaration(module_.get(), llvm::Intrinsic::returnaddress); @@ -1755,6 +1761,7 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) { } void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) { + 
EmitDebugLocation(op); DataType value_dtype = op->value.dtype(); Var buffer_var = op->buffer->data; @@ -1781,6 +1788,7 @@ void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) { } void CodeGenLLVM::VisitStmt_(const ForNode* op) { + EmitDebugLocation(op); ICHECK(is_zero(op->min)); analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); if (op->kind == ForKind::kUnrolled) { @@ -1794,6 +1802,7 @@ void CodeGenLLVM::VisitStmt_(const ForNode* op) { } void CodeGenLLVM::VisitStmt_(const WhileNode* op) { + EmitDebugLocation(op); llvm::LLVMContext* ctx = llvm_target_->GetContext(); auto* while_cond = llvm::BasicBlock::Create(*ctx, "while_cond", function_); auto* while_body = llvm::BasicBlock::Create(*ctx, "while_body", function_); @@ -1808,6 +1817,7 @@ void CodeGenLLVM::VisitStmt_(const WhileNode* op) { } void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) { + EmitDebugLocation(op); llvm::Value* cond = MakeValue(op->condition); llvm::LLVMContext* ctx = llvm_target_->GetContext(); auto* then_block = llvm::BasicBlock::Create(*ctx, "if_then", function_); @@ -1831,6 +1841,7 @@ void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) { } void CodeGenLLVM::VisitStmt_(const AllocateConstNode* op) { + EmitDebugLocation(op); auto data = op->data.value(); auto array = NDArrayToLLVMArray(llvm_target_->GetContext(), data); std::string symbol_name = op->buffer_var->name_hint; @@ -1842,6 +1853,7 @@ void CodeGenLLVM::VisitStmt_(const AllocateConstNode* op) { } void CodeGenLLVM::VisitStmt_(const AllocateNode* op) { + EmitDebugLocation(op); ICHECK_EQ(op->extents.size(), 1) << "LLVM codegen only supports flat 1-d buffer allocation, but allocation of " << op->buffer_var->name_hint << " is " << op->extents << "-d"; @@ -1892,6 +1904,7 @@ void CodeGenLLVM::VisitStmt_(const AllocateNode* op) { } void CodeGenLLVM::VisitStmt_(const AttrStmtNode* op) { + EmitDebugLocation(op); if (op->attr_key == tir::attr::thread_extent) { IterVar iv = Downcast(op->node); if (iv->thread_tag.length() != 0) { @@ -1917,11 +1930,14 @@ void CodeGenLLVM::VisitStmt_(const AttrStmtNode* op) { } void CodeGenLLVM::VisitStmt_(const AssertStmtNode* op) { + EmitDebugLocation(op); + // auto a_cu = With cctx(analyzer_.get(), op->condition); this->VisitStmt(op->body); } void CodeGenLLVM::VisitStmt_(const LetStmtNode* op) { + EmitDebugLocation(op); const VarNode* v = op->var.get(); ICHECK(!var_map_.count(v)); if (v->dtype.is_handle()) { @@ -1941,12 +1957,35 @@ void CodeGenLLVM::VisitStmt_(const LetStmtNode* op) { } void CodeGenLLVM::VisitStmt_(const SeqStmtNode* op) { + EmitDebugLocation(op); for (Stmt stmt : op->seq) { this->VisitStmt(stmt); } } -void CodeGenLLVM::VisitStmt_(const EvaluateNode* op) { MakeValue(op->value); } +void CodeGenLLVM::VisitStmt_(const EvaluateNode* op) { + EmitDebugLocation(op); + MakeValue(op->value); +} + +void CodeGenLLVM::EmitDebugLocation(const Span& span) { +#if TVM_LLVM_VERSION >= 50 + if (di_subprogram_ == nullptr) { + // debug info is not always generated outside of CPU codegen + return; + } + if (!span.defined()) { + VLOG(0) << "Cannot emit debug location for undefined span"; + return; + } + llvm::LLVMContext* ctx = llvm_target_->GetContext(); + auto loc = llvm::DebugLoc(llvm::DILocation::get(*ctx, span->line, span->column, di_subprogram_)); + builder_->SetCurrentDebugLocation(loc); +#endif +} + +void CodeGenLLVM::EmitDebugLocation() { builder_->SetCurrentDebugLocation(nullptr); } +void CodeGenLLVM::EmitDebugLocation(const StmtNode* op) { EmitDebugLocation(op->span); } } // namespace codegen } // 
namespace tvm diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 1ae9d14dc4ad..632cfaafc51a 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -37,6 +37,7 @@ #else #include #endif +#include #include #include #include @@ -70,6 +71,7 @@ #include "../../runtime/thread_storage_scope.h" #include "../../tir/transforms/ir_utils.h" #include "codegen_params.h" +#include "llvm_instance.h" namespace llvm { class Argument; @@ -92,8 +94,6 @@ class MDBuilder; namespace tvm { namespace codegen { -class LLVMTarget; - using namespace tir; /*! @@ -523,6 +523,8 @@ class CodeGenLLVM : public ExprFunctor, ExprDeepEqual deep_equal_; // binding of let variables. Enables duplicate var defs that map to same value std::unordered_map let_binding_; + // debug info for function being compiled + llvm::DISubprogram* di_subprogram_; // Cache potential common path ops to slightly improve lookup time. // global symbol table. OpAttrMap op_attr_global_symbol_ = Op::GetAttrMap("TGlobalSymbol"); @@ -533,6 +535,10 @@ class CodeGenLLVM : public ExprFunctor, const Op& builtin_lookup_param_ = builtin::lookup_param(); const Op& builtin_tvm_call_cpacked_lowered_ = builtin::tvm_call_cpacked_lowered(); + void EmitDebugLocation(); + void EmitDebugLocation(const Span& span); + void EmitDebugLocation(const StmtNode* op); + /*! \brief Helper struct for debug infos. */ struct DebugInfo { ~DebugInfo(); // Because of the std::unique_ptr. diff --git a/src/tir/transforms/install_debug_spans.cc b/src/tir/transforms/install_debug_spans.cc new file mode 100644 index 000000000000..4daa1aafe8cc --- /dev/null +++ b/src/tir/transforms/install_debug_spans.cc @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file install_debug_spans.cc
+ * \brief Prints TIR code in memory and replaces all spans in the module with
+   the location at which each op would be printed
+ */
+
+#include "install_debug_spans.h"
+
+#include <tvm/tir/transform.h>
+
+#include <string>
+#include <utility>
+
+#include "../../printer/tir_text_printer_debug.h"
+
+namespace tvm {
+namespace tir {
+
+Stmt DebugInfoInstaller::InstallInfo(const std::string& name, const Stmt& stmt) {
+  DebugInfoInstaller installer(stmt, name + ".tir");
+  return installer.VisitStmt(stmt);
+}
+
+DebugInfoInstaller::DebugInfoInstaller(const Stmt& stmt, const std::string& filename) {
+  // Determine the line that each stmt/expr will be printed on
+  tvm::tir::TIRTextPrinterDebug printer(false);
+
+  // Fill in the stmts and exprs' line info
+  auto result = printer.Print(stmt).str();
+
+  // Create map of the stmt/expr -> its line number in the output to later
+  // create new spans for each stmt/expr
+  const auto& stmts = printer.GetStmtsByLine();
+  VLOG(0) << "Debug printer found " << stmts.size() << " stmts after printing";
+  for (const auto& line : stmts) {
+    stmt_lines_[std::get<0>(line)] = std::get<1>(line);
+  }
+
+  const auto& exprs = printer.GetExprsByLine();
+  VLOG(0) << "Debug printer found " << exprs.size() << " exprs after printing";
+  for (const auto& line : exprs) {
+    expr_lines_[std::get<0>(line)] = std::get<1>(line);
+  }
+
+  // Output the printed TIR to the specified file
+  VLOG(0) << "Outputting TIR to " << filename;
+  filename_ = std::move(filename);
+  std::ofstream out(filename_);
+  out << result;
+  out.close();
+}
+
+PrimExpr DebugInfoInstaller::VisitExpr(const PrimExpr& expr) {
+  PrimExpr result = expr;
+  result = StmtExprMutator::VisitExpr(result);
+  return result;
+}
+
+Stmt DebugInfoInstaller::VisitStmt(const Stmt& stmt) {
+  Stmt result = stmt;
+  result = StmtExprMutator::VisitStmt(result);
+  return result;
+}
+
+Span DebugInfoInstaller::MaybeSpan(const StmtNode* op) {
+  auto entry = stmt_lines_.find(op);
+  if (entry == stmt_lines_.end()) {
+    return Span();
+  } else {
+    size_t column = 0;
+    size_t line = entry->second;
+    return Span(SourceName::Get(filename_), line, line, column, column);
+  }
+}
+
+Span DebugInfoInstaller::MaybeSpan(const PrimExprNode* op) {
+  auto entry = expr_lines_.find(op);
+  if (entry == expr_lines_.end()) {
+    return Span();
+  } else {
+    size_t column = 0;
+    size_t line = entry->second;
+    return Span(SourceName::Get(filename_), line, line, column, column);
+  }
+}
+
+#define X(TypeName)                                                   \
+  PrimExpr DebugInfoInstaller::VisitExpr_(const TypeName##Node* op) { \
+    auto new_expr = StmtExprMutator::VisitExpr_(op);                  \
+    auto new_type = Downcast<TypeName>(new_expr);                     \
+    auto new_node = new_type.CopyOnWrite();                           \
+    new_node->span = MaybeSpan(op);                                   \
+    return new_type;                                                  \
+  }
+TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_EXPRS
+#undef X
+
+#define X(TypeName)                                                \
+  Stmt DebugInfoInstaller::VisitStmt_(const TypeName##Node* op) {  \
+    Stmt new_stmt = StmtExprMutator::VisitStmt_(op);               \
+    auto new_type = Downcast<TypeName>(new_stmt);                  \
+    auto new_node = new_type.CopyOnWrite();                        \
+    new_node->span = MaybeSpan(op);                                \
+    return new_type;                                               \
+  }
+TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_STMTS
+#undef X
+
+namespace transform {
+
+Pass InstallDebugSpans() {
+  auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) {
+    ICHECK(m->functions.size() == 1)
+        << "Debug info can only be added to IRModules with a single function";
+    // There is known to be only 1 function in the module at this point
+    auto entry = m->functions.begin();
+    auto name = std::get<0>(*entry)->name_hint;
+    auto* n = f.CopyOnWrite();
+
+    n->body = DebugInfoInstaller::InstallInfo(std::move(name), std::move(f->body));
+
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.InstallDebugSpans", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.InstallDebugSpans").set_body_typed(InstallDebugSpans);
+
+}  // namespace transform
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/transforms/install_debug_spans.h b/src/tir/transforms/install_debug_spans.h
new file mode 100644
index 000000000000..c71891aba5a6
--- /dev/null
+++ b/src/tir/transforms/install_debug_spans.h
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file install_debug_spans.h
+ * \brief Interface of the InstallDebugSpans pass
+ */
+
+#ifndef TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_H_
+#define TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_H_
+
+#include <tvm/ir/transform.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/stmt_functor.h>
+
+#include <string>
+#include <unordered_map>
+
+#ifndef TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_OPS_H_
+#define TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_OPS_H_
+
+#define TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_EXPRS \
+  X(Call)                                                      \
+  X(Add)                                                       \
+  X(Sub)                                                       \
+  X(Mul)                                                       \
+  X(Div)                                                       \
+  X(Mod)                                                       \
+  X(FloorDiv)                                                  \
+  X(FloorMod)                                                  \
+  X(Min)                                                       \
+  X(Max)                                                       \
+  X(EQ)                                                        \
+  X(NE)                                                        \
+  X(LT)                                                        \
+  X(LE)                                                        \
+  X(GT)                                                        \
+  X(GE)                                                        \
+  X(And)                                                       \
+  X(Or)                                                        \
+  X(Reduce)                                                    \
+  X(Cast)                                                      \
+  X(Not)                                                       \
+  X(Select)                                                    \
+  X(Ramp)                                                      \
+  X(Broadcast)                                                 \
+  X(Shuffle)                                                   \
+  X(IntImm)                                                    \
+  X(FloatImm)                                                  \
+  X(StringImm)
+
+#define TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_STMTS \
+  X(AttrStmt)                                                  \
+  X(IfThenElse)                                                \
+  X(LetStmt)                                                   \
+  X(For)                                                       \
+  X(While)                                                     \
+  X(Allocate)                                                  \
+  X(AllocateConst)                                             \
+  X(DeclBuffer)                                                \
+  X(Store)                                                     \
+  X(BufferStore)                                               \
+  X(BufferRealize)                                             \
+  X(AssertStmt)                                                \
+  X(ProducerStore)                                             \
+  X(ProducerRealize)                                           \
+  X(Prefetch)                                                  \
+  X(SeqStmt)                                                   \
+  X(Evaluate)                                                  \
+  X(BlockRealize)
+
+#endif  // TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_OPS_H_
+
+namespace tvm {
+namespace tir {
+
+/*!
+ * \brief This Pass prints out the provided 'stmt' through the TIR debug printer
+   while recording the statements and expressions printed on each line. Running
+   this pass uses the per-line information to change the Spans attached to each
+   statement and expression to the source location in the printed TIR. This pass
+   also writes to a file called '<name>.tir' so the line information used is
+   saved to disk.
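+
+   For example (a hand-written illustration): if the BufferStore of 'main' is
+   printed on line 4 of 'main.tir', its Span becomes 'main.tir:4:0' (the
+   column is always recorded as 0).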
+ */
+class DebugInfoInstaller : public StmtExprMutator {
+ public:
+  static Stmt InstallInfo(const std::string& name, const Stmt& stmt);
+
+  PrimExpr VisitExpr(const PrimExpr& expr) override;
+  Stmt VisitStmt(const Stmt& stmt) override;
+
+ protected:
+  DebugInfoInstaller(const Stmt& stmt, const std::string& filename);
+
+#define X(TypeName) PrimExpr VisitExpr_(const TypeName##Node* op) override;
+  TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_EXPRS
+#undef X
+
+#define X(TypeName) Stmt VisitStmt_(const TypeName##Node* op) override;
+  TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_SUPPORTED_STMTS
+#undef X
+
+ private:
+  std::unordered_map<const StmtNode*, size_t> stmt_lines_;
+  std::unordered_map<const PrimExprNode*, size_t> expr_lines_;
+  std::string filename_;
+
+  Span MaybeSpan(const StmtNode* op);
+  Span MaybeSpan(const PrimExprNode* op);
+};
+
+}  // namespace tir
+}  // namespace tvm
+
+#endif  // TVM_TIR_TRANSFORMS_INSTALL_DEBUG_SPANS_H_
diff --git a/tests/python/tir/test_debug_info.py b/tests/python/tir/test_debug_info.py
new file mode 100644
index 000000000000..8ecabbd51a97
--- /dev/null
+++ b/tests/python/tir/test_debug_info.py
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test line-level debug info for TIR"""
+import tvm
+import tvm.testing
+from tvm import tir
+from tvm import relay
+from tvm.script import tir as T
+
+from typing import List, Dict
+import re
+
+
+def find_di_locations(source: str) -> Dict[str, str]:
+    """
+    Parse out DILocation references in printed LLVM IR
+    """
+    result = {}
+
+    for line in source.splitlines():
+        m = re.match(r"!(\d+) = !DILocation\(line: (\d+).*", line)
+        if m:
+            debug_id, line = m.groups()
+            result[debug_id] = line
+
+    return result
+
+
+def _module():
+    @tvm.script.ir_module
+    class MyModule:
+        @T.prim_func
+        def main(a: T.handle, b: T.handle):
+            # We exchange data between functions via handles, which are similar to pointers.
+            T.func_attr({"global_symbol": "main", "tir.noalias": True})
+            # Create buffers from the handles.
+            A = T.match_buffer(a, (8,), dtype="float32")
+            B = T.match_buffer(b, (8,), dtype="float32")
+            for i in range(8):
+                # A block is an abstraction for computation.
+                with T.block("B"):
+                    # Define a spatial block iterator and bind it to value i.
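+                    # (The assert below is intentionally false so these tests
+                    # have an AssertStmt with a known debug location.)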
+                    vi = T.axis.spatial(8, i)
+                    assert 1 == 0, "Some numbers"
+                    B[vi] = A[vi] + 1.0
+
+    return MyModule
+
+
+def test_tir_debug_info():
+    """
+    Test that Spans are correctly replaced with debug spans that reference
+    the printed TIR
+    """
+
+    def find_span(m):
+        func = next(iter(m.functions.values()))
+        return func.body.block.body.span
+
+    module_before = _module()
+    span_before = find_span(module_before)
+    assert span_before is None
+
+    module_after = tir.transform.InstallDebugSpans()(module_before)
+    span_after = find_span(module_after)
+
+    # Check that the module name has been added and a line number is present
+    assert span_after.source_name.name == "main.tir"
+    assert span_after.line == 4
+
+
+def test_llvm_ir_debug_info():
+    """
+    Check that the right number of debug locations is present
+    """
+    MyModule = _module()
+    with tvm.transform.PassContext(opt_level=3, config={"tir.enable_debug": True}):
+        runtime_module = tvm.build(MyModule, target="llvm")
+
+    source = runtime_module.get_source()
+
+    locations = find_di_locations(source)
+    assert len(locations) == 34
+
+
+def test_llvm_ir_debug_accuracy():
+    """
+    Check that the debug location on an assert is correct
+    """
+    MyModule = _module()
+    with tvm.transform.PassContext(opt_level=3, config={"tir.enable_debug": True}):
+        runtime_module = tvm.build(MyModule, target="llvm")
+    source = runtime_module.get_source()
+    locations = find_di_locations(source)
+
+    # Find the 'assert' from MyModule
+    debug_dir_match = re.search(
+        r"tail call void %0\(i8\* getelementptr inbounds .* !dbg !(\d+)\n", source
+    )
+
+    # Extract out the debug directive line
+    directive_idx = debug_dir_match.groups()[0]
+
+    # Check that it matches the expected line number (in main.tir)
+    debug_line_no = int(locations[directive_idx])
+    assert debug_line_no == 42
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From ef7dabb2b4f67f0c93c793f1d16248a511fafe2d Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Fri, 6 Jan 2023 16:48:23 -0800
Subject: [PATCH 124/286] [microTVM][Zephyr] Add project files for mlperftiny
 submission (#13690)

This PR makes these changes:

1. Add source/header files for generating a Zephyr project that is
   compatible with the EEMBC runner for MLPerfTiny.
2. Adjust microtvm_api_server.py and CMakeLists.template to support the
   `mlperftiny` project type.
3. Add the EEMBC API files from https://github.com/mlcommons/tiny under
   `3rdparty/mlperftiny`.

This pull request was co-authored by @alanmacd, @mkatanbaf, @guberti and
@areusch as part of our effort to submit to MLPerfTiny.
You can find our submission results here:
https://mlcommons.org/en/inference-tiny-10/
---
 3rdparty/mlperftiny/README.md                 |   2 +
 .../mlperftiny/api/internally_implemented.cpp | 325 ++++++++++++++++++
 .../mlperftiny/api/internally_implemented.h   |  62 ++++
 .../mlperftiny/api/submitter_implemented.h    |  85 +++++
 .../template_project/CMakeLists.txt.template  |  13 +-
 .../template_project/microtvm_api_server.py   |   9 +-
 .../template_project/src/mlperftiny/README.md |  20 ++
 .../template_project/src/mlperftiny/main.cc   |  38 ++
 .../src/mlperftiny/submitter_implemented.cc   | 218 ++++++++++++
 .../src/mlperftiny/tvmruntime.cc              | 164 +++++++++
 .../src/mlperftiny/tvmruntime.h               |  62 ++++
 .../src/mlperftiny/zephyr_uart.cc             |  89 +++++
 .../src/mlperftiny/zephyr_uart.h              |  51 +++
 cmake/modules/Zephyr.cmake                    |   3 +
 14 files changed, 1139 insertions(+), 2 deletions(-)
 create mode 100644 3rdparty/mlperftiny/README.md
 create mode 100644 3rdparty/mlperftiny/api/internally_implemented.cpp
 create mode 100644 3rdparty/mlperftiny/api/internally_implemented.h
 create mode 100644 3rdparty/mlperftiny/api/submitter_implemented.h
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/README.md
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/main.cc
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/submitter_implemented.cc
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc
 create mode 100644 apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h

diff --git a/3rdparty/mlperftiny/README.md b/3rdparty/mlperftiny/README.md
new file mode 100644
index 000000000000..28938e90d744
--- /dev/null
+++ b/3rdparty/mlperftiny/README.md
@@ -0,0 +1,2 @@
+# MLPerf™ Tiny Benchmark API
+This directory includes API files used to build a microTVM project that can be tested with the EEMBC benchmark runner. The API files are captured from the [MLCommons/tiny repository](https://github.com/mlcommons/tiny).
diff --git a/3rdparty/mlperftiny/api/internally_implemented.cpp b/3rdparty/mlperftiny/api/internally_implemented.cpp
new file mode 100644
index 000000000000..4754d4e267e6
--- /dev/null
+++ b/3rdparty/mlperftiny/api/internally_implemented.cpp
@@ -0,0 +1,325 @@
+/*
+Copyright 2020 EEMBC and The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+This file is a modified version of the original EEMBC implementation of ee_lib.
+The file name has been changed and some functions removed. Malloc has been
+replaced by a fixed-size array.
+==============================================================================*/
+/// \file
+/// \brief Internally-implemented methods required to perform inference.
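+///
+/// A hand-written sketch of the host/device exchange these methods implement
+/// ('%' terminates each host command; exact framing is set by the EEMBC runner):
+///
+///   host -> dut:  name%
+///   dut -> host:  m-name-dut-[unspecified]
+///   host -> dut:  db load 2%
+///   host -> dut:  db 0a1b%
+///   dut -> host:  m-load-done
+///   host -> dut:  infer 10 2%
+///   dut -> host:  m-warmup-start-2 ... m-infer-done, then the results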
+ +#include "internally_implemented.h" + +#include +#include +#include + +#include "submitter_implemented.h" + +// Command buffer (incoming commands from host) +char volatile g_cmd_buf[EE_CMD_SIZE + 1]; +size_t volatile g_cmd_pos = 0u; + +// Generic buffer to db input. +uint8_t gp_buff[MAX_DB_INPUT_SIZE]; +size_t g_buff_size = 0u; +size_t g_buff_pos = 0u; + +/** + * Since the serial port ISR may be connected before the loop is ready, this + * flag turns off the parser until the main routine is ready. + */ +bool g_state_parser_enabled = false; + +/** + * This function assembles a command string from the UART. It should be called + * from the UART ISR for each new character received. When the parser sees the + * termination character, the user-defined th_command_ready() command is called. + * It is up to the application to then dispatch this command outside the ISR + * as soon as possible by calling ee_serial_command_parser_callback(), below. + */ +void ee_serial_callback(char c) { + if (c == EE_CMD_TERMINATOR) { + g_cmd_buf[g_cmd_pos] = (char)0; + th_command_ready(g_cmd_buf); + g_cmd_pos = 0; + } else { + g_cmd_buf[g_cmd_pos] = c; + g_cmd_pos = g_cmd_pos >= EE_CMD_SIZE ? EE_CMD_SIZE : g_cmd_pos + 1; + } +} + +/** + * This is the minimal parser required to test the monitor; profile-specific + * commands are handled by whatever profile is compiled into the firmware. + * + * The most basic commands are: + * + * name Print m-name-NAME, where NAME defines the intent of the f/w + * timestamp Generate a signal used for timestamping by the framework + */ +/*@-mustfreefresh*/ +/*@-nullpass*/ +void ee_serial_command_parser_callback(char *p_command) { + char *tok; + + if (g_state_parser_enabled != true) { + return; + } + + tok = strtok(p_command, EE_CMD_DELIMITER); + + if (strncmp(tok, EE_CMD_NAME, EE_CMD_SIZE) == 0) { + th_printf(EE_MSG_NAME, EE_DEVICE_NAME, TH_VENDOR_NAME_STRING); + } else if (strncmp(tok, EE_CMD_TIMESTAMP, EE_CMD_SIZE) == 0) { + th_timestamp(); + } else if (ee_profile_parse(tok) == EE_ARG_CLAIMED) { + } else { + th_printf(EE_ERR_CMD, tok); + } + + th_printf(EE_MSG_READY); +} + +/** + * Perform the basic setup. + */ +void ee_benchmark_initialize(void) { + th_serialport_initialize(); + th_timestamp_initialize(); + th_final_initialize(); + th_printf(EE_MSG_INIT_DONE); + // Enable the command parser here (the callback is connected) + g_state_parser_enabled = true; + // At this point, the serial monitor should be up and running, + th_printf(EE_MSG_READY); +} + +arg_claimed_t ee_profile_parse(char *command) { + char *p_next; /* strtok already primed from ee_main.c */ + + if (strncmp(command, "profile", EE_CMD_SIZE) == 0) { + th_printf("m-profile-[%s]\r\n", EE_FW_VERSION); + th_printf("m-model-[%s]\r\n", TH_MODEL_VERSION); + } else if (strncmp(command, "help", EE_CMD_SIZE) == 0) { + th_printf("%s\r\n", EE_FW_VERSION); + th_printf("\r\n"); + /* These are the three common functions for all IoTConnect f/w. */ + th_printf("help : Print this information\r\n"); + th_printf("name : Print the name of the device\r\n"); + th_printf("timestsamp : Generate a timetsamp\r\n"); + /* These are profile-specific commands. 
*/ + th_printf("db SUBCMD : Manipulate a generic byte buffer\r\n"); + th_printf(" load N : Allocate N bytes and set load counter\r\n"); + th_printf(" db HH[HH]* : Load 8-bit hex byte(s) until N bytes\r\n"); + th_printf(" print [N=16] [offset=0]\r\n"); + th_printf(" : Print N bytes at offset as hex\r\n"); + th_printf( + "infer N [W=0]: Load input, execute N inferences after W warmup " + "loops\r\n"); + th_printf("results : Return the result fp32 vector\r\n"); + } else if (ee_buffer_parse(command) == EE_ARG_CLAIMED) { + } else if (strncmp(command, "infer", EE_CMD_SIZE) == 0) { + size_t n = 1; + size_t w = 10; + int i; + + /* Check for inference iterations */ + p_next = strtok(NULL, EE_CMD_DELIMITER); + if (p_next) { + i = atoi(p_next); + if (i <= 0) { + th_printf("e-[Inference iterations must be >0]\r\n"); + return EE_ARG_CLAIMED; + } + n = (size_t)i; + /* Check for warmup iterations */ + p_next = strtok(NULL, EE_CMD_DELIMITER); + if (p_next) { + i = atoi(p_next); + if (i < 0) { + th_printf("e-[Inference warmup must be >=0]\r\n"); + return EE_ARG_CLAIMED; + } + w = (size_t)i; + } + } + + ee_infer(n, w); + } else if (strncmp(command, "results", EE_CMD_SIZE) == 0) { + th_results(); + } else { + return EE_ARG_UNCLAIMED; + } + return EE_ARG_CLAIMED; +} + +/** + * Inference without feature engineering. The inpput tensor is expected to + * have been loaded from the buffer via the th_load_tensor() function, which in + * turn was loaded from the interface via `db` commands. + * + * For testing, you can pre-load known-good data into the buffer during the + * th_final_initialize() function. + * + */ +void ee_infer(size_t n, size_t n_warmup) { + th_load_tensor(); /* if necessary */ + th_printf("m-warmup-start-%d\r\n", n_warmup); + while (n_warmup-- > 0) { + th_infer(); /* call the API inference function */ + } + th_printf("m-warmup-done\r\n"); + th_printf("m-infer-start-%d\r\n", n); + th_timestamp(); + th_pre(); + while (n-- > 0) { + th_infer(); /* call the API inference function */ + } + th_post(); + th_timestamp(); + th_printf("m-infer-done\r\n"); + th_results(); +} + +arg_claimed_t ee_buffer_parse(char *p_command) { + char *p_next; + + if (strncmp(p_command, "db", EE_CMD_SIZE) != 0) { + return EE_ARG_UNCLAIMED; + } + + p_next = strtok(NULL, EE_CMD_DELIMITER); + + if (p_next == NULL) { + th_printf("e-[Command 'db' requires a subcommand]\r\n"); + } else if (strncmp(p_next, "load", EE_CMD_SIZE) == 0) { + p_next = strtok(NULL, EE_CMD_DELIMITER); + + if (p_next == NULL) { + th_printf("e-[Command 'db load' requires the # of bytes]\r\n"); + } else { + g_buff_size = (size_t)atoi(p_next); + if (g_buff_size == 0) { + th_printf("e-[Command 'db load' must be >0 bytes]\r\n"); + } else { + g_buff_pos = 0; + if (g_buff_size > MAX_DB_INPUT_SIZE) { + th_printf("Supplied buffer size %d exceeds maximum of %d\n", + g_buff_size, MAX_DB_INPUT_SIZE); + } else { + th_printf("m-[Expecting %d bytes]\r\n", g_buff_size); + } + } + } + } else if (strncmp(p_next, "print", EE_CMD_SIZE) == 0) { + size_t i = 0; + const size_t max = 8; + for (; i < g_buff_size; ++i) { + if ((i + max) % max == 0 || i == 0) { + th_printf("m-buffer-"); + } + /* N.B. Not every `printf` supports the spacing prefix! 
*/ + th_printf("%02x", gp_buff[i]); + if (((i + 1) % max == 0) || ((i + 1) == g_buff_size)) { + th_printf("\r\n"); + } else { + th_printf("-"); + } + } + if (i % max != 0) { + th_printf("\r\n"); + } + } else { + size_t numbytes; + char test[3]; + long res; + + /* Two hexdigits per byte */ + numbytes = th_strnlen(p_next, EE_CMD_SIZE); + + if ((numbytes & 1) != 0) { + th_printf("e-[Insufficent number of hex digits]\r\n"); + return EE_ARG_CLAIMED; + } + test[2] = 0; + for (size_t i = 0; i < numbytes;) { + test[0] = p_next[i++]; + test[1] = p_next[i++]; + res = ee_hexdec(test); + if (res < 0) { + th_printf("e-[Invalid hex digit '%s']\r\n", test); + return EE_ARG_CLAIMED; + } else { + gp_buff[g_buff_pos] = (uint8_t)res; + g_buff_pos++; + if (g_buff_pos == g_buff_size) { + th_printf("m-load-done\r\n"); + /* Disregard the remainder of the digits when done. */ + return EE_ARG_CLAIMED; + } + } + } + } + return EE_ARG_CLAIMED; +} + +/** + * @brief convert a hexidecimal string to a signed long + * will not produce or process negative numbers except + * to signal error. + * + * @param hex without decoration, case insensitive. + * + * @return -1 on error, or result (max (sizeof(long)*8)-1 bits) + * + */ +long ee_hexdec(char *hex) { + char c; + long dec = 0; + long ret = 0; + + while (*hex && ret >= 0) { + c = *hex++; + if (c >= '0' && c <= '9') { + dec = c - '0'; + } else if (c >= 'a' && c <= 'f') { + dec = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + dec = c - 'A' + 10; + } else { + return -1; + } + ret = (ret << 4) + dec; + } + return ret; +} + +/** + * @brief get the buffer resulting from the last db command. Returns length 0 + * if the db command has not been used yet. + * + * @param buffer to fill with bytes from internal buffer filled by db commands. + * @param maximum number of bytes to copy into provided buffer. This is + * typically the length of the provided buffer. + * + * @return number of bytes copied from internal buffer. + * + */ +size_t ee_get_buffer(uint8_t* buffer, size_t max_len) { + int len = max_len < g_buff_pos ? max_len : g_buff_pos; + if (buffer != nullptr) { + memcpy(buffer, gp_buff, len * sizeof(uint8_t)); + } + return len; +} + +uint8_t* ee_get_buffer_pointer() { return gp_buff; } diff --git a/3rdparty/mlperftiny/api/internally_implemented.h b/3rdparty/mlperftiny/api/internally_implemented.h new file mode 100644 index 000000000000..44583173f8f8 --- /dev/null +++ b/3rdparty/mlperftiny/api/internally_implemented.h @@ -0,0 +1,62 @@ +/* +Copyright 2020 EEMBC and The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +This file is a modified version of the original EEMBC implementation of ee_lib. +The file name has been changed and some functions removed. +==============================================================================*/ + +/// \file +/// \brief Internally-implemented methods required to perform inference. 
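+///
+/// Typical device-side wiring (a sketch; uart_read_byte() stands in for the
+/// platform's UART receive hook and is not part of this API):
+///
+///   void uart_rx_isr(void) { ee_serial_callback(uart_read_byte()); }
+///
+/// th_command_ready() should then arrange for ee_serial_command_parser_callback()
+/// to run outside of interrupt context.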
+ +#include +#include + +#ifndef MLPERF_TINY_V0_1_API_INTERNALLY_IMPLEMENTED_H_ +#define MLPERF_TINY_V0_1_API_INTERNALLY_IMPLEMENTED_H_ + +#define EE_MONITOR_VERSION "2.2.0" +#define EE_FW_VERSION "ULPMark for tinyML Firmware V0.0.1" + +/* Version 1.0 of the benchmark only supports these models */ +#define EE_MODEL_VERSION_KWS01 "kws01" +#define EE_MODEL_VERSION_VWW01 "vww01" +#define EE_MODEL_VERSION_AD01 "ad01" +#define EE_MODEL_VERSION_IC01 "ic01" + +typedef enum { EE_ARG_CLAIMED, EE_ARG_UNCLAIMED } arg_claimed_t; +typedef enum { EE_STATUS_OK = 0, EE_STATUS_ERROR } ee_status_t; + +#define EE_DEVICE_NAME "dut" + +#define EE_CMD_SIZE 80u +#define EE_CMD_DELIMITER " " +#define EE_CMD_TERMINATOR '%' + +#define EE_CMD_NAME "name" +#define EE_CMD_TIMESTAMP "timestamp" + +#define EE_MSG_READY "m-ready\r\n" +#define EE_MSG_INIT_DONE "m-init-done\r\n" +#define EE_MSG_NAME "m-name-%s-[%s]\r\n" + +#define EE_ERR_CMD "e-[Unknown command: %s]\r\n" + +void ee_serial_callback(char); +void ee_serial_command_parser_callback(char*); +void ee_benchmark_initialize(void); +long ee_hexdec(char*); +void ee_infer(size_t n, size_t n_warmup); +size_t ee_get_buffer(uint8_t* buffer, size_t max_len); +arg_claimed_t ee_buffer_parse(char* command); +arg_claimed_t ee_profile_parse(char* command); +uint8_t* ee_get_buffer_pointer(); + +#endif /* MLPERF_TINY_V0_1_API_INTERNALLY_IMPLEMENTED_H_ */ diff --git a/3rdparty/mlperftiny/api/submitter_implemented.h b/3rdparty/mlperftiny/api/submitter_implemented.h new file mode 100644 index 000000000000..fc03a733c9cb --- /dev/null +++ b/3rdparty/mlperftiny/api/submitter_implemented.h @@ -0,0 +1,85 @@ +/* +Copyright 2020 EEMBC and The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +This file reflects a modified version of th_lib from EEMBC. All wrapped libc +methods from th_libc.h and all testharness methods from th_lib.h are here. +==============================================================================*/ +/// \file +/// \brief Submitter-implemented methods required to perform inference. +/// \detail All methods with names starting with th_ are to be implemented by +/// the submitter. All basic I/O, inference and timer APIs must be implemented +/// in order for the benchmark to output useful results, but some auxiliary +/// methods default to an empty implementation. These methods are provided to +/// enable submitter optimizations, and are not required for submission. + +#ifndef MLPERF_TINY_V0_1_API_SUBMITTER_IMPLEMENTED_H_ +#define MLPERF_TINY_V0_1_API_SUBMITTER_IMPLEMENTED_H_ + +/// \brief These defines set logging prefixes for test harness integration. +/// \detail This API is designed for performance evaluation only. In order to +/// gather energy measurments we recommend using the EEMBC test suite. 
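+///
+/// A minimal performance-mode th_timestamp() sketch (illustrative; platform_us()
+/// is an assumed microsecond tick source, and energy-mode submissions toggle a
+/// timestamp GPIO instead of printing):
+///
+///   void th_timestamp(void) { th_printf(EE_MSG_TIMESTAMP, platform_us()); }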
+#define EE_MSG_TIMESTAMP "m-lap-us-%lu\r\n"
+#define TH_VENDOR_NAME_STRING "unspecified"
+
+// MAX_DB_INPUT_SIZE defined in CMakeLists.txt
+#ifndef TH_MODEL_VERSION
+// See "internally_implemented.h" for a list
+#error "Please set TH_MODEL_VERSION to one of the EE_MODEL_VERSION_* defines"
+// e.g.: to inform the user of model `ic01` use this:
+// #define TH_MODEL_VERSION EE_MODEL_VERSION_IC01
+#endif
+
+// Use this to switch between DUT-direct (perf) & DUT-indirect (energy) modes
+#ifndef EE_CFG_ENERGY_MODE
+#define EE_CFG_ENERGY_MODE 0
+#endif
+
+// This is a visual cue to the user when reviewing logs or plugging an
+// unknown device into the system.
+#if EE_CFG_ENERGY_MODE == 1
+#define EE_MSG_TIMESTAMP_MODE "m-timestamp-mode-energy\r\n"
+#else
+#define EE_MSG_TIMESTAMP_MODE "m-timestamp-mode-performance\r\n"
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/// \brief required core API
+void th_load_tensor();
+void th_results();
+void th_infer();
+void th_timestamp(void);
+void th_printf(const char* fmt, ...);
+char th_getchar();
+
+/// \brief optional API
+void th_serialport_initialize(void);
+void th_timestamp_initialize(void);
+void th_final_initialize(void);
+void th_pre();
+void th_post();
+void th_command_ready(char volatile* msg);
+
+/// \brief libc hooks
+int th_strncmp(const char* str1, const char* str2, size_t n);
+char* th_strncpy(char* dest, const char* src, size_t n);
+size_t th_strnlen(const char* str, size_t maxlen);
+char* th_strcat(char* dest, const char* src);
+char* th_strtok(/*@null@*/ char* str1, const char* sep);
+int th_atoi(const char* str);
+void* th_memset(void* b, int c, size_t len);
+void* th_memcpy(void* dst, const void* src, size_t n);
+int th_vprintf(const char* format, va_list ap);
+
+#endif  // MLPERF_TINY_V0_1_API_SUBMITTER_IMPLEMENTED_H_
diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
index 9386576c394b..a41d68a134ef 100644
--- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
+++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template
@@ -23,6 +23,8 @@ set(ENV{QEMU_BIN_PATH} "${CMAKE_SOURCE_DIR}/qemu-hack")
 
 set(QEMU_PIPE CACHE PATH "Path to QEMU pipe")
 
+option(ENERGY_MODE "Enable energy mode for MLPerfTiny tests." 0)
+
 find_package(Zephyr HINTS $ENV{ZEPHYR_BASE})
 
@@ -70,6 +72,15 @@ target_include_directories(tvm_model PRIVATE ${CMAKE_SOURCE_DIR}/include crt_con
 target_compile_options(tvm_model PRIVATE -Wno-unused-variable)  # TVM-generated code tends to include lots of these.
 target_link_libraries(app PRIVATE tvm_model)
 
-file(GLOB_RECURSE app_srcs src/**.c)
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/api)
+  zephyr_library_named(tinymlperf_api)
+  file(GLOB_RECURSE tiny_internal_srcs src/api/*.cpp)
+  target_sources(tinymlperf_api PRIVATE ${tiny_internal_srcs})
+  target_compile_options(tinymlperf_api PRIVATE -Wno-unused-variable)  # TVM-generated code tends to include lots of these.
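+  # Link the EEMBC test-harness library into the Zephyr app and pass the
+  # energy-mode choice through to its sources (see the definitions just below).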
+ target_link_libraries(app PRIVATE tinymlperf_api) + target_compile_definitions(tinymlperf_api PUBLIC -DEE_CFG_ENERGY_MODE=${ENERGY_MODE}) +endif() + +file(GLOB_RECURSE app_srcs src/**.c src/**.cc) target_sources(app PRIVATE ${app_srcs} ${cmsis_lib_srcs}) target_include_directories(app PRIVATE crt_config ${CMAKE_SOURCE_DIR}/include crt/include ${cmsis_includes}) diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index b0cd21e4adb2..e93918e44844 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -448,6 +448,7 @@ def _create_prj_conf( CRT_LIBS_BY_PROJECT_TYPE = { "host_driven": "microtvm_rpc_server microtvm_rpc_common aot_executor_module aot_executor common", "aot_standalone_demo": "memory microtvm_rpc_common common", + "mlperftiny": "memory common", } def _get_platform_version(self, zephyr_base: str) -> float: @@ -623,7 +624,13 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec if compile_definitions: flags = compile_definitions for item in flags: - cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n") + if "MAX_DB_INPUT_SIZE" in item or "TH_MODEL_VERSION" in item: + compile_target = "tinymlperf_api" + else: + compile_target = "app" + cmake_f.write( + f"target_compile_definitions({compile_target} PUBLIC {item})\n" + ) if self._is_fvp(zephyr_board, use_fvp): cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n") diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/README.md b/apps/microtvm/zephyr/template_project/src/mlperftiny/README.md new file mode 100644 index 000000000000..c38a1f05bfc6 --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/README.md @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + +# MLPerf Tiny Project API +This directory includes source files to build a Zephyr microTVM project to use for benchmarking with EEMBC runner. +This project has been tested with NUCLEO_L4R5ZI and NRF5340DK. diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/main.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/main.cc new file mode 100644 index 000000000000..4c91177062ad --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/main.cc @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "api/internally_implemented.h" +#include "api/submitter_implemented.h" + +int main(int argc, char* argv[]) { +#if NRF_BOARD == 1 + // Set frequency to 128MHz for nrf5340dk_nrf534 by setting the clock divider to 0. + // 0x50005558 is the clock division reg address. 
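+  // A divider of 0 runs the application core at the full 128 MHz rather than
+  // the default 64 MHz (assumes the nRF5340 application-core CLOCK peripheral).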
+  uint32_t* clock_div = (uint32_t*)0x50005558;
+  *clock_div = 0;
+#endif
+
+  ee_benchmark_initialize();
+  while (1) {
+    int c;
+    c = th_getchar();
+    ee_serial_callback(c);
+  }
+  return 0;
+}
diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/submitter_implemented.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/submitter_implemented.cc
new file mode 100644
index 000000000000..84baee3072cd
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/submitter_implemented.cc
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "api/submitter_implemented.h"
+
+#include
+#include
+#include
+#include
+
+#include "api/internally_implemented.h"
+#include "tvmruntime.h"
+#include "zephyr_uart.h"
+
+static void* g_input_data;
+#if TARGET_MODEL == 3  // AD
+static uint8_t __aligned(4) g_input_data_buffer_aligned[MAX_DB_INPUT_SIZE];
+#endif
+
+#if EE_CFG_ENERGY_MODE == 1 && NRF_BOARD != 1
+// use GPIO PC6 which is on connector CN7 pin 1 on the nucleo_l4r5zi
+static const char* g_gpio_device_name = "GPIOC";
+static const struct device* g_gpio_dev;
+static const gpio_pin_t g_gpio_pin = 6;
+#endif
+
+// Implement this method to prepare for inference and preprocess inputs.
+// Modified from source
+void th_load_tensor() {
+#if TARGET_MODEL == 1  // KWS
+  g_input_data = static_cast<void*>(ee_get_buffer_pointer());
+#elif TARGET_MODEL == 2  // VWW
+  // Converting uint8 to int8
+  int8_t* temp_int = reinterpret_cast<int8_t*>(ee_get_buffer_pointer());
+  for (size_t i = 0; i < MAX_DB_INPUT_SIZE; i++) {
+    temp_int[i] -= 128;
+  }
+  g_input_data = static_cast<void*>(temp_int);
+#elif TARGET_MODEL == 3  // AD
+  uint8_t* buffer = ee_get_buffer_pointer();
+  memcpy(g_input_data_buffer_aligned, buffer, sizeof(g_input_data_buffer_aligned));
+  g_input_data = g_input_data_buffer_aligned;
+#elif TARGET_MODEL == 4  // IC
+  uint8_t* temp_uint = reinterpret_cast<uint8_t*>(ee_get_buffer_pointer());
+  int8_t* temp_int = reinterpret_cast<int8_t*>(ee_get_buffer_pointer());
+  for (size_t i = 0; i < MAX_DB_INPUT_SIZE; i++) {
+    if (temp_uint[i] <= 127)
+      temp_int[i] = ((int8_t)temp_uint[i]) - 128;
+    else
+      temp_int[i] = (int8_t)(temp_uint[i] - 128);
+  }
+  g_input_data = reinterpret_cast<void*>(temp_int);
+#else
+#error Wrong model
+#endif
+}
+
+#if TARGET_MODEL == 3  // model AD
+// Calculate the mean squared difference between output and input for the AD model.
+static float calculate_result() {
+  size_t feature_size = g_output_data_len;
+  float diffsum = 0;
+  float* input_float = reinterpret_cast<float*>(g_input_data);
+  float* output_float = reinterpret_cast<float*>(g_output_data);
+
+  for (size_t i = 0; i < feature_size; i++) {
+    float diff = output_float[i] - input_float[i];
+    diffsum += diff * diff;
+  }
+  diffsum /= feature_size;
+
+  return diffsum;
+}
+#endif
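+
+// Note on th_load_tensor() above: for the IC model, both branches perform the
+// same offset conversion, mapping uint8 [0, 255] onto int8 [-128, 127], i.e.
+// temp_int[i] = (int8_t)(temp_uint[i] - 128) with two's-complement wraparound
+// (for example, 200 -> 72 and 100 -> -28).
+
+// Add to this method to return real inference results.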
+void th_results() {
+  /**
+   * The results need to be printed back in exactly this format; if easier
+   * to just modify this loop than copy to results[] above, do that.
+   */
+#if TARGET_MODEL == 3  // model AD
+  th_printf("m-results-[%0.3f]\r\n", calculate_result());
+#else
+  size_t kCategoryCount = g_output_data_len;
+  th_printf("m-results-[");
+  for (size_t i = 0; i < kCategoryCount; i++) {
+    float converted = static_cast<float>(g_quant_scale * (g_output_data[i] - g_quant_zero));
+    // float converted = static_cast<float>(g_output_data[i]);
+    th_printf("%.3f", converted);
+    if (i < (kCategoryCount - 1)) {
+      th_printf(",");
+    }
+  }
+  th_printf("]\r\n");
+#endif
+}
+
+// Implement this method with the logic to perform one inference cycle.
+// Modified from source
+void th_infer() { TVMInfer(g_input_data); }
+
+/// \brief optional API.
+// Modified from source
+void th_final_initialize(void) { TVMRuntimeInit(); }
+
+void th_pre() {}
+void th_post() {}
+
+void th_command_ready(char volatile* p_command) {
+  ee_serial_command_parser_callback((char*)p_command);
+}
+
+// th_libc implementations.
+int th_strncmp(const char* str1, const char* str2, size_t n) { return strncmp(str1, str2, n); }
+
+char* th_strncpy(char* dest, const char* src, size_t n) { return strncpy(dest, src, n); }
+
+size_t th_strnlen(const char* str, size_t maxlen) { return strnlen(str, maxlen); }
+
+char* th_strcat(char* dest, const char* src) { return strcat(dest, src); }
+
+char* th_strtok(char* str1, const char* sep) { return strtok(str1, sep); }
+
+int th_atoi(const char* str) { return atoi(str); }
+
+void* th_memset(void* b, int c, size_t len) { return memset(b, c, len); }
+
+void* th_memcpy(void* dst, const void* src, size_t n) { return memcpy(dst, src, n); }
+
+/* N.B.: Many embedded *printf SDKs do not support all format specifiers. */
+int th_vprintf(const char* format, va_list ap) { return vprintf(format, ap); }
+
+// Modified from source
+void th_printf(const char* p_fmt, ...) {
+  char buffer[128];
+  int size;
+  va_list args;
+  va_start(args, p_fmt);
+  size = TVMPlatformFormatMessage(buffer, 128, p_fmt, args);
+  va_end(args);
+  TVMPlatformWriteSerial(buffer, (size_t)size);
+}
+
+// Modified from source
+char th_getchar() { return TVMPlatformUartRxRead(); }
+
+// Modified from source
+void th_serialport_initialize(void) {
+#if EE_CFG_ENERGY_MODE == 1 && NRF_BOARD != 1
+  TVMPlatformUARTInit(9600);
+#else
+  TVMPlatformUARTInit();
+#endif
+}
+
+// Modified from source
+void th_timestamp(void) {
+#if EE_CFG_ENERGY_MODE == 1 && NRF_BOARD != 1
+  /* USER CODE 1 BEGIN */
+  /* Step 1. Pull pin low */
+  gpio_pin_set(g_gpio_dev, g_gpio_pin, 0);
+  /* Step 2. Hold low for at least 1us */
+  k_busy_wait(1);
+  /* Step 3. Release driver */
+  gpio_pin_set(g_gpio_dev, g_gpio_pin, 1);
+  /* USER CODE 1 END */
+#else
+  /* USER CODE 2 BEGIN */
+  unsigned long microSeconds = (unsigned long)(k_uptime_get() * 1000LL);
+  /* USER CODE 2 END */
+  /* This message must NOT be changed.
*/ + th_printf(EE_MSG_TIMESTAMP, microSeconds); +#endif +} + +// Modified from source +void th_timestamp_initialize(void) { + /* USER CODE 1 BEGIN */ + // Setting up BOTH perf and energy here +#if EE_CFG_ENERGY_MODE == 1 && NRF_BOARD != 1 + g_gpio_dev = device_get_binding(g_gpio_device_name); + if (g_gpio_dev == NULL) { + th_printf("GPIO device init failed\r\n"); + return; + } + + int ret = gpio_pin_configure(g_gpio_dev, g_gpio_pin, GPIO_OUTPUT_HIGH); + if (ret < 0) { + th_printf("GPIO pin configure failed\r\n"); + return; + } +#endif + + /* USER CODE 1 END */ + /* This message must NOT be changed. */ + th_printf(EE_MSG_TIMESTAMP_MODE); + /* Always call the timestamp on initialize so that the open-drain output + is set to "1" (so that we catch a falling edge) */ + th_timestamp(); +} diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc new file mode 100644 index 000000000000..b16a1e711f8c --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.cc @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "tvmruntime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "output_data.h" +#include "tvmgen_default.h" +#include "zephyr_uart.h" + +#ifdef CONFIG_ARCH_POSIX +#include "posix_board_if.h" +#endif + +// OUT_QUANT_SCALE and OUT_QUANT_ZERO are set in python. +#if TARGET_MODEL == 3 +float* g_output_data = output_data; +#else +int8_t* g_output_data = output_data; +float g_quant_scale = OUT_QUANT_SCALE; +int8_t g_quant_zero = OUT_QUANT_ZERO; +#endif +size_t g_output_data_len = output_data_len; + +// WORKSPACE_SIZE is defined in python +static uint8_t g_aot_memory[WORKSPACE_SIZE]; +tvm_workspace_t app_workspace; + +size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, + va_list args) { + return vsnprintk(out_buf, out_buf_size_bytes, fmt, args); +} + +void TVMLogf(const char* msg, ...) 
{
+  char buffer[128];
+  int size;
+  va_list args;
+  va_start(args, msg);
+  size = TVMPlatformFormatMessage(buffer, 128, msg, args);
+  va_end(args);
+  TVMPlatformWriteSerial(buffer, (size_t)size);
+}
+
+void __attribute__((noreturn)) TVMPlatformAbort(tvm_crt_error_t error) {
+  TVMLogf("TVMPlatformAbort: %08x\n", error);
+  sys_reboot(SYS_REBOOT_COLD);
+  for (;;)
+    ;
+}
+
+tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
+  return StackMemoryManager_Allocate(&app_workspace, num_bytes, out_ptr);
+}
+
+tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
+  return StackMemoryManager_Free(&app_workspace, ptr);
+}
+
+void timer_expiry_function(struct k_timer* timer_id) { return; }
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t nbytes, int dtype_code_hint,
+                               int dtype_bits_hint) {
+  tvm_crt_error_t err = kTvmErrorNoError;
+  void* ptr = 0;
+  DLDevice dev = {(DLDeviceType)device_type, device_id};
+  assert(nbytes > 0);
+  err = TVMPlatformMemoryAllocate(nbytes, dev, &ptr);
+  CHECK_EQ(err, kTvmErrorNoError,
+           "TVMBackendAllocWorkspace(%d, %d, %" PRIu64 ", %d, %d) -> %" PRId32, device_type,
+           device_id, nbytes, dtype_code_hint, dtype_bits_hint, err);
+  return ptr;
+}
+
+int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) {
+  tvm_crt_error_t err = kTvmErrorNoError;
+  DLDevice dev = {(DLDeviceType)device_type, device_id};
+  err = TVMPlatformMemoryFree(ptr, dev);
+  CHECK_EQ(err, kTvmErrorNoError, "TVMBackendFreeWorkspace(%d, %d)", device_type, device_id);
+  return err;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+void TVMRuntimeInit() { StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE); }
+
+void TVMInfer(void* input_ptr) {
+  struct tvmgen_default_inputs inputs = {
+#if TARGET_MODEL == MODEL_KWS
+      .input_1 = input_ptr,
+#elif TARGET_MODEL == MODEL_IC
+      .input_1_int8 = input_ptr,
+#elif TARGET_MODEL == MODEL_VWW
+      .input_1_int8 = input_ptr,
+#elif TARGET_MODEL == MODEL_AD
+      .input_1 = input_ptr,
+#else
+#error Wrong model.
+#endif
+  };
+
+  struct tvmgen_default_outputs outputs = {
+#if TARGET_MODEL == MODEL_KWS
+#if COMPILE_WITH_CMSISNN
+      .Identity = output_data,
+#else
+      .output = output_data,
+#endif
+#elif TARGET_MODEL == MODEL_IC
+      .Identity_int8 = output_data,
+#elif TARGET_MODEL == MODEL_VWW
+      .Identity_int8 = output_data,
+#elif TARGET_MODEL == MODEL_AD
+      .Identity = output_data,
+#endif
+  };
+
+  int ret_val = tvmgen_default_run(&inputs, &outputs);
+  if (ret_val != 0) {
+    TVMLogf("Error: %d\n", ret_val);
+  }
+}
+
+int8_t QuantizeFloatToInt8(float value, float scale, int zero_point) {
+  int32_t result = round(value / scale) + zero_point;
+  if (result < INT8_MIN) {
+    result = INT8_MIN;
+  }
+  if (result > INT8_MAX) {
+    result = INT8_MAX;
+  }
+  return (int8_t)(result);
+}
diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h
new file mode 100644
index 000000000000..940d64634d59
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/tvmruntime.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_ +#define APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_ + +#include +#include +#include + +#define MODEL_KWS 1 +#define MODEL_VWW 2 +#define MODEL_AD 3 +#define MODEL_IC 4 + +extern const unsigned char g_wakeup_sequence[]; +extern size_t g_output_data_len; + +#if TARGET_MODEL == 3 +extern float* g_output_data; +#else +extern int8_t* g_output_data; +#endif + +extern float g_quant_scale; +extern int8_t g_quant_zero; + +/*! + * \brief Initialize TVM runtime. + */ +void TVMRuntimeInit(); + +/*! + * \brief Run TVM inference. + */ +void TVMInfer(void* input_ptr); + +/*! + * \brief Quantize float to int8. + * \param value Input data in float. + * \param scale Quantization scale factor. + * \param zero_point Quantization zero point. + */ +int8_t QuantizeFloatToInt8(float value, float scale, int zero_point); + +#endif /* APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_TVMRUNTIME_H_ */ diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc new file mode 100644 index 000000000000..9880eadd4d9b --- /dev/null +++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.cc @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "zephyr_uart.h" + +#include +#include +#include + +#include "crt_config.h" + +static const struct device* g_microtvm_uart; + +static uint8_t uart_data[8]; + +// UART interrupt callback. +void uart_irq_cb(const struct device* dev, void* user_data) { + while (uart_irq_update(dev) && uart_irq_is_pending(dev)) { + struct ring_buf* rbuf = (struct ring_buf*)user_data; + if (uart_irq_rx_ready(dev) != 0) { + for (;;) { + // Read a small chunk of data from the UART. + int bytes_read = uart_fifo_read(dev, uart_data, sizeof(uart_data)); + if (bytes_read < 0) { + TVMPlatformAbort((tvm_crt_error_t)(0xbeef1)); + } else if (bytes_read == 0) { + break; + } + // Write it into the ring buffer. 
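+        // ring_buf_put() returns how many bytes it actually stored; a short
+        // write means the ring buffer is full, which is treated as fatal below.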
+        int bytes_written = ring_buf_put(rbuf, uart_data, bytes_read);
+        if (bytes_read != bytes_written) {
+          TVMPlatformAbort((tvm_crt_error_t)(0xbeef2));
+        }
+      }
+    }
+  }
+}
+
+// Initialize the UART receiver.
+void uart_rx_init(struct ring_buf* rbuf, const struct device* dev) {
+  uart_irq_callback_user_data_set(dev, uart_irq_cb, (void*)rbuf);
+  uart_irq_rx_enable(dev);
+}
+
+// UART read.
+char TVMPlatformUartRxRead() {
+  unsigned char c;
+  int ret = -1;
+  while (ret != 0) {
+    ret = uart_poll_in(g_microtvm_uart, &c);
+  }
+  return (char)c;
+}
+
+// UART write.
+uint32_t TVMPlatformWriteSerial(const char* data, uint32_t size) {
+  for (uint32_t i = 0; i < size; i++) {
+    uart_poll_out(g_microtvm_uart, data[i]);
+  }
+  return size;
+}
+
+// Initialize UART.
+void TVMPlatformUARTInit(uint32_t baudrate /* = TVM_UART_DEFAULT_BAUDRATE */) {
+  // Claim console device.
+  g_microtvm_uart = device_get_binding(DT_LABEL(DT_CHOSEN(zephyr_console)));
+  const struct uart_config config = {.baudrate = baudrate,
+                                     .parity = UART_CFG_PARITY_NONE,
+                                     .stop_bits = UART_CFG_STOP_BITS_1,
+                                     .data_bits = UART_CFG_DATA_BITS_8,
+                                     .flow_ctrl = UART_CFG_FLOW_CTRL_NONE};
+  uart_configure(g_microtvm_uart, &config);
+}
diff --git a/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h
new file mode 100644
index 000000000000..f10cf0262224
--- /dev/null
+++ b/apps/microtvm/zephyr/template_project/src/mlperftiny/zephyr_uart.h
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_
+#define APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_
+
+#include <stdint.h>
+
+#define TVM_UART_DEFAULT_BAUDRATE 115200
+
+/*!
+ * \brief Read one character from the UART receive buffer (blocking poll).
+ *
+ * \return The character read.
+ */
+char TVMPlatformUartRxRead();
+
+/*!
+ * \brief Write data to the serial port.
+ * \param data Pointer to data to write.
+ * \param size Size of data in bytes.
+ *
+ * \return Number of bytes written.
+ */
+uint32_t TVMPlatformWriteSerial(const char* data, uint32_t size);
+
+/*!
+ * \brief Initialize the UART.
+ * \param baudrate Desired UART baudrate.
+ */ +void TVMPlatformUARTInit(uint32_t baudrate = TVM_UART_DEFAULT_BAUDRATE); + +#endif /* APPS_MICROTVM_ZEPHYR_TEMPLATE_PROJECT_SRC_MLPERFTINY_ZEPHYR_UART_H_ */ diff --git a/cmake/modules/Zephyr.cmake b/cmake/modules/Zephyr.cmake index 644675dcf871..1f506f5132fd 100644 --- a/cmake/modules/Zephyr.cmake +++ b/cmake/modules/Zephyr.cmake @@ -27,6 +27,9 @@ if(USE_MICRO) "apps/microtvm/zephyr/template_project/src/aot_standalone_demo *.h -> zephyr/src/aot_standalone_demo" "apps/microtvm/zephyr/template_project/src/host_driven *.c -> zephyr/src/host_driven" "apps/microtvm/zephyr/template_project/src/host_driven *.h -> zephyr/src/host_driven" + "apps/microtvm/zephyr/template_project/src/mlperftiny *.cc -> zephyr/src/mlperftiny" + "apps/microtvm/zephyr/template_project/src/mlperftiny *.h -> zephyr/src/mlperftiny" + "3rdparty/mlperftiny/api * -> zephyr/src/mlperftiny/api" "apps/microtvm/zephyr/template_project/fvp-hack * -> zephyr/fvp-hack" "apps/microtvm/zephyr/template_project/qemu-hack * -> zephyr/qemu-hack" "apps/microtvm/zephyr/template_project/app-overlay * -> zephyr/app-overlay" From e1b4877fdf6f3a6ab75b894b6325854655f9bb54 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Fri, 6 Jan 2023 17:32:23 -0800 Subject: [PATCH 125/286] [docs] Add "Open with Colab" button to documentation (#13627) * Add Colab header to TVM tutorials by default * Fix library imports to work with Colab * Better support for Jupyter magic and directives Co-authored-by: Mehrdad Hessar --- docs/README.md | 39 ++++ docs/conf.py | 210 +++++++++++++++++- gallery/how_to/compile_models/from_coreml.py | 7 +- gallery/how_to/compile_models/from_darknet.py | 5 +- gallery/how_to/compile_models/from_keras.py | 7 +- gallery/how_to/compile_models/from_mxnet.py | 10 +- gallery/how_to/compile_models/from_oneflow.py | 4 +- gallery/how_to/compile_models/from_onnx.py | 9 +- gallery/how_to/compile_models/from_paddle.py | 6 +- gallery/how_to/compile_models/from_pytorch.py | 12 +- .../how_to/compile_models/from_tensorflow.py | 5 + gallery/how_to/compile_models/from_tflite.py | 5 +- .../deploy_models/deploy_model_on_adreno.py | 1 + .../deploy_models/deploy_model_on_nano.py | 1 + .../deploy_object_detection_pytorch.py | 4 +- .../extend_tvm/bring_your_own_datatypes.py | 2 +- .../optimize_operators/opt_conv_cuda.py | 1 + .../optimize_operators/opt_conv_tensorcore.py | 1 + .../tune_conv2d_layer_cuda.py | 1 + .../tune_with_autotvm/tune_conv2d_cuda.py | 1 + .../tune_with_autotvm/tune_relay_cuda.py | 1 + .../work_with_microtvm/install_cmsis.rst | 35 +++ .../install_dependencies.rst | 33 +++ .../work_with_microtvm/install_zephyr.rst | 52 +++++ .../how_to/work_with_microtvm/micro_aot.py | 31 ++- .../work_with_microtvm/micro_autotune.py | 26 ++- .../work_with_microtvm/micro_pytorch.py | 6 +- .../how_to/work_with_microtvm/micro_tflite.py | 128 +++-------- .../how_to/work_with_microtvm/micro_train.py | 15 +- .../work_with_pytorch/using_as_torch.py | 8 + .../using_optimized_torch.py | 10 +- gallery/how_to/work_with_relay/build_gcn.py | 8 +- .../how_to/work_with_relay/using_relay_viz.py | 7 + .../how_to/work_with_schedules/reduction.py | 1 + gallery/how_to/work_with_schedules/scan.py | 1 + gallery/tutorial/intro_topi.py | 1 + gallery/tutorial/relay_quick_start.py | 1 + gallery/tutorial/tensor_ir_blitz_course.py | 1 + tests/lint/check_request_hook.py | 35 ++- 39 files changed, 564 insertions(+), 167 deletions(-) create mode 100644 gallery/how_to/work_with_microtvm/install_cmsis.rst create mode 100644 gallery/how_to/work_with_microtvm/install_dependencies.rst 
create mode 100644 gallery/how_to/work_with_microtvm/install_zephyr.rst diff --git a/docs/README.md b/docs/README.md index 6c32d2d6bfed..572b72fc3cee 100644 --- a/docs/README.md +++ b/docs/README.md @@ -111,3 +111,42 @@ python tests/scripts/ci.py docs --full You can define the order of tutorials with `subsection_order` and `within_subsection_order` in [`conf.py`](conf.py). By default, the tutorials within one subsection are sorted by filename. + +## Google Colab Integration + +All the TVM tutorials can be opened and used interactively in Google Colab by +clicking the button at the top of the page. To do this, `sphinx-gallery` builds +`.ipynb` files from each tutorial, which are automatically deployed to the +[apache/tvm-site](https://github.com/apache/tvm-site/tree/asf-site) repo's +`asf-site` branch by [@tvm-bot](https://github.com/tvm-bot). + +To make sure your tutorial runs correctly on Colab, any non-Python parts of +the tutorial (e.g. dependency installations) should be prefixed by an +[IPython magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html). +These will not be included in the built `HTML` file. For example, to install +Pytorch in your tutorial, add a ReStructured Text block like the following: + +```python +###################################################################### +# To run this tutorial, we must install PyTorch: +# +# .. code-block:: bash +# +# %%shell +# pip install torch +# +``` + +### Interactive Bash Scripts + +In stock IPython, the `%%bash` magic command should be used to run shell +commands. However, this command does not give real-time output - the +tutorial's user will not see any output until the entire cell finishes +running. When running commands that take several minutes (e.g. installing +dependencies), this is annoying. + +Luckily, Google Colab has the `%%shell` magic command that does the same +thing as `%%bash`, but gives output in real time. This command is specific +to Colab, and its [source code](https://github.com/googlecolab/colabtools) +is public. Thus, `%%shell` should be used instead of `%%bash` when writing +TVM tutorials. diff --git a/docs/conf.py b/docs/conf.py index b4982f14c049..357df8cef12c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,15 +29,17 @@ # # All configuration values have a default; values that are commented out # serve to show the default. +from functools import partial import gc +from importlib import import_module import inspect +from hashlib import md5 import os from pathlib import Path import re import sys - -import sphinx_gallery - +from textwrap import dedent, indent +from unittest.mock import patch # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the @@ -84,6 +86,198 @@ def git_describe_version(original_version): version = git_describe_version(tvm.__version__) release = version + +def monkey_patch(module_name, func_name): + """Helper function for monkey-patching library functions. + + Used to modify a few sphinx-gallery behaviors to make the "Open in Colab" button work correctly. + Should be called as a decorator with arguments. Note this behaves differently from unittest's + @mock.patch, as our monkey_patch decorator should be placed on the new version of the function. 
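+
+    Example, as used below for sphinx-gallery:
+
+        @monkey_patch("sphinx_gallery.py_source_parser", "split_code_and_text_blocks")
+        def split_code_and_text_blocks(source_file, return_node, real_func):
+            ...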
+ """ + module = import_module(module_name) + original_func = getattr(module, func_name) + + def decorator(function): + updated_func = partial(function, real_func=original_func) + setattr(module, func_name, updated_func) + return updated_func + + return decorator + + +CURRENT_FILE_CONF = None + + +@monkey_patch("sphinx_gallery.py_source_parser", "split_code_and_text_blocks") +def split_code_and_text_blocks(source_file, return_node, real_func): + """Monkey-patch split_code_and_text_blocks to expose sphinx-gallery's file-level config. + + It's kinda gross, but we need access to file_conf to detect the requires_cuda flag. + """ + global CURRENT_FILE_CONF + file_conf, blocks, node = real_func(source_file, return_node) + CURRENT_FILE_CONF = file_conf + return (file_conf, blocks, node) + + +# This header replaces the default sphinx-gallery one in sphinx_gallery/gen_rst.py. +COLAB_HTML_HEADER = """ +.. DO NOT EDIT. THIS FILE WAS AUTOMATICALLY GENERATED BY +.. TVM'S MONKEY-PATCHED VERSION OF SPHINX-GALLERY. TO MAKE +.. CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "{python_file}" + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + This tutorial can be used interactively with Google Colab! You can also click + :ref:`here ` to run the Jupyter notebook locally. + + .. image:: {button_svg} + :align: center + :target: {colab_url} + :width: 300px + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_{ref_name}: + +""" + +# Google Colab allows opening .ipynb files on GitHub by appending a GitHub path to this base URL. +COLAB_URL_BASE = "https://colab.research.google.com/github" + +# The GitHub path where the site is automatically deployed by tvm-bot. +IPYTHON_GITHUB_BASE = "apache/tvm-site/blob/asf-site/docs/_downloads/" + +# The SVG image of the "Open in Colab" button. +BUTTON = "https://raw.githubusercontent.com/apache/web-data/main/images/utilities/colab_button.svg" + + +@monkey_patch("sphinx_gallery.gen_rst", "save_rst_example") +def save_rst_example(example_rst, example_file, time_elapsed, memory_used, gallery_conf, real_func): + """Monkey-patch save_rst_example to include the "Open in Colab" button.""" + + # The url is the md5 hash of the notebook path. + example_fname = os.path.relpath(example_file, gallery_conf["src_dir"]) + ref_fname = example_fname.replace(os.path.sep, "_") + notebook_path = example_fname[:-2] + "ipynb" + digest = md5(notebook_path.encode()).hexdigest() + + # Fixed documentation versions must link to different (earlier) .ipynb notebooks. + colab_url = f"{COLAB_URL_BASE}/{IPYTHON_GITHUB_BASE}" + if "dev" not in version: + colab_url += version + "/" + colab_url += digest + "/" + os.path.basename(notebook_path) + + new_header = COLAB_HTML_HEADER.format( + python_file=example_fname, ref_name=ref_fname, colab_url=colab_url, button_svg=BUTTON + ) + with patch("sphinx_gallery.gen_rst.EXAMPLE_HEADER", new_header): + real_func(example_rst, example_file, time_elapsed, memory_used, gallery_conf) + + +INCLUDE_DIRECTIVE_RE = re.compile(r"^([ \t]*)\.\. include::\s*(.+)\n", flags=re.M) +COMMENT_DIRECTIVE_RE = re.compile(r"^\.\.(?: .*)?\n(?:(?: .*)?\n)*", flags=re.M) +ADMONITION_DIRECTIVE_RE = re.compile(rf"^\.\. admonition:: *(.*)\n((?:(?: .*)?\n)*)\n", flags=re.M) + + +@monkey_patch("sphinx_gallery.notebook", "rst2md") +def rst2md(text, gallery_conf, target_dir, heading_levels, real_func): + """Monkey-patch rst2md to support comments and some include directives. + + Currently, only include directives without any parameters are supported. 
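+    For example, a line like ".. include:: install_zephyr.rst" is replaced
+    in-place by the indented contents of that file.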
Also, note that in
+    reStructuredText any unrecognized explicit markup block is treated as a comment (see
+    https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#comments).
+
+    For callouts, we only replace generic "admonition" directives. All others should be replaced by
+    sphinx-gallery's rst2md. Note that the "alert" and "alert-info" tags are supported in most
+    IPython notebooks, but they render kinda funky on Colab.
+    """
+
+    def load_include(match):
+        full_path = os.path.join(target_dir, match.group(2))
+        with open(full_path) as f:
+            lines = f.read()
+        indented = indent(lines, match.group(1)) + "\n"
+        return indented
+
+    text = re.sub(INCLUDE_DIRECTIVE_RE, load_include, text)
+
+    # Replace generic, titled admonitions with indented text. Other admonitions (e.g. .. note::)
+    # will be handled by sphinx-gallery's rst2md.
+    def rewrite_generic_admonition(match):
+        title, text = match.groups()
+        stripped_text = dedent(text).strip()
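+        # e.g. ".. admonition:: Key point" with an indented body becomes a
+        # titled HTML callout box in the generated notebook.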
+        return f'<div class="alert alert-info"><h4>{title}</h4><p>{stripped_text}</p></div>\n'
+
+    text = re.sub(ADMONITION_DIRECTIVE_RE, rewrite_generic_admonition, text)
+
+    # Call the real function, and then strip any remaining directives (i.e. comments)
+    text = real_func(text, gallery_conf, target_dir, heading_levels)
+    text = re.sub(COMMENT_DIRECTIVE_RE, "", text)
+    return text
+
+
+INSTALL_TVM_DEV = f"""\
+%%shell
+# Installs the latest dev build of TVM from PyPI. If you wish to build
+# from source, see https://tvm.apache.org/docs/install/from_source.html
+pip install apache-tvm --pre"""
+
+INSTALL_TVM_FIXED = f"""\
+%%shell
+# Installs TVM version {version} from PyPI. If you wish to build
+# from source, see https://tvm.apache.org/docs/install/from_source.html
+pip install apache-tvm=={version}"""
+
+INSTALL_TVM_CUDA_DEV = f"""\
+%%shell
+# Installs the latest dev build of TVM from PyPI, with CUDA enabled. To use this,
+# you must request a Google Colab instance with a GPU by going to Runtime ->
+# Change runtime type -> Hardware accelerator -> GPU. If you wish to build from
+# source, see https://tvm.apache.org/docs/install/from_source.html
+pip install tlcpack-nightly-cu113 --pre -f https://tlcpack.ai/wheels"""
+
+INSTALL_TVM_CUDA_FIXED = f"""\
+%%shell
+# Installs TVM version {version} from PyPI, with CUDA enabled. To use this,
+# you must request a Google Colab instance with a GPU by going to Runtime ->
+# Change runtime type -> Hardware accelerator -> GPU. If you wish to build from
+# source, see https://tvm.apache.org/docs/install/from_source.html
+pip install apache-tvm-cu113=={version} -f https://tlcpack.ai/wheels"""
+
+
+@monkey_patch("sphinx_gallery.gen_rst", "jupyter_notebook")
+def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func):
+    """Monkey-patch sphinx-gallery to add a TVM import block to each IPython notebook.
+
+    If we had only one import block, we could skip the patching and just set first_notebook_cell.
+    However, how we import TVM depends on if we are using a fixed or dev version, and whether we
+    will use the GPU.
+
+    Tutorials requiring a CUDA-enabled build of TVM should use the flag:
+    # sphinx_gallery_requires_cuda = True
+    """
+
+    requires_cuda = CURRENT_FILE_CONF.get("requires_cuda", False)
+    fixed_version = "dev" not in version
+
+    if fixed_version and requires_cuda:
+        install_block = INSTALL_TVM_CUDA_FIXED
+    elif fixed_version and not requires_cuda:
+        install_block = INSTALL_TVM_FIXED
+    elif not fixed_version and requires_cuda:
+        install_block = INSTALL_TVM_CUDA_DEV
+    else:
+        install_block = INSTALL_TVM_DEV
+
+    new_conf = {**gallery_conf, "first_notebook_cell": install_block}
+    return real_func(script_blocks, new_conf, target_dir)
+
+
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
 extensions = [
@@ -506,6 +700,16 @@ def process_docstring(app, what, name, obj, options, lines):
 
 from legacy_redirect import build_legacy_redirect
 
+def strip_ipython_magic(app, docname, source):
+    """Prevents IPython magic commands from being rendered in HTML files.
+
+    TODO rework this function to remove IPython magic commands from include directives too.
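+
+    For example, a source string beginning with "%%shell\npip install foo" is
+    reduced to "pip install foo" before the HTML is rendered.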
+ """ + for i in range(len(source)): + source[i] = re.sub(r"%%.*\n\s*", "", source[i]) + + def setup(app): + app.connect("source-read", strip_ipython_magic) app.connect("autodoc-process-docstring", process_docstring) app.connect("build-finished", build_legacy_redirect(tvm_path)) diff --git a/gallery/how_to/compile_models/from_coreml.py b/gallery/how_to/compile_models/from_coreml.py index 96d2967947f6..4d0eea2d8d52 100644 --- a/gallery/how_to/compile_models/from_coreml.py +++ b/gallery/how_to/compile_models/from_coreml.py @@ -23,13 +23,12 @@ This article is an introductory tutorial to deploy CoreML models with Relay. -For us to begin with, coremltools module is required to be installed. - -A quick solution is to install via pip +To begin, we must install coremltools: .. code-block:: bash - pip install -U coremltools --user + %%shell + pip install coremltools or please refer to official site https://github.com/apple/coremltools diff --git a/gallery/how_to/compile_models/from_darknet.py b/gallery/how_to/compile_models/from_darknet.py index c12a9e7e1574..8397efa63b97 100644 --- a/gallery/how_to/compile_models/from_darknet.py +++ b/gallery/how_to/compile_models/from_darknet.py @@ -27,8 +27,9 @@ .. code-block:: bash - pip install cffi - pip install opencv-python + %%shell + pip install cffi opencv-python + """ # sphinx_gallery_start_ignore diff --git a/gallery/how_to/compile_models/from_keras.py b/gallery/how_to/compile_models/from_keras.py index 895a601ada0a..ac961ca16ad0 100644 --- a/gallery/how_to/compile_models/from_keras.py +++ b/gallery/how_to/compile_models/from_keras.py @@ -19,7 +19,7 @@ ===================== **Author**: `Yuwei Hu `_ -This article is an introductory tutorial to deploy keras models with Relay. +This article is an introductory tutorial to deploy Keras models with Relay. For us to begin with, keras should be installed. Tensorflow is also required since it's used as the default backend of keras. @@ -28,14 +28,15 @@ .. code-block:: bash - pip install -U keras --user - pip install -U tensorflow --user + %%shell + pip install keras tensorflow or please refer to official site https://keras.io/#installation """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/compile_models/from_mxnet.py b/gallery/how_to/compile_models/from_mxnet.py index 38084618628f..cfd66ecdb74c 100644 --- a/gallery/how_to/compile_models/from_mxnet.py +++ b/gallery/how_to/compile_models/from_mxnet.py @@ -22,21 +22,19 @@ **Author**: `Joshua Z. Zhang `_, \ `Kazutaka Morita `_ -This article is an introductory tutorial to deploy mxnet models with Relay. - -For us to begin with, mxnet module is required to be installed. - -A quick solution is +This article is an introductory tutorial to deploy mxnet models with Relay. To begin, we must install `mxnet`: .. code-block:: bash - pip install mxnet --user + %%shell + pip install mxnet or please refer to official installation guide. https://mxnet.apache.org/versions/master/install/index.html """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py index eb27c4b3e34b..0925c9fe81ce 100644 --- a/gallery/how_to/compile_models/from_oneflow.py +++ b/gallery/how_to/compile_models/from_oneflow.py @@ -27,8 +27,9 @@ .. 
code-block:: bash + %%shell pip install flowvision==0.1.0 - python3 -m pip install -f https://release.oneflow.info oneflow==0.7.0+cpu + pip install -f https://release.oneflow.info oneflow==0.7.0+cpu or please refer to official site: https://github.com/Oneflow-Inc/oneflow @@ -37,6 +38,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/compile_models/from_onnx.py b/gallery/how_to/compile_models/from_onnx.py index f0256bc7d3ae..980091d391bd 100644 --- a/gallery/how_to/compile_models/from_onnx.py +++ b/gallery/how_to/compile_models/from_onnx.py @@ -21,15 +21,14 @@ This article is an introductory tutorial to deploy ONNX models with Relay. -For us to begin with, ONNX package must be installed. - -A quick solution is to install protobuf compiler, and +To begin, install the ONNX package: .. code-block:: bash - pip install --user onnx onnxoptimizer + %%shell + pip install onnx onnxoptimizer -or please refer to official site. +Alternatively, you can refer to official site: https://github.com/onnx/onnx """ diff --git a/gallery/how_to/compile_models/from_paddle.py b/gallery/how_to/compile_models/from_paddle.py index fecb1c48dafb..199547b814a4 100644 --- a/gallery/how_to/compile_models/from_paddle.py +++ b/gallery/how_to/compile_models/from_paddle.py @@ -20,14 +20,14 @@ **Author**: `Ziyuan Ma `_ This article is an introductory tutorial to deploy PaddlePaddle models with Relay. -For us to begin with, PaddlePaddle>=2.1.3 is required to be installed. -A quick solution is +To begin, we'll install PaddlePaddle>=2.1.3: .. code-block:: bash + %%shell pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple -or please refer to official site. +For more details, refer to the official install instructions at: https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html """ diff --git a/gallery/how_to/compile_models/from_pytorch.py b/gallery/how_to/compile_models/from_pytorch.py index 98b531fa6d6e..064ed70e4645 100644 --- a/gallery/how_to/compile_models/from_pytorch.py +++ b/gallery/how_to/compile_models/from_pytorch.py @@ -21,15 +21,15 @@ This article is an introductory tutorial to deploy PyTorch models with Relay. -For us to begin with, PyTorch should be installed. -TorchVision is also required since we will be using it as our model zoo. - -A quick solution is to install via pip +For us to begin, PyTorch should be installed. +TorchVision is also required so we can use the model zoo. +A quick solution is to install via pip: .. code-block:: bash - pip install torch==1.7.0 - pip install torchvision==0.8.1 + %%shell + pip install torch + pip install torchvision or please refer to official site https://pytorch.org/get-started/locally/ diff --git a/gallery/how_to/compile_models/from_tensorflow.py b/gallery/how_to/compile_models/from_tensorflow.py index 9a32397815ef..b85b9e669a20 100644 --- a/gallery/how_to/compile_models/from_tensorflow.py +++ b/gallery/how_to/compile_models/from_tensorflow.py @@ -21,6 +21,11 @@ For us to begin with, tensorflow python module is required to be installed. +.. 
code-block:: bash + + %%shell + pip install tensorflow + Please refer to https://www.tensorflow.org/install """ diff --git a/gallery/how_to/compile_models/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py index d1b78f11d5b0..a248346c2971 100644 --- a/gallery/how_to/compile_models/from_tflite.py +++ b/gallery/how_to/compile_models/from_tflite.py @@ -25,9 +25,8 @@ .. code-block:: bash - # install tflite - pip install tflite==2.1.0 --user - + %%shell + pip install tflite==2.1.0 or you could generate TFLite package yourself. The steps are the following: diff --git a/gallery/how_to/deploy_models/deploy_model_on_adreno.py b/gallery/how_to/deploy_models/deploy_model_on_adreno.py index d6ed1f1f99a3..8d25e50b56b1 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_adreno.py +++ b/gallery/how_to/deploy_models/deploy_model_on_adreno.py @@ -31,6 +31,7 @@ .. code-block:: bash + %%shell pip install torch pip install torchvision diff --git a/gallery/how_to/deploy_models/deploy_model_on_nano.py b/gallery/how_to/deploy_models/deploy_model_on_nano.py index 5e59dccf205d..3d8a4a796f8c 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_nano.py +++ b/gallery/how_to/deploy_models/deploy_model_on_nano.py @@ -26,6 +26,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py index 0d8d0f2867a2..ffde042e2b88 100644 --- a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py +++ b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py @@ -27,8 +27,8 @@ .. code-block:: bash - pip install torch==1.7.0 - pip install torchvision==0.8.1 + pip install torch + pip install torchvision or please refer to official site https://pytorch.org/get-started/locally/ diff --git a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py index 479269a224a3..bbd207dbac8b 100644 --- a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py +++ b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py @@ -47,7 +47,7 @@ If you would like to try this with your own datatype library, first bring the library's functions into the process space with ``CDLL``: -.. code-block :: python +.. code-block:: python ctypes.CDLL('my-datatype-lib.so', ctypes.RTLD_GLOBAL) """ diff --git a/gallery/how_to/optimize_operators/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py index e5b452af66a9..33e5d9855361 100644 --- a/gallery/how_to/optimize_operators/opt_conv_cuda.py +++ b/gallery/how_to/optimize_operators/opt_conv_cuda.py @@ -31,6 +31,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py index 8db20b9b9bf8..5734f064f0dc 100644 --- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py +++ b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py @@ -52,6 +52,7 @@ # NHWCnc memory layout.The following code defines the convolution algorithm in TVM. 
# sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py index 5d173e38128e..7964694e68c0 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py @@ -38,6 +38,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py index 4560cf881ed8..a73b97525f12 100644 --- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py @@ -49,6 +49,7 @@ # Now return to python code. Import packages. # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py index 4cf397e2567e..7cb6cb8dd3f9 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py @@ -60,6 +60,7 @@ # Now return to python code. Import packages. # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/work_with_microtvm/install_cmsis.rst b/gallery/how_to/work_with_microtvm/install_cmsis.rst new file mode 100644 index 000000000000..2f1d2fb1189a --- /dev/null +++ b/gallery/how_to/work_with_microtvm/install_cmsis.rst @@ -0,0 +1,35 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. Boilerplate script for installing CMSIS-NN in the microTVM + tutorials that use it. Does not show up as a separate file + on the documentation website. + +Install CMSIS-NN +---------------------------- + + .. code-block:: bash + + %%shell + CMSIS_SHA="51263182d16c92649a48144ba56c0945f9fce60e" + CMSIS_URL="http://github.com/ARM-software/CMSIS_5/archive/${CMSIS_SHA}.tar.gz" + export CMSIS_PATH=/content/cmsis + DOWNLOAD_PATH="/content/${CMSIS_SHA}.tar.gz" + mkdir ${CMSIS_PATH} + wget ${CMSIS_URL} -O "${DOWNLOAD_PATH}" + tar -xf "${DOWNLOAD_PATH}" -C ${CMSIS_PATH} --strip-components=1 + rm ${DOWNLOAD_PATH} diff --git a/gallery/how_to/work_with_microtvm/install_dependencies.rst b/gallery/how_to/work_with_microtvm/install_dependencies.rst new file mode 100644 index 000000000000..d1bee4176d94 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/install_dependencies.rst @@ -0,0 +1,33 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. Boilerplate script for installing Zephyr in the microTVM + tutorials that use it. Does not show up as a separate file + on the documentation website. + + +Install microTVM Python dependencies +------------------------------------ + +TVM does not include a package for Python serial communication, so +we must install one before using microTVM. We will also need TFLite +to load models. + + .. code-block:: bash + + %%shell + pip install pyserial==3.5 tflite==2.1 diff --git a/gallery/how_to/work_with_microtvm/install_zephyr.rst b/gallery/how_to/work_with_microtvm/install_zephyr.rst new file mode 100644 index 000000000000..a4c412f0f746 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/install_zephyr.rst @@ -0,0 +1,52 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. Boilerplate script for installing Zephyr in the microTVM + tutorials that use it. Does not show up as a separate file + on the documentation website. + +Install Zephyr +---------------------------- + + .. code-block:: bash + + %%shell + # Install west and ninja + python3 -m pip install west + apt-get install -y ninja-build + + # Install ZephyrProject + ZEPHYR_PROJECT_PATH="/content/zephyrproject" + export ZEPHYR_BASE=${ZEPHYR_PROJECT_PATH}/zephyr + west init ${ZEPHYR_PROJECT_PATH} + cd ${ZEPHYR_BASE} + git checkout v2.7-branch + cd .. 
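+    # Fetch all module repositories pinned by the Zephyr manifest.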
+ west update + west zephyr-export + chmod -R o+w ${ZEPHYR_PROJECT_PATH} + + # Install Zephyr SDK + ZEPHYR_SDK_VERSION=0.13.2 + ZEPHYR_SDK_FILE="/content/zephyr-sdk-linux-setup.run" + wget --no-verbose -O $ZEPHYR_SDK_FILE \ + https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v${ZEPHYR_SDK_VERSION}/zephyr-sdk-${ZEPHYR_SDK_VERSION}-linux-x86_64-setup.run + chmod +x $ZEPHYR_SDK_FILE + "$ZEPHYR_SDK_FILE" -- -d /content/zephyr-sdk --quiet + + # Install python dependencies + python3 -m pip install -r "${ZEPHYR_BASE}/scripts/requirements.txt" diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py index 4d6890f8d936..8646b6d7ecfa 100644 --- a/gallery/how_to/work_with_microtvm/micro_aot.py +++ b/gallery/how_to/work_with_microtvm/micro_aot.py @@ -30,16 +30,42 @@ or on Zephyr platform on a microcontroller/board supported by Zephyr. """ +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst +# + # sphinx_gallery_start_ignore from tvm import testing testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore +import os + +# By default, this tutorial runs on x86 CPU using TVM's C runtime. If you would like +# to run on real Zephyr hardware, you must export the `TVM_MICRO_USE_HW` environment +# variable. Otherwise (if you are using the C runtime), you can skip installing +# Zephyr and CMSIS-NN. It takes ~20 minutes to install both of them. +use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst +# + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst +# + +###################################################################### +# Import Python dependencies +# ------------------------------- +# import numpy as np import pathlib import json -import os import tvm from tvm import relay @@ -57,7 +83,6 @@ # **Note:** By default this tutorial runs on x86 CPU using CRT, if you would like to run on Zephyr platform # you need to export `TVM_MICRO_USE_HW` environment variable. # -use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) MODEL_URL = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite" MODEL_PATH = download_testdata(MODEL_URL, "keyword_spotting_quant.tflite", module="model") SAMPLE_URL = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy" @@ -139,6 +164,8 @@ "board": BOARD, "serial_number": SERIAL, "config_main_stack_size": 4096, + "cmsis_path": os.getenv("CMSIS_PATH", default="/content/cmsis"), + "zephyr_base": os.getenv("ZEPHYR_BASE", default="/content/zephyrproject/zephyr"), } temp_dir = tvm.contrib.utils.tempdir() diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index 13bf4efac138..3dd4cab6c9af 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -27,13 +27,37 @@ This tutorial explains how to autotune a model using the C runtime. """ +###################################################################### +# +# .. 
include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst +# + # sphinx_gallery_start_ignore from tvm import testing testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore +# You can skip the following two sections (installing Zephyr and CMSIS-NN) if the following flag is False. +# Installing Zephyr takes ~20 min. import os + +use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst +# + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst +# + +###################################################################### +# Import Python dependencies +# ------------------------------- +# import json import numpy as np import pathlib @@ -41,8 +65,6 @@ import tvm from tvm.relay.backend import Runtime -use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) - #################### # Defining the model #################### diff --git a/gallery/how_to/work_with_microtvm/micro_pytorch.py b/gallery/how_to/work_with_microtvm/micro_pytorch.py index cd4af05fb561..f7f0c9209a87 100644 --- a/gallery/how_to/work_with_microtvm/micro_pytorch.py +++ b/gallery/how_to/work_with_microtvm/micro_pytorch.py @@ -29,6 +29,11 @@ since the model would not fit on our current supported Zephyr boards. """ +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst +# + # sphinx_gallery_start_ignore from tvm import testing @@ -36,7 +41,6 @@ # sphinx_gallery_end_ignore import pathlib - import torch import torchvision from torchvision import transforms diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index 5822a1a1e97d..cbdf6cd6f4ca 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -26,101 +26,9 @@ """ ###################################################################### -# .. note:: -# If you want to run this tutorial on the microTVM Reference VM, download the Jupyter -# notebook using the link at the bottom of this page and save it into the TVM directory. Then: # -# #. Login to the reference VM with a modified ``vagrant ssh`` command: +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# ``$ vagrant ssh -- -L8888:localhost:8888`` -# -# #. Install jupyter: ``pip install jupyterlab`` -# #. ``cd`` to the TVM directory. -# #. Install tflite: poetry install -E importer-tflite -# #. Launch Jupyter Notebook: ``jupyter notebook`` -# #. Copy the localhost URL displayed, and paste it into your browser. -# #. Navigate to saved Jupyter Notebook (``.ipynb`` file). -# -# -# Setup -# ----- -# -# Install TFLite -# ^^^^^^^^^^^^^^ -# -# To get started, TFLite package needs to be installed as prerequisite. You can do this in two ways: -# -# 1. Install tflite with ``pip`` -# -# .. code-block:: bash -# -# pip install tflite=2.1.0 --user -# -# 2. Generate the TFLite package yourself. The steps are the following: -# -# Get the flatc compiler. -# Please refer to https://github.com/google/flatbuffers for details -# and make sure it is properly installed. -# -# .. code-block:: bash -# -# flatc --version -# -# Get the TFLite schema. -# -# .. 
code-block:: bash -# -# wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs -# -# Generate TFLite package. -# -# .. code-block:: bash -# -# flatc --python schema.fbs -# -# Add the current folder (which contains generated tflite module) to PYTHONPATH. -# -# .. code-block:: bash -# -# export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd) -# -# To validate that the TFLite package was installed successfully, ``python -c "import tflite"`` -# -# Install Zephyr (physical hardware only) -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# When running this tutorial with a host simulation (the default), you can use the host ``gcc`` to -# build a firmware image that simulates the device. When compiling to run on physical hardware, you -# need to install a *toolchain* plus some target-specific dependencies. microTVM allows you to -# supply any compiler and runtime that can launch the TVM RPC server, but to get started, this -# tutorial relies on the Zephyr RTOS to provide these pieces. -# -# You can install Zephyr by following the -# `Installation Instructions `_. -# -# Aside: Recreating your own Pre-Trained TFLite model -# The tutorial downloads a pretrained TFLite model. When working with microcontrollers -# you need to be mindful these are highly resource constrained devices as such standard -# models like MobileNet may not fit into their modest memory. -# -# For this tutorial, we'll make use of one of the TF Micro example models. -# -# If you wish to replicate the training steps see: -# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world/train -# -# .. note:: -# -# If you accidentally download the example pretrained model from: -# -# ``wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/micro/hello_world_2020_04_13.zip`` -# -# this will fail due to an unimplemented opcode (114) -# -# Load and prepare the Pre-Trained Model -# -------------------------------------- -# -# Load the pretrained TFLite model from a file in your current -# directory into a buffer # sphinx_gallery_start_ignore from tvm import testing @@ -129,6 +37,27 @@ # sphinx_gallery_end_ignore import os + +# By default, this tutorial runs on x86 CPU using TVM's C runtime. If you would like +# to run on real Zephyr hardware, you must export the `TVM_MICRO_USE_HW` environment +# variable. Otherwise (if you are using the C runtime), you can skip installing +# Zephyr and CMSIS-NN. It takes ~20 minutes to install both of them. +use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst +# + +###################################################################### +# +# .. 
include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst +# + +###################################################################### +# Import Python dependencies +# ------------------------------- +# import json import tarfile import pathlib @@ -140,7 +69,6 @@ import tvm.contrib.utils from tvm.contrib.download import download_testdata -use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite" model_file = "sine_model.tflite" model_path = download_testdata(model_url, model_file, module="data") @@ -207,8 +135,7 @@ boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" with open(boards_file) as f: boards = json.load(f) - - BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_f746zg") + BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi") SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None) TARGET = tvm.target.target.micro(boards[BOARD]["model"]) @@ -292,7 +219,14 @@ if use_physical_hw: template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) - project_options = {"project_type": "host_driven", "board": BOARD, "serial_number": SERIAL} + project_options = { + "project_type": "host_driven", + "board": BOARD, + "serial_number": SERIAL, + "config_main_stack_size": 4096, + "cmsis_path": os.getenv("CMSIS_PATH", default="/content/cmsis"), + "zephyr_base": os.getenv("ZEPHYR_BASE", default="/content/zephyrproject/zephyr"), + } # Create a temporary directory diff --git a/gallery/how_to/work_with_microtvm/micro_train.py b/gallery/how_to/work_with_microtvm/micro_train.py index 44e0dd5cb730..9b8a9a68dde3 100644 --- a/gallery/how_to/work_with_microtvm/micro_train.py +++ b/gallery/how_to/work_with_microtvm/micro_train.py @@ -27,17 +27,6 @@ """ ###################################################################### -# .. note:: -# -# This tutorial is best viewed as a Jupyter Notebook. You can download and run it locally -# using the link at the bottom of this page, or open it online for free using Google Colab. -# Click the icon below to open in Google Colab. -# -# .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.png -# :align: center -# :target: https://colab.research.google.com/github/apache/tvm-site/blob/asf-site/docs/_downloads/a7c7ea4b5017ae70db1f51dd8e6dcd82/micro_train.ipynb -# :width: 300px -# # Motivation # ---------- # When building IOT devices, we often want them to **see and understand** the world around them. @@ -71,7 +60,7 @@ # # .. code-block:: bash # -# %%bash +# %%shell # pip install -q tensorflow tflite # pip install -q tlcpack-nightly -f https://tlcpack.ai/wheels # apt-get -qq install imagemagick curl @@ -515,7 +504,7 @@ def representative_dataset(): # # .. code-block:: bash # -# %%bash +# %%shell # mkdir -p ~/tests # curl "https://i.imgur.com/JBbEhxN.png" -o ~/tests/car_224.png # convert ~/tests/car_224.png -resize 64 ~/tests/car_64.png diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py index 3528e754fdce..e2351a0d7c65 100644 --- a/gallery/how_to/work_with_pytorch/using_as_torch.py +++ b/gallery/how_to/work_with_pytorch/using_as_torch.py @@ -22,8 +22,16 @@ This article is a tutorial on wrapping the TVMScript code as the PyTorch module. Using the decorator `as_torch`, users can wrap TVMScript code into a PyTorch nn.Module naturally. +To follow the tutorial, PyTorch should be installed: + +.. 
code-block:: bash + + %%shell + pip install torch + """ + # sphinx_gallery_start_ignore from tvm import testing diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py index dc6caf5d597c..baf80541b964 100644 --- a/gallery/how_to/work_with_pytorch/using_optimized_torch.py +++ b/gallery/how_to/work_with_pytorch/using_optimized_torch.py @@ -21,10 +21,18 @@ `Yaoda Zhou `_ This article is a tutorial to optimize PyTorch models by using decorator `optimize_torch`. -To follow this tutorial, PyTorch, as well as TorchVision, should be installed. +To follow this tutorial, PyTorch, as well as TorchVision, should be installed: + +.. code-block:: bash + + %%shell + pip install torch + pip install torchvision + """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/work_with_relay/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py index 8953ffc2e474..e6106dd95b84 100644 --- a/gallery/how_to/work_with_relay/build_gcn.py +++ b/gallery/how_to/work_with_relay/build_gcn.py @@ -25,7 +25,13 @@ Cora dataset is a common benchmark for Graph Neural Networks (GNN) and frameworks that support GNN training and inference. We directly load the dataset from DGL library to do the apples to apples comparison against DGL. -Please refer to DGL doc for DGL installation at +.. code-block:: bash + + %%shell + pip install torch==1.9.0 + pip install dgl==v0.7.2 -f https://data.dgl.ai/wheels/repo.html + +Please refer to DGL doc for installation at https://docs.dgl.ai/install/index.html. Please refer to PyTorch guide for PyTorch installation at diff --git a/gallery/how_to/work_with_relay/using_relay_viz.py b/gallery/how_to/work_with_relay/using_relay_viz.py index 2e68ce902899..ae22fe20e1f2 100644 --- a/gallery/how_to/work_with_relay/using_relay_viz.py +++ b/gallery/how_to/work_with_relay/using_relay_viz.py @@ -32,6 +32,13 @@ Here we use a renderer rendering graph in the text-form. It is a lightweight, AST-like visualizer, inspired by `clang ast-dump `_. We will introduce how to implement customized parsers and renderers through interface classes. +To install dependencies, run: + +.. code-block:: bash + + %%shell + pip install graphviz + For more details, please refer to :py:mod:`tvm.contrib.relay_viz`. 
""" diff --git a/gallery/how_to/work_with_schedules/reduction.py b/gallery/how_to/work_with_schedules/reduction.py index 432e9cd143b1..c084c45d3839 100644 --- a/gallery/how_to/work_with_schedules/reduction.py +++ b/gallery/how_to/work_with_schedules/reduction.py @@ -29,6 +29,7 @@ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/how_to/work_with_schedules/scan.py b/gallery/how_to/work_with_schedules/scan.py index d21673acd9e4..d523d5b9959d 100644 --- a/gallery/how_to/work_with_schedules/scan.py +++ b/gallery/how_to/work_with_schedules/scan.py @@ -26,6 +26,7 @@ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/tutorial/intro_topi.py b/gallery/tutorial/intro_topi.py index e10a74c849c0..f2a4db608646 100644 --- a/gallery/tutorial/intro_topi.py +++ b/gallery/tutorial/intro_topi.py @@ -27,6 +27,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/tutorial/relay_quick_start.py b/gallery/tutorial/relay_quick_start.py index 8910817c2117..e59f0107f943 100644 --- a/gallery/tutorial/relay_quick_start.py +++ b/gallery/tutorial/relay_quick_start.py @@ -27,6 +27,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py index a62fa3979393..dc75a3fb9452 100644 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ b/gallery/tutorial/tensor_ir_blitz_course.py @@ -30,6 +30,7 @@ """ # sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True from tvm import testing testing.utils.install_request_hook(depth=3) diff --git a/tests/lint/check_request_hook.py b/tests/lint/check_request_hook.py index 35b1a85c3a43..925af5597c12 100644 --- a/tests/lint/check_request_hook.py +++ b/tests/lint/check_request_hook.py @@ -23,13 +23,27 @@ REPO_ROOT = Path(__file__).resolve().parent.parent.parent -EXPECTED = """ + +EXPECTED_HOOK = """ # sphinx_gallery_start_ignore from tvm import testing -testing.utils.install_request_hook(depth=3) +testing.utils.install_request_hook(depth=3)\ # sphinx_gallery_end_ignore -""".rstrip() +""" + +# Extra sphinx-gallery config options may be passed inside the ignore block before the hook. This +# is a workaround that can be removed once sphinx-gallery #1059 merges and the version is updated. 
+EXPECTED_REGEX = re.compile(
+    r"""
+\# sphinx_gallery_start_ignore
+(?:.*\n)*from tvm import testing
+
+testing\.utils\.install_request_hook\(depth=3\)\
+\# sphinx_gallery_end_ignore
+""".rstrip(),
+    re.MULTILINE,
+)
 
 IGNORE_PATTERNS = ["*/micro_tvmc.py", "*/micro_train.py"]
 APACHE_HEADER_LINES = 16
@@ -84,14 +98,13 @@ def find_code_block_line(lines: List[str]) -> Optional[int]:
         with open(file) as f:
             content = f.read()
 
-        if EXPECTED not in content:
+        regex_match = EXPECTED_REGEX.search(content)
+        if not regex_match:
             errors.append((file, None))
             continue
 
-        index = content.index(EXPECTED)
-        line = content.count("\n", 0, index) + EXPECTED.count("\n") + 2
+        line = content.count("\n", 0, regex_match.end()) + 2
         expected = find_code_block_line(content.split("\n"))
-
         if expected is not None and line < expected:
             errors.append((file, (line, expected)))
 
@@ -106,19 +119,19 @@ def find_code_block_line(lines: List[str]) -> Optional[int]:
             if "from __future__" in content:
                 # Place after the last __future__ import
                 new_content = re.sub(
-                    r"((?:from __future__.*?\n)+)", r"\1\n" + EXPECTED, content, flags=re.MULTILINE
+                    r"((?:from __future__.*?\n)+)", r"\1\n" + EXPECTED_HOOK, content, flags=re.M
                 )
             else:
                 # Place in the first codeblock
                 lines = content.split("\n")
                 position = find_code_block_line(lines)
                 if position is None:
-                    new_content = "\n".join(lines) + EXPECTED + "\n"
+                    new_content = "\n".join(lines) + EXPECTED_HOOK + "\n"
                 else:
                     print(position)
                     new_content = (
                         "\n".join(lines[:position])
-                        + EXPECTED
+                        + EXPECTED_HOOK
                         + "\n\n"
                         + "\n".join(lines[position:])
                     )
@@ -134,7 +147,7 @@ def find_code_block_line(lines: List[str]) -> Optional[int]:
                 "the whitespace is incorrect.\n"
                 "You can run 'python3 tests/lint/check_request_hook.py --fix' to "
                 "automatically fix these errors:\n"
-                f"{EXPECTED}\n\nFiles:"
+                f"{EXPECTED_HOOK}\n\nFiles:"
             )
             for file, line_info in errors:
                 if line_info is None:

From 9d929d93d064441c798227dadb70b335e62afd82 Mon Sep 17 00:00:00 2001
From: masahi
Date: Sat, 7 Jan 2023 13:21:42 +0900
Subject: [PATCH 126/286] [TIR] Fix dtype mismatch error due to LetStmt
 (#13710)

* [TIR] Fix dtype mismatch error due to LetStmt
* add comment
* improve letstmt visitor
* remove SubstituteWithDataTypeLegalization
* consolidate vmap lookup logic in the base class
---
 include/tvm/tir/data_type_rewriter.h            |  5 +-
 src/tir/ir/data_type_rewriter.cc                | 43 ++++++++++++---
 src/tir/transforms/narrow_datatype.cc           | 11 +---
 .../unittest/test_te_create_primfunc.py         | 55 ++++++++++++++++++-
 4 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/include/tvm/tir/data_type_rewriter.h b/include/tvm/tir/data_type_rewriter.h
index bf90aaedfec0..5f72f75ede41 100644
--- a/include/tvm/tir/data_type_rewriter.h
+++ b/include/tvm/tir/data_type_rewriter.h
@@ -53,6 +53,8 @@ class DataTypeLegalizer : public StmtExprMutator {
   Stmt VisitStmt_(const AttrStmtNode* op) override;
   Stmt VisitStmt_(const BlockRealizeNode* op) override;
   Stmt VisitStmt_(const BlockNode* op) override;
+  Stmt VisitStmt_(const LetStmtNode* op) override;
+  PrimExpr VisitExpr_(const VarNode* op) override;
   PrimExpr VisitExpr_(const SelectNode* op) override;
   PrimExpr VisitExpr_(const RampNode* op) override;
   PrimExpr VisitExpr_(const AddNode* op) override;
@@ -79,6 +81,8 @@ class DataTypeLegalizer : public StmtExprMutator {
   // a map from IterVar before rewrite to that after rewrite,
   // ensures one old IterVar maps to exactly one new IterVar
   std::unordered_map<const IterVarNode*, IterVar> ivmap_;
+  // a map from original vars to ones with new dtype
+  std::unordered_map<const VarNode*, Var> var_remap_;
 };
 
 /*!
@@ -123,7 +127,6 @@ class IndexDataTypeRewriter : public DataTypeLegalizer {
   // indicator of condition
   bool is_condition_{false};
 
-  Map<Var, Var> var_remap_;
   Map<Buffer, Buffer> buffer_remap_;
 };
 
diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index 1c61f0bf15e7..f0f0d84644fe 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -107,6 +107,35 @@ Stmt DataTypeLegalizer::VisitStmt_(const AttrStmtNode* op) {
   return StmtExprMutator::VisitStmt_(op);
 }
 
+Stmt DataTypeLegalizer::VisitStmt_(const LetStmtNode* op) {
+  PrimExpr value = this->VisitExpr(op->value);
+  auto new_var = op->var.copy_with_dtype(value.dtype());
+
+  if (value.dtype() != op->var->dtype) {
+    var_remap_[op->var.get()] = new_var;
+  }
+
+  Stmt new_body = this->VisitStmt(op->body);
+
+  if (value.same_as(op->value) && new_body.same_as(op->body)) {
+    return GetRef<Stmt>(op);
+  } else if (value.dtype() == op->var->dtype) {
+    auto n = CopyOnWrite(op);
+    n->value = std::move(value);
+    n->body = std::move(new_body);
+    return Stmt(n);
+  } else {
+    return LetStmt(new_var, value, new_body, op->span);
+  }
+}
+
+PrimExpr DataTypeLegalizer::VisitExpr_(const VarNode* op) {
+  if (auto it = var_remap_.find(op); it != var_remap_.end()) {
+    return it->second;
+  }
+  return GetRef<PrimExpr>(op);
+}
+
 PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
   PrimExpr condition = this->VisitExpr(op->condition);
   PrimExpr true_value = this->VisitExpr(op->true_value);
@@ -397,6 +426,9 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const BufferStoreNode* op) {
   Buffer new_buffer = GetRemappedBuffer(op->buffer);
   auto value = this->VisitExpr(op->value);
+  if (new_buffer->dtype != value->dtype && value->dtype.lanes() == 1) {
+    value = cast(new_buffer->dtype, value);
+  }
   auto indices = VisitIndices(op->indices);
 
   if (!new_buffer.same_as(op->buffer) || !value.same_as(op->value) ||
@@ -535,15 +567,10 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
-  if (auto it = var_remap_.find(GetRef<Var>(op)); it != var_remap_.end()) {
-    return (*it).second;
-  }
-  if (is_enabled_ && op->dtype != target_data_type_) {
-    Var new_var = GetRef<Var>(op).copy_with_dtype(target_data_type_);
-    var_remap_.Set(GetRef<Var>(op), new_var);
-    return std::move(new_var);
+  if (is_enabled_ && op->dtype != target_data_type_ && !var_remap_.count(op)) {
+    var_remap_[op] = GetRef<Var>(op).copy_with_dtype(target_data_type_);
   }
-  return GetRef<Var>(op);
+  return DataTypeLegalizer::VisitExpr_(op);
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const CastNode* op) {
diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc
index 2f116a02295f..e1dc2f5bf113 100644
--- a/src/tir/transforms/narrow_datatype.cc
+++ b/src/tir/transforms/narrow_datatype.cc
@@ -233,12 +233,8 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
   }
 
   PrimExpr VisitExpr_(const VarNode* op) final {
-    if (auto it = var_remap_.find(GetRef<Var>(op)); it != var_remap_.end()) {
-      return (*it).second;
-    } else if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
-      Var v = Var(op->name_hint, visitor_.vmap[op]);
-      var_remap_.Set(GetRef<Var>(op), v);
-      return v;
+    if (auto it = visitor_.vmap.find(op); !var_remap_.count(op) && it != visitor_.vmap.end()) {
+      var_remap_[op] = Var(op->name_hint, it->second);
     }
     return Parent::VisitExpr_(op);
   }
@@ -266,9 +262,6 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
  private:
  // the internal visitor to deduce the narrowed dtype
   DataTypeVisitor visitor_;
-  // a map
from Var before rewrite to that after rewrite, - // ensures one old Var maps to exactly one new Var - std::unordered_map vmap_; }; Stmt NarrowDataType(Stmt stmt, int target_bits) { diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py index 271e0a339cbd..c13ede08313d 100644 --- a/tests/python/unittest/test_te_create_primfunc.py +++ b/tests/python/unittest/test_te_create_primfunc.py @@ -18,7 +18,7 @@ import numpy as np import tvm import tvm.testing -from tvm import te, tir, topi +from tvm import te, tir, topi, relay from tvm.script import tir as T import pytest @@ -636,5 +636,58 @@ def test_reshape(): _check_workload(te_reshape, tir_reshape, index_dtype_override="int64") +@T.prim_func +def argmax_expected( + p0: T.Buffer[(T.int64(1), T.int64(64), T.int64(56), T.int64(56)), "uint8"], + p0_red: T.Buffer[(T.int64(1), T.int64(56), T.int64(56)), "int32"], +): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + p0_red_temp_v0 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="int32") + p0_red_temp_v1 = T.alloc_buffer([T.int64(1), T.int64(56), T.int64(56)], dtype="uint8") + for ax0, ax1, ax2, k1 in T.grid(T.int64(1), T.int64(56), T.int64(56), T.int64(64)): + with T.block("p0_red_temp"): + v_ax0, v_ax1, v_ax2, v_k1 = T.axis.remap("SSSR", [ax0, ax1, ax2, k1]) + T.reads(p0[v_ax0, v_k1, v_ax1, v_ax2]) + T.writes(p0_red_temp_v0[v_ax0, v_ax1, v_ax2], p0_red_temp_v1[v_ax0, v_ax1, v_ax2]) + with T.init(): + p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = -1 + p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = T.uint8(0) + v_p0_red_temp_v0: T.int64 = T.Select( + p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2] + or ( + p0_red_temp_v1[v_ax0, v_ax1, v_ax2] == p0[v_ax0, v_k1, v_ax1, v_ax2] + and T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]) < v_k1 + ), + T.Cast("int64", p0_red_temp_v0[v_ax0, v_ax1, v_ax2]), + v_k1, + ) + v_p0_red_temp_v1: T.uint8 = T.Select( + p0_red_temp_v1[v_ax0, v_ax1, v_ax2] > p0[v_ax0, v_k1, v_ax1, v_ax2], + p0_red_temp_v1[v_ax0, v_ax1, v_ax2], + p0[v_ax0, v_k1, v_ax1, v_ax2], + ) + p0_red_temp_v0[v_ax0, v_ax1, v_ax2] = T.Cast("int32", v_p0_red_temp_v0) + p0_red_temp_v1[v_ax0, v_ax1, v_ax2] = v_p0_red_temp_v1 + for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(56), T.int64(56)): + with T.block("p0_red"): + v_ax0, v_ax1, v_ax2 = T.axis.remap("SSS", [ax0, ax1, ax2]) + T.reads(p0_red_temp_v0[v_ax0, v_ax1, v_ax2]) + T.writes(p0_red[v_ax0, v_ax1, v_ax2]) + p0_red[v_ax0, v_ax1, v_ax2] = p0_red_temp_v0[v_ax0, v_ax1, v_ax2] + + +def test_argmax(): + data = relay.var("data", shape=(1, 64, 56, 56), dtype="uint8") + mod = tvm.IRModule.from_expr(relay.argmax(data, axis=1)) + + target = tvm.target.Target("llvm") + + opt_mod, _ = relay.optimize(mod, params={}, target=target) + + prim_func = relay.backend.te_compiler.lower_to_primfunc(opt_mod["main"].body.op, target) + + tvm.ir.assert_structural_equal(prim_func, argmax_expected) + + if __name__ == "__main__": tvm.testing.main() From a49f448a1c6b2caadf6d7b8375ed69ab6f28d751 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 6 Jan 2023 21:15:38 -0800 Subject: [PATCH 127/286] [Fix,AutoScheduler] Handle 0-dim buffers in featurization (#13718) 0-dim buffers get lowered to a 0-dim buffer and a 1-dim buffer decl. We explicitly check for this case and return a stride of 1. 
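
For illustration, here is a minimal sketch of the kind of function this
guards (it mirrors the `zero_dim` regression test added below; the names
are illustrative only):

    from tvm import auto_scheduler
    from tvm.script import tir as T

    @T.prim_func
    def broadcast_scalar(p: T.Buffer[(), "float32"], out: T.Buffer[(768,), "float32"]):
        # The 0-dim parameter is read through a 1-dim buffer decl over the
        # same data, and the only index is the constant 0, so there is no
        # loop variable from which ComputeStride could derive a stride.
        p_1 = T.buffer_decl([1], dtype="float32", data=p.data)
        for i in T.serial(768):
            out[i] = p_1[0]

    # With the explicit check, featurization reports a stride of 1 here.
    features = auto_scheduler.feature.named_features_from_primfunc(broadcast_scalar)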
---
 src/auto_scheduler/feature.cc                 |  5 ++++
 .../unittest/test_auto_scheduler_feature.py   | 26 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 4ce7ad13bc60..884215c24a13 100644
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -443,6 +443,11 @@ class CoefficientExtractor : public StmtExprVisitor {
 // Compute stride for the accesses to a buffer
 int64_t ComputeStride(const std::vector<std::vector<PrimExpr>>& indices,
                       const std::vector<int64_t>& shape, const VarNode* stride_var) {
+  // Use stride of 1 for 0-dimensional buffers. 0-dim buffers have a single
+  // index access, so we have to check here.
+  if (shape.size() == 0) {
+    return 1;
+  }
   int64_t min_stride = std::numeric_limits<int64_t>::max();
   bool find = false;
   CoefficientExtractor extractor;
diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py
index 3f435366e176..8be6e0a8f2ed 100644
--- a/tests/python/unittest/test_auto_scheduler_feature.py
+++ b/tests/python/unittest/test_auto_scheduler_feature.py
@@ -273,6 +273,32 @@ def test_negative_extent():
     assert features["B0.unique_bytes"] == 0
 
 
+@T.prim_func
+def zero_dim(
+    p2: T.Buffer[(), "float32"],
+    T_cast: T.Buffer[(T.int64(1), T.int64(768)), "int8"],
+):
+    # function attr dict
+    T.func_attr(
+        {
+            "tir.noalias": True,
+            "Primitive": 1,
+        }
+    )
+    # buffer definition
+    T_cast_1 = T.buffer_decl([T.int64(768)], dtype="int8", data=T_cast.data)
+    p2_1 = T.buffer_decl([1], dtype="float32", data=p2.data)
+    # body
+    for i0_i1_fused in T.serial(768):
+        T_cast_1[i0_i1_fused] = p2_1[0]
+
+
+def test_zero_dim():
+    features = auto_scheduler.feature.named_features_from_primfunc(zero_dim)
+    assert features["B1.stride"] == 1
+    assert features["B0.stride"] == 1
+
+
 if __name__ == "__main__":
     test_cpu_matmul()
     test_cpu_fusion()

From 691c28fed017144d29ca3d029856f733c4838a9d Mon Sep 17 00:00:00 2001
From: Yaxing Cai
Date: Fri, 6 Jan 2023 21:52:53 -0800
Subject: [PATCH 128/286] [TVMScript] Linter-friendly function definitions
 (#13713)

Initially, functions like `T.int8`, `T.uint32` and `T.float64x64` were
generated in nested loops and registered through `globals()` to keep the
code compact. However, linters such as Pylint may not be able to look
into `globals()`, where those functions were defined implicitly. This PR
spells out the definitions of these functions explicitly for a better
linter experience.
---
 python/tvm/script/ir_builder/tir/ir.py | 205 +++++++++++++++++++++----
 1 file changed, 174 insertions(+), 31 deletions(-)

diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index ac1e990a96e2..48b283447969 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1229,36 +1229,107 @@ def evaluate(value: PrimExpr) -> None:
     return _ffi_api.Evaluate(value)  # type: ignore[attr-defined] # pylint: disable=no-member
 
 
-__all__ = []
-for _dtype in ["Float", "UInt", "Int"]:
-    for _size in ["8", "16", "32", "64"]:
-        for _lanes in ["", "x4", "x8", "x16", "x32", "x64"]:
-            _name = _dtype + _size + _lanes  # pylint: disable=invalid-name
-
-            def func_gen(name: str):
-                """Generate a function for each PrimExpr dtype.
-
-                Parameters
-                ----------
-                name: str
-                    The ffi function name to call.
- """ - - def func( - expr: Union[ - None, - PrimExpr, - Literal["inf", "-inf", "nan"], - ] = None - ) -> PrimExpr: - if isinstance(expr, str): - expr = float(expr) - return getattr(_ffi_api, name)(expr) - - return func - - globals()[_name.lower()] = func_gen(_name) - __all__.append(_name.lower()) +def func_gen(name: str): + """Generate a function for each PrimExpr dtype. + + Parameters + ---------- + name: str + The ffi function name to call. + """ + + def func( + expr: Union[ + None, + PrimExpr, + Literal["inf", "-inf", "nan"], + int, + float, + ] = None + ) -> PrimExpr: + if isinstance(expr, str): + expr = float(expr) + return getattr(_ffi_api, name)(expr) + + return func + + +# pylint: disable=invalid-name +int8 = func_gen(("Int8")) +int16 = func_gen(("Int16")) +int32 = func_gen(("Int32")) +int64 = func_gen(("Int64")) +int8x4 = func_gen(("Int8x4")) +int16x4 = func_gen(("Int16x4")) +int32x4 = func_gen(("Int32x4")) +int64x4 = func_gen(("Int64x4")) +int8x8 = func_gen(("Int8x8")) +int16x8 = func_gen(("Int16x8")) +int32x8 = func_gen(("Int32x8")) +int64x8 = func_gen(("Int64x8")) +int8x16 = func_gen(("Int8x16")) +int16x16 = func_gen(("Int16x16")) +int32x16 = func_gen(("Int32x16")) +int64x16 = func_gen(("Int64x16")) +int8x32 = func_gen(("Int8x32")) +int16x32 = func_gen(("Int16x32")) +int32x32 = func_gen(("Int32x32")) +int64x32 = func_gen(("Int64x32")) +int8x64 = func_gen(("Int8x64")) +int16x64 = func_gen(("Int16x64")) +int32x64 = func_gen(("Int32x64")) +int64x64 = func_gen(("Int64x64")) + +uint8 = func_gen(("UInt8")) +uint16 = func_gen(("UInt16")) +uint32 = func_gen(("UInt32")) +uint64 = func_gen(("UInt64")) +uint8x4 = func_gen(("UInt8x4")) +uint16x4 = func_gen(("UInt16x4")) +uint32x4 = func_gen(("UInt32x4")) +uint64x4 = func_gen(("UInt64x4")) +uint8x8 = func_gen(("UInt8x8")) +uint16x8 = func_gen(("UInt16x8")) +uint32x8 = func_gen(("UInt32x8")) +uint64x8 = func_gen(("UInt64x8")) +uint8x16 = func_gen(("UInt8x16")) +uint16x16 = func_gen(("UInt16x16")) +uint32x16 = func_gen(("UInt32x16")) +uint64x16 = func_gen(("UInt64x16")) +uint8x32 = func_gen(("UInt8x32")) +uint16x32 = func_gen(("UInt16x32")) +uint32x32 = func_gen(("UInt32x32")) +uint64x32 = func_gen(("UInt64x32")) +uint8x64 = func_gen(("UInt8x64")) +uint16x64 = func_gen(("UInt16x64")) +uint32x64 = func_gen(("UInt32x64")) +uint64x64 = func_gen(("UInt64x64")) + +float8 = func_gen(("Float8")) +float16 = func_gen(("Float16")) +float32 = func_gen(("Float32")) +float64 = func_gen(("Float64")) +float8x4 = func_gen(("Float8x4")) +float16x4 = func_gen(("Float16x4")) +float32x4 = func_gen(("Float32x4")) +float64x4 = func_gen(("Float64x4")) +float8x8 = func_gen(("Float8x8")) +float16x8 = func_gen(("Float16x8")) +float32x8 = func_gen(("Float32x8")) +float64x8 = func_gen(("Float64x8")) +float8x16 = func_gen(("Float8x16")) +float16x16 = func_gen(("Float16x16")) +float32x16 = func_gen(("Float32x16")) +float64x16 = func_gen(("Float64x16")) +float8x32 = func_gen(("Float8x32")) +float16x32 = func_gen(("Float16x32")) +float32x32 = func_gen(("Float32x32")) +float64x32 = func_gen(("Float64x32")) +float8x64 = func_gen(("Float8x64")) +float16x64 = func_gen(("Float16x64")) +float32x64 = func_gen(("Float32x64")) +float64x64 = func_gen(("Float64x64")) +# pylint: enable=invalid-name def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -1621,7 +1692,79 @@ def f(): # pylint: enable=invalid-name -__all__ += [ +__all__ = [ + "int8", + "int16", + "int32", + "int64", + "int8x4", + "int16x4", + "int32x4", + "int64x4", + "int8x8", + "int16x8", + "int32x8", + "int64x8", 
+ "int8x16", + "int16x16", + "int32x16", + "int64x16", + "int8x32", + "int16x32", + "int32x32", + "int64x32", + "int8x64", + "int16x64", + "int32x64", + "int64x64", + "uint8", + "uint16", + "uint32", + "uint64", + "uint8x4", + "uint16x4", + "uint32x4", + "uint64x4", + "uint8x8", + "uint16x8", + "uint32x8", + "uint64x8", + "uint8x16", + "uint16x16", + "uint32x16", + "uint64x16", + "uint8x32", + "uint16x32", + "uint32x32", + "uint64x32", + "uint8x64", + "uint16x64", + "uint32x64", + "uint64x64", + "float8", + "float16", + "float32", + "float64", + "float8x4", + "float16x4", + "float32x4", + "float64x4", + "float8x8", + "float16x8", + "float32x8", + "float64x8", + "float8x16", + "float16x16", + "float32x16", + "float64x16", + "float8x32", + "float16x32", + "float32x32", + "float64x32", + "float8x64", + "float16x64", + "float32x64", + "float64x64", "buffer_decl", "prim_func", "arg", From 469acae211701f9b64df16e679d6aa03ea70ce94 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Sat, 7 Jan 2023 05:54:07 +0000 Subject: [PATCH 129/286] [Tests] Replace pytest.main with tvm.testing.main (#13717) This ensures that if you want to run a specific test script then at least it's reasonably consistent and as people copy test files they'll use the new function :smile_cat: --- tests/micro/stm32/test_code_emitter.py | 2 +- tests/python/contrib/test_cublas.py | 2 +- .../cascader/test_ethosu_binary_elementwise_matcher.py | 2 +- .../contrib/test_ethosu/cascader/test_ethosu_block_config.py | 2 +- .../test_ethosu/cascader/test_ethosu_conv2d_matcher.py | 2 +- .../test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py | 2 +- .../test_ethosu/cascader/test_ethosu_identity_matcher.py | 2 +- .../test_ethosu/cascader/test_ethosu_inline_matcher.py | 2 +- .../python/contrib/test_ethosu/cascader/test_ethosu_part.py | 2 +- .../test_ethosu/cascader/test_ethosu_part_performance.py | 2 +- .../test_ethosu/cascader/test_ethosu_pooling_matcher.py | 2 +- .../cascader/test_ethosu_unary_elementwise_matcher.py | 2 +- tests/python/contrib/test_ethosu/cascader/test_graph.py | 2 +- tests/python/contrib/test_ethosu/cascader/test_pareto.py | 2 +- tests/python/contrib/test_ethosu/cascader/test_plan.py | 2 +- .../contrib/test_ethosu/cascader/test_plan_generator.py | 2 +- tests/python/contrib/test_ethosu/cascader/test_propagator.py | 2 +- .../contrib/test_ethosu/cascader/test_proposal_generator.py | 2 +- tests/python/contrib/test_ethosu/cascader/test_scheduler.py | 2 +- .../contrib/test_ethosu/cascader/test_stripe_config.py | 2 +- .../contrib/test_ethosu/cascader/test_tensor_config.py | 2 +- tests/python/contrib/test_ethosu/test_attr_passing.py | 2 +- tests/python/contrib/test_ethosu/test_codegen.py | 5 +---- tests/python/contrib/test_ethosu/test_compiler.py | 2 +- .../contrib/test_ethosu/test_copy_compute_reordering.py | 2 +- tests/python/contrib/test_ethosu/test_create_tiles.py | 2 +- tests/python/contrib/test_ethosu/test_encode_constants.py | 2 +- tests/python/contrib/test_ethosu/test_extract_constants.py | 2 +- tests/python/contrib/test_ethosu/test_layout_optimizer.py | 2 +- tests/python/contrib/test_ethosu/test_legalize.py | 2 +- tests/python/contrib/test_ethosu/test_lookup_table.py | 2 +- tests/python/contrib/test_ethosu/test_lower_to_te.py | 2 +- tests/python/contrib/test_ethosu/test_networks.py | 2 +- tests/python/contrib/test_ethosu/test_preprocess.py | 2 +- tests/python/contrib/test_ethosu/test_remove_concatenates.py | 2 +- .../contrib/test_ethosu/test_replace_binary_elementwise.py | 2 +- 
tests/python/contrib/test_ethosu/test_replace_conv2d.py | 2 +- tests/python/contrib/test_ethosu/test_replace_copy.py | 2 +- tests/python/contrib/test_ethosu/test_replace_identity.py | 2 +- tests/python/contrib/test_ethosu/test_replace_pooling.py | 2 +- .../contrib/test_ethosu/test_replace_unary_elementwise.py | 2 +- tests/python/contrib/test_ethosu/test_rolling_buffer.py | 2 +- tests/python/contrib/test_ethosu/test_scheduler.py | 2 +- .../python/contrib/test_ethosu/test_tir_to_cs_translator.py | 2 +- tests/python/contrib/test_ethosu/test_type_inference.py | 2 +- tests/python/contrib/test_ethosu/test_vela_api.py | 2 +- tests/python/contrib/test_libtorch_ops.py | 2 +- tests/python/contrib/test_nnpack.py | 2 +- tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py | 2 +- .../contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py | 2 +- tests/python/driver/tvmc/test_pass_list.py | 2 +- tests/python/frontend/mxnet/test_forward.py | 2 +- tests/python/frontend/paddlepaddle/test_forward.py | 2 +- tests/python/frontend/tensorflow/test_forward.py | 2 +- tests/python/frontend/tensorflow2/test_functional_models.py | 2 +- tests/python/frontend/tensorflow2/test_sequential_models.py | 2 +- tests/python/integration/test_reduce.py | 3 +-- tests/python/integration/test_winograd_nnpack.py | 4 +--- tests/python/relay/test_adt.py | 2 +- tests/python/relay/test_analysis_basic_block_normal_form.py | 2 +- .../python/relay/test_analysis_extract_intermediate_expr.py | 2 +- tests/python/relay/test_analysis_extract_operators.py | 2 +- tests/python/relay/test_auto_scheduler_task_extraction.py | 2 +- tests/python/relay/test_backend_graph_executor.py | 2 +- tests/python/relay/test_backend_interpreter.py | 2 +- tests/python/relay/test_call_graph.py | 2 +- tests/python/relay/test_name_mangling.py | 2 +- tests/python/relay/test_op_grad_level2.py | 2 +- tests/python/relay/test_op_grad_level3.py | 2 +- tests/python/relay/test_op_grad_level4.py | 2 +- tests/python/relay/test_op_level1.py | 2 +- tests/python/relay/test_op_level6.py | 2 +- tests/python/relay/test_pass_alter_op_layout.py | 2 +- tests/python/relay/test_pass_convert_op_layout.py | 2 +- tests/python/relay/test_pass_defunctionalization.py | 2 +- tests/python/relay/test_pass_defuse_ops.py | 2 +- tests/python/relay/test_pass_dynamic_to_static.py | 2 +- tests/python/relay/test_pass_flexible_shape_dispatch.py | 2 +- tests/python/relay/test_pass_fuse_ops.py | 2 +- tests/python/relay/test_pass_gradient.py | 2 +- tests/python/relay/test_pass_inline.py | 2 +- tests/python/relay/test_pass_lambda_lift.py | 2 +- tests/python/relay/test_pass_manager.py | 2 +- tests/python/relay/test_pass_merge_composite.py | 2 +- tests/python/relay/test_pass_remove_unused_functions.py | 2 +- tests/python/relay/test_pass_simplify_expr.py | 2 +- tests/python/relay/test_pass_to_basic_block_normal_form.py | 2 +- tests/python/relay/test_pass_unmatched_cases.py | 2 +- tests/python/relay/test_pipeline_executor.py | 2 +- tests/python/relay/test_tensor_array.py | 2 +- tests/python/relay/test_to_mixed_precision.py | 2 +- tests/python/relay/test_type_infer.py | 4 +--- tests/python/relay/test_vm_serialization.py | 2 +- tests/python/unittest/test_aot_legalize_packed_call.py | 2 +- tests/python/unittest/test_arith_deduce_bound.py | 2 +- tests/python/unittest/test_arith_rewrite_simplify.py | 2 +- tests/python/unittest/test_arith_solve_linear_equations.py | 2 +- tests/python/unittest/test_arith_solve_linear_inequality.py | 2 +- tests/python/unittest/test_ir_container.py | 2 +- 
tests/python/unittest/test_micro_model_library_format.py | 2 +- tests/python/unittest/test_target_codegen_cuda.py | 2 +- tests/python/unittest/test_tir_analysis_stmt_finding.py | 2 +- tests/python/unittest/test_tir_buffer.py | 2 +- tests/python/unittest/test_tir_nodes.py | 2 +- tests/python/unittest/test_tir_te_extern_primfunc.py | 2 +- .../unittest/test_tir_transform_convert_for_loops_serial.py | 2 +- tests/python/unittest/test_tir_transform_hoist_if.py | 2 +- .../unittest/test_tir_transform_inject_rolling_buffer.py | 2 +- tests/python/unittest/test_tir_transform_ir_utils.py | 2 +- .../python/unittest/test_tir_transform_lower_warp_memory.py | 2 +- .../python/unittest/test_tir_transform_make_unpacked_api.py | 2 +- .../unittest/test_tir_usmp_analysis_extract_bufferinfo.py | 2 +- ...tir_usmp_transform_convert_pool_allocations_to_offsets.py | 2 +- tests/python/unittest/test_tir_usmp_utils.py | 2 +- tests/python/unittest/test_type_annotation_checker.py | 3 ++- 115 files changed, 116 insertions(+), 123 deletions(-) diff --git a/tests/micro/stm32/test_code_emitter.py b/tests/micro/stm32/test_code_emitter.py index 01bfaefc18cf..0d1290acfcb2 100644 --- a/tests/micro/stm32/test_code_emitter.py +++ b/tests/micro/stm32/test_code_emitter.py @@ -392,4 +392,4 @@ def test_mnist(): if __name__ == "__main__": - sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 0ae1e8e9ad5b..f3e3aa74af0e 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -378,4 +378,4 @@ def test_relay_cublas_dense(n, m, k, in_dtype, out_dtype): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py index 062e5ba0fafd..ffc24ca0067e 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_binary_elementwise_matcher.py @@ -175,4 +175,4 @@ def test_ethosu_binary_elementwise_matcher( if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 66d9b4647cbe..8b372e45c37f 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -457,4 +457,4 @@ def test_force_block_config_elementwise(ofm_layout, block_config_str, expected_b if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py index 76adb0b4cbd4..f9f2312ba7a9 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_conv2d_matcher.py @@ -179,4 +179,4 @@ def test_ethosu_conv2d_block_config_from_matcher( if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py index 1e6b6d58b24a..8625a4844405 100644 --- 
a/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_depthwise2d_matcher.py @@ -100,4 +100,4 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py index 4609a5bc3779..4bdccfced33f 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_identity_matcher.py @@ -55,4 +55,4 @@ def test_ethosu_identity_matcher(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_inline_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_inline_matcher.py index 1eebbe40c1b3..ff5530d433f6 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_inline_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_inline_matcher.py @@ -47,4 +47,4 @@ def test_ethosu_inline_matcher(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py index 105b6722e8c6..22196e237e3d 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py @@ -57,4 +57,4 @@ def test_ethosu_part(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py index 437b0a9ead9d..f68e29559743 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py @@ -231,4 +231,4 @@ def test_conv_performance( if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py index b998ddaf7045..38aeee05f936 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_pooling_matcher.py @@ -79,4 +79,4 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py index 8139f1518f56..e79c75c00cb0 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_unary_elementwise_matcher.py @@ -131,4 +131,4 @@ def test_ethosu_unary_elementwise_matcher(ofm_shape, ifm_layout, ofm_layout, op_ if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_graph.py b/tests/python/contrib/test_ethosu/cascader/test_graph.py index 616800f69d7e..c3d5c0fd0061 100644 --- 
a/tests/python/contrib/test_ethosu/cascader/test_graph.py +++ b/tests/python/contrib/test_ethosu/cascader/test_graph.py @@ -201,4 +201,4 @@ def test_create_diamond_graph(MobileNetv2DiamondTE): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py index baf8739c0878..65d3619c64bb 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_pareto.py +++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py @@ -146,4 +146,4 @@ def _make_plans(num): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan.py b/tests/python/contrib/test_ethosu/cascader/test_plan.py index ddc40b49ac8a..0d33743cd945 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_plan.py +++ b/tests/python/contrib/test_ethosu/cascader/test_plan.py @@ -241,4 +241,4 @@ def test_plan_merge(DRAM, SRAM): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py index c35ad15e2363..c0d2a2f0c944 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py +++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py @@ -302,4 +302,4 @@ def test_plan_generator_two_conv2d_with_slice(FLASH, SRAM, TwoConv2DWithSliceGra if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_propagator.py b/tests/python/contrib/test_ethosu/cascader/test_propagator.py index 2a6f442f1221..9712d00e52a9 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_propagator.py +++ b/tests/python/contrib/test_ethosu/cascader/test_propagator.py @@ -133,4 +133,4 @@ def test_propagate(propagator, input_stripe_config, output_stripe_config): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_proposal_generator.py b/tests/python/contrib/test_ethosu/cascader/test_proposal_generator.py index b1cba8dfd930..8a573c05fa2a 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_proposal_generator.py +++ b/tests/python/contrib/test_ethosu/cascader/test_proposal_generator.py @@ -157,4 +157,4 @@ def test_generate_proposals_mobilenetv1_disable_striping(FLASH, SRAM, MobileNetv if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py index 2dce6dfdd67e..417aeb9ed67f 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py @@ -80,4 +80,4 @@ def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/cascader/test_stripe_config.py b/tests/python/contrib/test_ethosu/cascader/test_stripe_config.py index 2ca1838b7f34..f0142167e76a 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_stripe_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_stripe_config.py @@ -212,4 +212,4 @@ def test_count_stripes_sliding_window(stripe_config, expected_stripe_counts): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff 
--git a/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py index 68290e667eb0..eaab420fbfba 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_tensor_config.py @@ -107,4 +107,4 @@ def test_get_recompute_buffer(DRAM): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_attr_passing.py b/tests/python/contrib/test_ethosu/test_attr_passing.py index bb8b4491eed0..a770a2668a01 100644 --- a/tests/python/contrib/test_ethosu/test_attr_passing.py +++ b/tests/python/contrib/test_ethosu/test_attr_passing.py @@ -45,4 +45,4 @@ def test_compiler_attr_default(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 89c87325baaf..e06e36638d7f 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -1144,7 +1144,4 @@ def fully_connected(x): if __name__ == "__main__": - import sys - import pytest - - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_compiler.py b/tests/python/contrib/test_ethosu/test_compiler.py index 5da91632bd86..3bf7abb8f113 100644 --- a/tests/python/contrib/test_ethosu/test_compiler.py +++ b/tests/python/contrib/test_ethosu/test_compiler.py @@ -63,4 +63,4 @@ def test_lower_to_tir_arg_count(relay_function, arg_count): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py index 8c598fe0d794..586b8b380e22 100644 --- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -679,4 +679,4 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208 if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_create_tiles.py b/tests/python/contrib/test_ethosu/test_create_tiles.py index 77b69df91116..d51c438cbf4e 100644 --- a/tests/python/contrib/test_ethosu/test_create_tiles.py +++ b/tests/python/contrib/test_ethosu/test_create_tiles.py @@ -167,4 +167,4 @@ def main(placeholder1: T.Buffer[(100,), "int8"], placeholder2: T.Buffer[(100,), if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 61128da71c37..a70e091b2cee 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -529,4 +529,4 @@ def get_graph(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_extract_constants.py b/tests/python/contrib/test_ethosu/test_extract_constants.py index 98094d8a4ed4..c5646b2c1229 100644 --- a/tests/python/contrib/test_ethosu/test_extract_constants.py +++ b/tests/python/contrib/test_ethosu/test_extract_constants.py @@ -96,4 +96,4 @@ def _expected(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git 
a/tests/python/contrib/test_ethosu/test_layout_optimizer.py b/tests/python/contrib/test_ethosu/test_layout_optimizer.py index 4e134c9f4df0..9cc3637fdf8c 100644 --- a/tests/python/contrib/test_ethosu/test_layout_optimizer.py +++ b/tests/python/contrib/test_ethosu/test_layout_optimizer.py @@ -737,4 +737,4 @@ def get_graph(): if __name__ == "__main__": - pytest.main([__file__] + sys.argv[1:]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py index 0f8fa4d84bf7..9b4dd467ff9f 100644 --- a/tests/python/contrib/test_ethosu/test_legalize.py +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -2807,4 +2807,4 @@ def representative_dataset(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_lookup_table.py b/tests/python/contrib/test_ethosu/test_lookup_table.py index 8e044b5b9929..e2b22897a0ab 100644 --- a/tests/python/contrib/test_ethosu/test_lookup_table.py +++ b/tests/python/contrib/test_ethosu/test_lookup_table.py @@ -172,4 +172,4 @@ def test_random_lut(accel_type): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_lower_to_te.py b/tests/python/contrib/test_ethosu/test_lower_to_te.py index c6b4ae05d3a5..9ec59af44163 100644 --- a/tests/python/contrib/test_ethosu/test_lower_to_te.py +++ b/tests/python/contrib/test_ethosu/test_lower_to_te.py @@ -60,4 +60,4 @@ def test_ethosu_conv2d(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index 2b4ffd96caef..0df73d6dc561 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -199,4 +199,4 @@ def test_networks_with_usmp_and_cascader_with_striping(accel_type, model_url, wo if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_preprocess.py b/tests/python/contrib/test_ethosu/test_preprocess.py index 41831f270d12..0a0aa2cf69a6 100644 --- a/tests/python/contrib/test_ethosu/test_preprocess.py +++ b/tests/python/contrib/test_ethosu/test_preprocess.py @@ -340,4 +340,4 @@ def create_external_func1(mod_, compiler_name, symbol_name): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py index 379a35b1b4a4..4b4ba52b86f6 100644 --- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py +++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py @@ -81,4 +81,4 @@ def _get_func(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py b/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py index 330d5f7aa92a..c1f82fbb7a0f 100644 --- a/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py +++ b/tests/python/contrib/test_ethosu/test_replace_binary_elementwise.py @@ -341,4 +341,4 @@ def _visit(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py index 46c6976567c8..649f2a611d50 100644 --- 
a/tests/python/contrib/test_ethosu/test_replace_conv2d.py +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -818,4 +818,4 @@ def _get_func(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 7da3d7e5be82..07124c62ae8b 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -132,4 +132,4 @@ def _get_func(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_identity.py b/tests/python/contrib/test_ethosu/test_replace_identity.py index 2155d33f43c0..775ef1260665 100644 --- a/tests/python/contrib/test_ethosu/test_replace_identity.py +++ b/tests/python/contrib/test_ethosu/test_replace_identity.py @@ -113,4 +113,4 @@ def _visit(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_pooling.py b/tests/python/contrib/test_ethosu/test_replace_pooling.py index 8e8ed3f351e1..564701637856 100644 --- a/tests/python/contrib/test_ethosu/test_replace_pooling.py +++ b/tests/python/contrib/test_ethosu/test_replace_pooling.py @@ -278,4 +278,4 @@ def _visit(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py b/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py index e48016180b7a..f61ace0d51ec 100644 --- a/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py +++ b/tests/python/contrib/test_ethosu/test_replace_unary_elementwise.py @@ -154,4 +154,4 @@ def _visit(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_rolling_buffer.py b/tests/python/contrib/test_ethosu/test_rolling_buffer.py index 8d348823d755..58376d8b614c 100644 --- a/tests/python/contrib/test_ethosu/test_rolling_buffer.py +++ b/tests/python/contrib/test_ethosu/test_rolling_buffer.py @@ -100,4 +100,4 @@ def _cascader(cached_func, const_dict, sch): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 695aed0d1919..1e9b43b47ada 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -234,4 +234,4 @@ def test_copy_constants_fully_connected_weights(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index f8a84aa08367..f205bc3b26ca 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -1499,4 +1499,4 @@ def populate_ethosu_binary_elementwise_calls(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_type_inference.py b/tests/python/contrib/test_ethosu/test_type_inference.py index 1f304c117ffd..380d7532f8c1 100644 --- a/tests/python/contrib/test_ethosu/test_type_inference.py +++ b/tests/python/contrib/test_ethosu/test_type_inference.py @@ -475,4 +475,4 @@ def 
test_ethosu_unary_elementwise_invalid_dtype(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py index 75ca22d08202..9f95e4b70925 100644 --- a/tests/python/contrib/test_ethosu/test_vela_api.py +++ b/tests/python/contrib/test_ethosu/test_vela_api.py @@ -560,4 +560,4 @@ def verify(test_vec, mock_enc_w): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py index 2bfb78b407aa..153232a2f531 100644 --- a/tests/python/contrib/test_libtorch_ops.py +++ b/tests/python/contrib/test_libtorch_ops.py @@ -90,4 +90,4 @@ def script_fn(x, y): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py index c693af7e4198..881226725ac3 100644 --- a/tests/python/contrib/test_nnpack.py +++ b/tests/python/contrib/test_nnpack.py @@ -217,4 +217,4 @@ def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias= if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py index e9195db88c5b..058faa8a24e6 100644 --- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py +++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_codegen.py @@ -380,4 +380,4 @@ def expected(): if sys.platform == "win32": print("Skip test on Windows for now") sys.exit(0) - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py index 0a2c13c5af60..4273f5fa34d5 100644 --- a/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py +++ b/tests/python/contrib/test_vitis_ai/test_vitis_ai_runtime_cpu_part.py @@ -84,4 +84,4 @@ def test_extern_vitis_ai_resnet18(dpu_target): if sys.platform == "win32": print("Skip test on Windows for now") sys.exit(0) - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/driver/tvmc/test_pass_list.py b/tests/python/driver/tvmc/test_pass_list.py index f43da6371b9b..5b6c6710158d 100644 --- a/tests/python/driver/tvmc/test_pass_list.py +++ b/tests/python/driver/tvmc/test_pass_list.py @@ -33,4 +33,4 @@ def test_parse_pass_list_str(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 0e34719ea27d..880416c7bee8 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -2363,4 +2363,4 @@ def test_forward_split_v2( if __name__ == "__main__": - pytest.main(["test_forward.py"]) + tvm.testing.main() diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index ba983eb0878e..c0e54657a950 100644 --- a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -1682,4 +1682,4 @@ def forward(self, inputs, prev_h): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index f3195f05d40f..dce18ee231d3 100755 --- 
a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -5835,4 +5835,4 @@ def test_forward_dense_bincount(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/frontend/tensorflow2/test_functional_models.py b/tests/python/frontend/tensorflow2/test_functional_models.py index 42ad5b29af79..53ece82217a1 100644 --- a/tests/python/frontend/tensorflow2/test_functional_models.py +++ b/tests/python/frontend/tensorflow2/test_functional_models.py @@ -646,4 +646,4 @@ def func(self, x): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/frontend/tensorflow2/test_sequential_models.py b/tests/python/frontend/tensorflow2/test_sequential_models.py index 1b5a6342f07d..2ad41508630c 100644 --- a/tests/python/frontend/tensorflow2/test_sequential_models.py +++ b/tests/python/frontend/tensorflow2/test_sequential_models.py @@ -165,4 +165,4 @@ def call(self, inputs): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index eaac8ed26684..283eab3eea4c 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """Test scheduling of reduction operations.""" -import pytest import numpy as np import tvm @@ -675,4 +674,4 @@ def check_store_dst_remapped(op): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 9d9f4e10e646..d53dc21d6328 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -183,6 +183,4 @@ def test_conv2d_nchw(): if __name__ == "__main__": - import pytest - - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 8cf31f94378e..655ab11ee0a0 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -821,4 +821,4 @@ def test_iterate(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_analysis_basic_block_normal_form.py b/tests/python/relay/test_analysis_basic_block_normal_form.py index 5395be35ad63..558f55ef40d5 100644 --- a/tests/python/relay/test_analysis_basic_block_normal_form.py +++ b/tests/python/relay/test_analysis_basic_block_normal_form.py @@ -211,4 +211,4 @@ def test_higher_order_nested(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_analysis_extract_intermediate_expr.py b/tests/python/relay/test_analysis_extract_intermediate_expr.py index abcaf880b4aa..57585552b4a1 100644 --- a/tests/python/relay/test_analysis_extract_intermediate_expr.py +++ b/tests/python/relay/test_analysis_extract_intermediate_expr.py @@ -127,4 +127,4 @@ def expected_4(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_analysis_extract_operators.py b/tests/python/relay/test_analysis_extract_operators.py index 5878b2a6e497..5218fbf7003e 100644 --- a/tests/python/relay/test_analysis_extract_operators.py +++ b/tests/python/relay/test_analysis_extract_operators.py @@ -104,4 +104,4 @@ def test_extract_resnet(): if __name__ == "__main__": - 
pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py index a53b68cca885..9dbc653da23f 100644 --- a/tests/python/relay/test_auto_scheduler_task_extraction.py +++ b/tests/python/relay/test_auto_scheduler_task_extraction.py @@ -291,4 +291,4 @@ def counting_unique_hash(str_dag): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py index 0522c0db1075..179077e8742d 100644 --- a/tests/python/relay/test_backend_graph_executor.py +++ b/tests/python/relay/test_backend_graph_executor.py @@ -527,4 +527,4 @@ def test_benchmark_end_to_end_rpc(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index af2dcf32c305..3c94452311de 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -286,4 +286,4 @@ def test_functional_returns(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_call_graph.py b/tests/python/relay/test_call_graph.py index a5978248901c..26106c31d5ce 100644 --- a/tests/python/relay/test_call_graph.py +++ b/tests/python/relay/test_call_graph.py @@ -145,4 +145,4 @@ def test_recursive_func(): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_name_mangling.py b/tests/python/relay/test_name_mangling.py index 7b0a60f74587..46195d1fa215 100644 --- a/tests/python/relay/test_name_mangling.py +++ b/tests/python/relay/test_name_mangling.py @@ -35,4 +35,4 @@ def test_mangle_mod_name(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 820f724bfc43..32e7d2ca3867 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -364,4 +364,4 @@ def test_conv2d_backward_weight_infer_type(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 89b8199b9e22..4ca7cb9ce07f 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -194,4 +194,4 @@ def test_zeros_ones_grad_dynamic(executor_kind): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py index 9ed2ef262777..b85e692c5fe2 100644 --- a/tests/python/relay/test_op_grad_level4.py +++ b/tests/python/relay/test_op_grad_level4.py @@ -123,4 +123,4 @@ def check(sh, dtype, begin, end, strides, slice_mode): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 9f31acfa6d7f..3bb9918c7c77 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -911,4 +911,4 @@ def @main(%p0844: Tensor[(1, 384), int64], %p1652: Tensor[(2016, 128), float16]) if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 
78db5b87385d..47cf73d6915d 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -173,4 +173,4 @@ def verify_searchsorted(right, dtype): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 3fd7cb69771b..4caab0ea095b 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1948,4 +1948,4 @@ def test_alter_with_reduce(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 223926a8779c..72d0232100dc 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -2786,4 +2786,4 @@ def expected(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_defunctionalization.py b/tests/python/relay/test_pass_defunctionalization.py index 30f2203be0b5..a01c1c7e39d7 100644 --- a/tests/python/relay/test_pass_defunctionalization.py +++ b/tests/python/relay/test_pass_defunctionalization.py @@ -227,4 +227,4 @@ def @main(%l: List[int32]) -> int32 { if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_defuse_ops.py b/tests/python/relay/test_pass_defuse_ops.py index f123bd582b87..ec6431ee269a 100644 --- a/tests/python/relay/test_pass_defuse_ops.py +++ b/tests/python/relay/test_pass_defuse_ops.py @@ -212,4 +212,4 @@ def golden_defused(conv_layer1_weight, conv_layer2_weight): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index f523ad2a27fa..7d492b4fc3f4 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -638,4 +638,4 @@ def test_dynamic_to_static_dynamic_if(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_flexible_shape_dispatch.py b/tests/python/relay/test_pass_flexible_shape_dispatch.py index a6d547f4f54b..86ccb25db54c 100644 --- a/tests/python/relay/test_pass_flexible_shape_dispatch.py +++ b/tests/python/relay/test_pass_flexible_shape_dispatch.py @@ -116,4 +116,4 @@ def test_multiple_outputs(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index fe662a30766c..06c93fbc5549 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -829,4 +829,4 @@ def expected(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py index 126fcf22e823..33f0775b2d87 100644 --- a/tests/python/relay/test_pass_gradient.py +++ b/tests/python/relay/test_pass_gradient.py @@ -457,4 +457,4 @@ def test_global_function(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_inline.py b/tests/python/relay/test_pass_inline.py index fb58b9032e5a..f5898774f50b 100644 --- a/tests/python/relay/test_pass_inline.py +++ b/tests/python/relay/test_pass_inline.py @@
-827,4 +827,4 @@ def expected(): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_pass_lambda_lift.py b/tests/python/relay/test_pass_lambda_lift.py index ce737b7bedbb..518a8c3078b6 100644 --- a/tests/python/relay/test_pass_lambda_lift.py +++ b/tests/python/relay/test_pass_lambda_lift.py @@ -83,4 +83,4 @@ def test_recursive(): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py index c7926f7a3d79..4088cfdef073 100644 --- a/tests/python/relay/test_pass_manager.py +++ b/tests/python/relay/test_pass_manager.py @@ -614,4 +614,4 @@ def test_print_debug_callback(): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_pass_merge_composite.py b/tests/python/relay/test_pass_merge_composite.py index 06cb1ecde78f..eefec17aa30a 100644 --- a/tests/python/relay/test_pass_merge_composite.py +++ b/tests/python/relay/test_pass_merge_composite.py @@ -980,4 +980,4 @@ def _check_type_false(extract): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_remove_unused_functions.py b/tests/python/relay/test_pass_remove_unused_functions.py index 0764a88b3159..67efc9b20262 100644 --- a/tests/python/relay/test_pass_remove_unused_functions.py +++ b/tests/python/relay/test_pass_remove_unused_functions.py @@ -121,4 +121,4 @@ def get_mod(): if __name__ == "__main__": - pytest.main() + tvm.testing.main() diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index fa9773b8e3d9..ad0b33551c77 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -745,4 +745,4 @@ def expected(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_to_basic_block_normal_form.py b/tests/python/relay/test_pass_to_basic_block_normal_form.py index d04afe15b5bb..2a97e985d91d 100644 --- a/tests/python/relay/test_pass_to_basic_block_normal_form.py +++ b/tests/python/relay/test_pass_to_basic_block_normal_form.py @@ -514,4 +514,4 @@ def test_immutability(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py index 255cecf76f2e..885f26025167 100644 --- a/tests/python/relay/test_pass_unmatched_cases.py +++ b/tests/python/relay/test_pass_unmatched_cases.py @@ -467,4 +467,4 @@ def @expand_on_empty_tuple_match(%a: (List[()], ())) -> int { if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 0f9d3633c5d7..3d71fdfc1d94 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -624,4 +624,4 @@ def test_pipeline(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_tensor_array.py b/tests/python/relay/test_tensor_array.py index 21043abb3c84..4973fa20c447 100644 --- a/tests/python/relay/test_tensor_array.py +++ b/tests/python/relay/test_tensor_array.py @@ -782,4 +782,4 @@ def run(dtype, shape): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_to_mixed_precision.py 
b/tests/python/relay/test_to_mixed_precision.py index 51d040c311f4..771d366df079 100644 --- a/tests/python/relay/test_to_mixed_precision.py +++ b/tests/python/relay/test_to_mixed_precision.py @@ -538,4 +538,4 @@ def test_convert_follow_node_with_integer_arguments(target_precision): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index b0b7ef048192..13d164c2caf6 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -583,6 +583,4 @@ def test_argreduce_infer_return_type(): if __name__ == "__main__": - import sys - - pytest.main(sys.argv) + tvm.testing.main() diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py index 1a49fc5a0184..f5a495bc71c7 100644 --- a/tests/python/relay/test_vm_serialization.py +++ b/tests/python/relay/test_vm_serialization.py @@ -316,4 +316,4 @@ def test_dynamic_bcast(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_aot_legalize_packed_call.py b/tests/python/unittest/test_aot_legalize_packed_call.py index 106e0f52adac..3f6e3f776cff 100644 --- a/tests/python/unittest/test_aot_legalize_packed_call.py +++ b/tests/python/unittest/test_aot_legalize_packed_call.py @@ -115,4 +115,4 @@ def test_aot_packed_call(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_arith_deduce_bound.py b/tests/python/unittest/test_arith_deduce_bound.py index ef478b4c2ffb..0915df3051db 100644 --- a/tests/python/unittest/test_arith_deduce_bound.py +++ b/tests/python/unittest/test_arith_deduce_bound.py @@ -234,4 +234,4 @@ def test_non_support(lhs): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index d6c2cfe8bbdd..975af097c030 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -1071,4 +1071,4 @@ def test_if_then_else_simplify(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_arith_solve_linear_equations.py b/tests/python/unittest/test_arith_solve_linear_equations.py index 87aea26e664c..24eb860c55f6 100644 --- a/tests/python/unittest/test_arith_solve_linear_equations.py +++ b/tests/python/unittest/test_arith_solve_linear_equations.py @@ -178,4 +178,4 @@ def test_ill_formed(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_arith_solve_linear_inequality.py b/tests/python/unittest/test_arith_solve_linear_inequality.py index 9fbe98fe5741..dd2fbdf72b94 100644 --- a/tests/python/unittest/test_arith_solve_linear_inequality.py +++ b/tests/python/unittest/test_arith_solve_linear_inequality.py @@ -197,4 +197,4 @@ def test_no_solution(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_ir_container.py b/tests/python/unittest/test_ir_container.py index 3652d5bdb280..1915849e1044 100644 --- a/tests/python/unittest/test_ir_container.py +++ b/tests/python/unittest/test_ir_container.py @@ -113,4 +113,4 @@ def test_ndarray_container(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git 
a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index 7ccaf72b1baf..e8ffed82062e 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -633,4 +633,4 @@ def test_multiple_relay_modules_aot_graph(): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index 96b947e20655..e60138a9c8d6 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -1045,4 +1045,4 @@ def build(A, C, N, C_N): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_analysis_stmt_finding.py b/tests/python/unittest/test_tir_analysis_stmt_finding.py index acb5faa0de12..72fb4898befd 100644 --- a/tests/python/unittest/test_tir_analysis_stmt_finding.py +++ b/tests/python/unittest/test_tir_analysis_stmt_finding.py @@ -51,4 +51,4 @@ def test_no_anchor_block(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_buffer.py b/tests/python/unittest/test_tir_buffer.py index d250fada6ae4..55c83167392f 100644 --- a/tests/python/unittest/test_tir_buffer.py +++ b/tests/python/unittest/test_tir_buffer.py @@ -277,4 +277,4 @@ def check_auto_bind(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index c4ab76cd2688..83cd64fa229b 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -525,4 +525,4 @@ def test_tir_allocate(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_te_extern_primfunc.py b/tests/python/unittest/test_tir_te_extern_primfunc.py index a622f77cc737..f6eb2e8a9b86 100644 --- a/tests/python/unittest/test_tir_te_extern_primfunc.py +++ b/tests/python/unittest/test_tir_te_extern_primfunc.py @@ -222,4 +222,4 @@ def tensors_from_extern_op(extern, func): if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py index e08f04fa1f25..1bf21d5c5924 100644 --- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py +++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py @@ -59,4 +59,4 @@ def verify_serial_loops(stmt): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_hoist_if.py b/tests/python/unittest/test_tir_transform_hoist_if.py index 0270500828b8..dd10e15853f1 100644 --- a/tests/python/unittest/test_tir_transform_hoist_if.py +++ b/tests/python/unittest/test_tir_transform_hoist_if.py @@ -809,4 +809,4 @@ def test_hoisting_op_conv(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py index 2e2b03ba721c..70c14b02f0eb 100644 --- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py +++ 
b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py @@ -277,4 +277,4 @@ def test_rolling_buffer_ir_transform(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_ir_utils.py b/tests/python/unittest/test_tir_transform_ir_utils.py index 0946b32cca3f..d2cae351610c 100644 --- a/tests/python/unittest/test_tir_transform_ir_utils.py +++ b/tests/python/unittest/test_tir_transform_ir_utils.py @@ -37,4 +37,4 @@ def test_convert_ssa(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_lower_warp_memory.py b/tests/python/unittest/test_tir_transform_lower_warp_memory.py index 13f3a5ff7ba2..48af3ebaf529 100644 --- a/tests/python/unittest/test_tir_transform_lower_warp_memory.py +++ b/tests/python/unittest/test_tir_transform_lower_warp_memory.py @@ -349,4 +349,4 @@ def test_lower_warp_memory_divide_by_factor(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_make_unpacked_api.py b/tests/python/unittest/test_tir_transform_make_unpacked_api.py index e5f41e7b520f..245ff53f9105 100644 --- a/tests/python/unittest/test_tir_transform_make_unpacked_api.py +++ b/tests/python/unittest/test_tir_transform_make_unpacked_api.py @@ -135,4 +135,4 @@ def test_body(): if __name__ == "__main__": - pytest.main([__file__]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py index 52880e40cbee..662f86479c09 100644 --- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py +++ b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py @@ -1690,4 +1690,4 @@ def test_multiple_calls_to_same_primfunc(): if __name__ == "__main__": - pytest.main([__file__] + sys.argv[1:]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index d1f86814e7d6..25e895573551 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -622,4 +622,4 @@ def test_tensor_intrin(): if __name__ == "__main__": - pytest.main([__file__] + sys.argv[1:]) + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py index 756b97b0d223..0fece9dcd263 100644 --- a/tests/python/unittest/test_tir_usmp_utils.py +++ b/tests/python/unittest/test_tir_usmp_utils.py @@ -198,4 +198,4 @@ def test_create_array_buffer_info(): if __name__ == "__main__": - pytest.main([__file__] + sys.argv[1:]) + tvm.testing.main() diff --git a/tests/python/unittest/test_type_annotation_checker.py b/tests/python/unittest/test_type_annotation_checker.py index 204c15331339..9af356b97198 100644 --- a/tests/python/unittest/test_type_annotation_checker.py +++ b/tests/python/unittest/test_type_annotation_checker.py @@ -22,6 +22,7 @@ import pytest import _pytest +import tvm from tvm.tir.schedule._type_checker import type_checked @@ -187,4 +188,4 @@ def func(_: type_annotation): if __name__ == "__main__": - sys.exit(pytest.main(sys.argv)) + tvm.testing.main() From b05e91807ea8f01fa2c6407f5ddceef66b0ff4b2 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 8 Jan 2023 10:57:33 
-0800 Subject: [PATCH 130/286] [TVMScript] Refactor IRDocsifier (#13593) This PR refactors the TVMScript printer and includes the following changes: - Consolidate the logic of VarTable into IRDocsifier - Decouple TracedObject into Object and ObjectPath for less syntactic noise - Restructure the folders for consistency Some tests are removed because the APIs they covered no longer exist after the consolidation. --- include/tvm/script/printer.h | 56 -- include/tvm/script/printer/doc.h | 70 +-- include/tvm/script/printer/doc_printer.h | 48 -- include/tvm/script/printer/frame.h | 140 ----- include/tvm/script/printer/ir_docsifier.h | 308 ++++++----- .../tvm/script/printer/ir_docsifier_functor.h | 163 ++++++ include/tvm/script/printer/printer.h | 86 ++++ include/tvm/script/printer/traced_object.h | 484 ------------------ .../script/printer/traced_object_functor.h | 175 ------- include/tvm/script/printer/var_table.h | 155 ------ include/tvm/support/with.h | 29 -- include/tvm/tir/op.h | 3 + include/tvm/tir/op_attr_types.h | 5 + python/tvm/script/__init__.py | 5 +- python/tvm/script/ir_builder/tir/ir.py | 121 +++-- python/tvm/script/printer/__init__.py | 7 +- python/tvm/script/printer/entry.py | 71 --- python/tvm/script/printer/frame.py | 81 --- python/tvm/script/printer/ir_docsifier.py | 245 --------- python/tvm/script/printer/printer.py | 64 +++ python/tvm/script/printer/var_table.py | 118 ----- src/script/printer/doc.cc | 10 +- .../{ => doc_printer}/base_doc_printer.cc | 0 .../{ => doc_printer}/base_doc_printer.h | 7 +- .../{ => doc_printer}/python_doc_printer.cc | 11 +- src/script/printer/frame.cc | 50 -- src/script/printer/ir/ir.cc | 74 +++ src/script/printer/ir/misc.cc | 77 +++ src/script/printer/ir/utils.h | 61 +++ src/script/printer/ir_docsifier.cc | 184 ++++--- src/script/{ => printer}/printer.cc | 34 +- src/script/printer/tir/block.cc | 150 ++++++ src/script/printer/tir/buffer.cc | 193 +++++++ src/script/printer/tir/expr.cc | 299 +++++++++++ src/script/printer/tir/for_loop.cc | 122 +++++ src/script/printer/tir/function.cc | 86 ++++ src/script/printer/tir/ir.cc | 97 ++++ src/script/printer/tir/stmt.cc | 374 ++++++++++++++ src/script/printer/tir/utils.h | 176 +++++++ src/script/printer/traced_object_functor.cc | 85 --- src/script/printer/utils.h | 93 ---- src/script/printer/var_table.cc | 109 ---- src/tir/ir/stmt.cc | 2 +- src/tir/op/builtin.cc | 16 +- src/tir/op/op.cc | 92 ++-- src/tir/op/runtime.cc | 41 -- tests/cpp/traced_object_test.cc | 268 ---------- .../cpp/tvmscript_printer_irdocsifier_test.cc | 117 ----- ...ript_printer_traced_object_functor_test.cc | 188 ------- tests/cpp/tvmscript_printer_var_table_test.cc | 158 ------ .../test_tvmscript_printer_entry_point.py | 30 -- .../unittest/test_tvmscript_printer_frame.py | 60 --- .../test_tvmscript_printer_irdocsifier.py | 123 ----- .../test_tvmscript_printer_var_table.py | 89 ---- 54 files changed, 2499 insertions(+), 3411 deletions(-) delete mode 100644 include/tvm/script/printer.h delete mode 100644 include/tvm/script/printer/doc_printer.h delete mode 100644 include/tvm/script/printer/frame.h create mode 100644 include/tvm/script/printer/ir_docsifier_functor.h create mode 100644 include/tvm/script/printer/printer.h delete mode 100644 include/tvm/script/printer/traced_object.h delete mode 100644 include/tvm/script/printer/traced_object_functor.h delete mode 100644 include/tvm/script/printer/var_table.h delete mode 100644 python/tvm/script/printer/entry.py delete mode 100644 python/tvm/script/printer/frame.py delete mode 100644
python/tvm/script/printer/ir_docsifier.py create mode 100644 python/tvm/script/printer/printer.py delete mode 100644 python/tvm/script/printer/var_table.py rename src/script/printer/{ => doc_printer}/base_doc_printer.cc (100%) rename src/script/printer/{ => doc_printer}/base_doc_printer.h (97%) rename src/script/printer/{ => doc_printer}/python_doc_printer.cc (98%) delete mode 100644 src/script/printer/frame.cc create mode 100644 src/script/printer/ir/ir.cc create mode 100644 src/script/printer/ir/misc.cc create mode 100644 src/script/printer/ir/utils.h rename src/script/{ => printer}/printer.cc (57%) create mode 100644 src/script/printer/tir/block.cc create mode 100644 src/script/printer/tir/buffer.cc create mode 100644 src/script/printer/tir/expr.cc create mode 100644 src/script/printer/tir/for_loop.cc create mode 100644 src/script/printer/tir/function.cc create mode 100644 src/script/printer/tir/ir.cc create mode 100644 src/script/printer/tir/stmt.cc create mode 100644 src/script/printer/tir/utils.h delete mode 100644 src/script/printer/traced_object_functor.cc delete mode 100644 src/script/printer/utils.h delete mode 100644 src/script/printer/var_table.cc delete mode 100644 src/tir/op/runtime.cc delete mode 100644 tests/cpp/traced_object_test.cc delete mode 100644 tests/cpp/tvmscript_printer_irdocsifier_test.cc delete mode 100644 tests/cpp/tvmscript_printer_traced_object_functor_test.cc delete mode 100644 tests/cpp/tvmscript_printer_var_table_test.cc delete mode 100644 tests/python/unittest/test_tvmscript_printer_entry_point.py delete mode 100644 tests/python/unittest/test_tvmscript_printer_frame.py delete mode 100644 tests/python/unittest/test_tvmscript_printer_irdocsifier.py delete mode 100644 tests/python/unittest/test_tvmscript_printer_var_table.py diff --git a/include/tvm/script/printer.h b/include/tvm/script/printer.h deleted file mode 100644 index b0fc54108c92..000000000000 --- a/include/tvm/script/printer.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_H_ -#define TVM_SCRIPT_PRINTER_H_ - -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -/*! - * \brief Print IR graph as TVMScript code - * - * \param root_node The root node to print. - * \param ir_name The dispatch token of the target IR, e.g., "tir", "relax". - * \param ir_prefix The symbol name for TVMScript IR namespaces. For example, {"tir": "T"}. - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - * - * \return the TVMScript code as string. 
- */ -String Script( // - const ObjectRef& root_node, // - String ir_name, // - Map ir_prefix, // - int indent_spaces = 4, // - bool print_line_numbers = false, // - int num_context_lines = -1, // - Optional path_to_underline = NullOpt // -); - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_H_ diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h index 1ee7fd6a7fd4..094d3fdf51df 100644 --- a/include/tvm/script/printer/doc.h +++ b/include/tvm/script/printer/doc.h @@ -22,12 +22,13 @@ #include #include #include -#include namespace tvm { namespace script { namespace printer { +class Doc; + /*! * \brief The base class of all Doc. * @@ -88,15 +89,6 @@ class ExprDocNode : public DocNode { */ ExprDoc Attr(String attr) const; - /*! - * \brief Create a doc representing attribute access on the current ExprDoc - * \param attr The attribute to access. - * - * The ObjectPath of attr will be pushed to the source_path of the returned - * doc. - */ - ExprDoc Attr(TracedObject attr) const; - /*! * \brief Create a doc representing index access on the current ExprDoc * \param indices The indices to access. @@ -259,83 +251,33 @@ class LiteralDoc : public ExprDoc { * \brief Create a LiteralDoc to represent None/null/empty value. */ static LiteralDoc None() { return LiteralDoc(ObjectRef(nullptr)); } - - /*! - * \brief Create a LiteralDoc to represent None/null/empty value. - * \param object_path The source path of the returned Doc. - */ - static LiteralDoc None(ObjectPath object_path) { - return LiteralDoc(ObjectRef(nullptr), object_path); - } - /*! * \brief Create a LiteralDoc to represent integer. * \param v The integer value. */ - static LiteralDoc Int(int v) { return LiteralDoc(IntImm(DataType::Int(64), v)); } - - /*! - * \brief Create a LiteralDoc to represent integer. - * \param v The integer value. - * - * The ObjectPath of v will be pushed to the source_path of the returned doc. - */ - static LiteralDoc Int(const TracedObject& v) { return LiteralDoc(v.Get(), v.GetPath()); } - - /*! - * \brief Create a LiteralDoc to represent integer. - * \param v The integer value. - * - * The ObjectPath of v will be pushed to the source_path of the returned doc. - */ - static LiteralDoc Int(const TracedBasicValue& v) { - return LiteralDoc(IntImm(DataType::Int(64), v.Get()), v.GetPath()); - } + static LiteralDoc Int(int64_t v) { return LiteralDoc(IntImm(DataType::Int(64), v)); } /*! * \brief Create a LiteralDoc to represent boolean. * \param v The boolean value. */ static LiteralDoc Boolean(bool v) { return LiteralDoc(IntImm(DataType::Bool(), v)); } - - /*! - * \brief Create a LiteralDoc to represent boolean. - * \param v The boolean value. - * - * The ObjectPath of v will be pushed to the source_path of the returned doc. - */ - static LiteralDoc Boolean(const TracedBasicValue& v) { - return LiteralDoc(IntImm(DataType::Bool(), v.Get()), v.GetPath()); - } - /*! * \brief Create a LiteralDoc to represent float. * \param v The float value. */ static LiteralDoc Float(double v) { return LiteralDoc(FloatImm(DataType::Float(64), v)); } - - /*! - * \brief Create a LiteralDoc to represent float. - * \param v The float value. - * - * The ObjectPath of v will be pushed to the source_path of the returned doc. - */ - static LiteralDoc Float(const TracedObject& v) { - return LiteralDoc(v.Get(), v.GetPath()); - } - /*! * \brief Create a LiteralDoc to represent string. * \param v The string value. 
*/ static LiteralDoc Str(const String& v) { return LiteralDoc(v); } - /*! * \brief Create a LiteralDoc to represent string. * \param v The string value. - * - * The ObjectPath of v will be pushed to the source_path of the returned doc. */ - static LiteralDoc Str(const TracedObject& v) { return LiteralDoc(v.Get(), v.GetPath()); } + static LiteralDoc DataType(const DLDataType& v) { + return LiteralDoc::Str(runtime::DLDataType2String(v)); + } TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(LiteralDoc, ExprDoc, LiteralDocNode); }; diff --git a/include/tvm/script/printer/doc_printer.h b/include/tvm/script/printer/doc_printer.h deleted file mode 100644 index 04a67a9b8209..000000000000 --- a/include/tvm/script/printer/doc_printer.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ -#define TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ - -#include - -namespace tvm { -namespace script { -namespace printer { - -/*! - * \brief Convert Doc into Python script. - * - * This function unpacks the DocPrinterOptions into function arguments - * to be FFI friendly. - * - * \param doc Doc to be converted - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - */ -String DocToPythonScript(Doc doc, int indent_spaces = 4, bool print_line_numbers = false, - int num_context_lines = -1, - Optional path_to_underline = NullOpt); - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_DOC_PRINTER_H_ diff --git a/include/tvm/script/printer/frame.h b/include/tvm/script/printer/frame.h deleted file mode 100644 index 407ad16007e9..000000000000 --- a/include/tvm/script/printer/frame.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
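[For reference, a minimal sketch, not part of the diff, of how the LiteralDoc helpers above compose. Only the signatures visible in this hunk are assumed; the function name is hypothetical.]

#include <tvm/script/printer/doc.h>

// Sketch: each static helper wraps a plain C++ value into a typed literal doc;
// the new DataType helper reuses Str by round-tripping the dtype through
// runtime::DLDataType2String.
void BuildLiteralDocs() {
  using namespace tvm;
  using namespace tvm::script::printer;
  LiteralDoc none = LiteralDoc::None();      // prints as None
  LiteralDoc i = LiteralDoc::Int(42);        // stored as a 64-bit IntImm
  LiteralDoc b = LiteralDoc::Boolean(true);  // stored as a boolean IntImm
  LiteralDoc f = LiteralDoc::Float(1.5);     // stored as a 64-bit FloatImm
  LiteralDoc s = LiteralDoc::Str("float32");
  // runtime::DataType converts implicitly to DLDataType.
  LiteralDoc dt = LiteralDoc::DataType(runtime::DataType::Float(32));  // "float32"
}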
- */ -#ifndef TVM_SCRIPT_PRINTER_FRAME_H_ -#define TVM_SCRIPT_PRINTER_FRAME_H_ - -#include -#include - -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -/*! - * Frame is the core data structure for semantic information - * when printing IR graph into TVMScript code. - */ -class FrameNode : public Object { - public: - void VisitAttrs(tvm::AttrVisitor* v) {} - - virtual ~FrameNode() = default; - - /*! - * \brief Add a callback function to be called when this frame exits. - * \param cb The callback function. It should have signature void(). - */ - template - void AddExitCallback(TCallback&& cb) { - callbacks_.emplace_back(std::forward(cb)); - } - - /*! - * \brief Method that's called when Frame enters the scope. - */ - virtual void EnterWithScope() {} - - /*! - * \brief Method that's called when Frame exits the scope. - */ - virtual void ExitWithScope() { - for (const std::function& callback : callbacks_) { - callback(); - } - callbacks_.clear(); - } - - static constexpr const char* _type_key = "script.printer.Frame"; - TVM_DECLARE_BASE_OBJECT_INFO(FrameNode, Object); - - private: - std::vector> callbacks_; -}; - -/*! - * \brief Reference type of FrameNode - */ -class Frame : public ObjectRef { - protected: - Frame() = default; - - public: - virtual ~Frame() = default; - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Frame, ObjectRef, FrameNode); -}; - -/*! - * \brief MetadataFrame contains information like contant parameter array. - */ -class MetadataFrameNode : public FrameNode { - public: - Array metadata; - - void VisitAttrs(tvm::AttrVisitor* v) { - FrameNode::VisitAttrs(v); - v->Visit("metadata", &metadata); - } - - static constexpr const char* _type_key = "script.printer.MetadataFrame"; - TVM_DECLARE_FINAL_OBJECT_INFO(MetadataFrameNode, FrameNode); -}; - -/*! - * \brief Reference type of MetadataFrameNode - */ -class MetadataFrame : public Frame { - public: - MetadataFrame(); - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(MetadataFrame, Frame, MetadataFrameNode); -}; - -/*! - * \brief VarDefFrame contains information about the free variables that needs to be defined - * at the beginning of the printed snippet. - */ -class VarDefFrameNode : public FrameNode { - public: - Array stmts; - - void VisitAttrs(tvm::AttrVisitor* v) { - FrameNode::VisitAttrs(v); - v->Visit("stmts", &stmts); - } - - static constexpr const char* _type_key = "script.printer.VarDefFrame"; - TVM_DECLARE_FINAL_OBJECT_INFO(VarDefFrameNode, FrameNode); -}; - -/*! - * \brief Reference type of VarDefFrameNode - */ -class VarDefFrame : public Frame { - public: - VarDefFrame(); - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(VarDefFrame, Frame, VarDefFrameNode); -}; - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_FRAME_H_ diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h index 8945bd6e7a94..e97ddc0234b6 100644 --- a/include/tvm/script/printer/ir_docsifier.h +++ b/include/tvm/script/printer/ir_docsifier.h @@ -19,45 +19,117 @@ #ifndef TVM_SCRIPT_PRINTER_IR_DOCSIFIER_H_ #define TVM_SCRIPT_PRINTER_IR_DOCSIFIER_H_ +#include #include -#include #include -#include -#include -#include -#include -#include +#include + +#include +#include +#include +#include namespace tvm { namespace script { namespace printer { -using WithCtx = With; +//////////////////////// Frame //////////////////////// + +class IRDocsifier; +class IRDocsifierNode; + +/*! 
+ * Frame is the core data structure for semantic information + * when printing IR graph into TVMScript code. + */ +class FrameNode : public Object { + public: + /*! The docs generated in the frame */ + Array stmts; + /*! The corresponding IRDocsifier */ + IRDocsifierNode* d; + /*! The callbacks that are going to be invoked when the frame exits */ + std::vector> callbacks; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("stmts", &stmts); + // `d` is not visited + // `callbacks` is not visited + } + + static constexpr const char* _type_key = "script.printer.Frame"; + TVM_DECLARE_BASE_OBJECT_INFO(FrameNode, Object); + + public: + virtual ~FrameNode() = default; + + /*! + * \brief Add a callback function to be called when this frame exits. + * \param cb The callback function. It should have signature void(). + */ + template + void AddExitCallback(TCallback&& cb) { + callbacks.emplace_back(std::forward(cb)); + } + /*! + * \brief Add a dispatch token to the docsifier, and a callback that pops the token when this + * frame exits. + * \param d The docsifier. + * \param token The token to be added. + */ + void AddDispatchToken(const IRDocsifier& d, const String& token); + /*! + * \brief Method that's called when Frame enters the scope. + */ + virtual void EnterWithScope(); + /*! + * \brief Method that's called when Frame exits the scope. + */ + virtual void ExitWithScope(); +}; + +/*! + * \brief Reference type of FrameNode + */ +class Frame : public ObjectRef { + protected: + Frame() = default; + + public: + virtual ~Frame() = default; + + /*! \brief Method that's called when Frame enters the scope. */ + void EnterWithScope() { get()->EnterWithScope(); } + + /*! \brief Method that's called when Frame exits the scope. */ + void ExitWithScope() { get()->ExitWithScope(); } + + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Frame, ObjectRef, FrameNode); +}; + +//////////////////////// IRDocsifier //////////////////////// /*! * \brief IRDocsifier is the top-level interface in the IR->Doc process. * * It provides methods to convert IR node object to Doc, operate on Frame * objects and change dispatch tokens. - * - * Example usage: - * \code - * TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - * .set_dispatch([](TracedObject obj, IRDocsifier p) { return IdDoc("x"); }); - * - * TracedObject var = ...; - * IRDocsifier p; - * p->AsDoc(var); // returns an IdDoc("x") - * \endcode - * */ class IRDocsifierNode : public Object { public: + /*! \brief A function that creates the doc for a variable */ + using DocCreator = std::function; + /*! \brief Information about a variable, including its optional name and its doc creator */ + struct VariableInfo { + /*! \brief The creator */ + DocCreator creator; + /*! \brief The name of the variable */ + Optional name; + }; /*! - * \brief The var table to use during the printing process. - * \sa VarTableNode + * \brief This map connects IR dispatch token to the name of identifier. */ - VarTable vars; + Map ir_prefix; /*! * \brief The stack of frames. * \sa FrameNode @@ -70,16 +142,23 @@ class IRDocsifierNode : public Object { * when converting IR node object to Doc. */ Array dispatch_tokens; - /*! - * \brief This map connects IR dipatch token to the name of identifier. - */ - Map ir_prefix; + /*! \brief The IRModule the docsifier is handling */ + Optional mod; + /*! \brief Mapping from a var to its info */ + std::unordered_map obj2info; + /*! \brief The variable names used already */ + std::unordered_set defined_names; + /*!
\brief Common prefixes of variable usages */ + std::unordered_map> common_prefix; void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("vars", &vars); + v->Visit("ir_prefix", &ir_prefix); v->Visit("frames", &frames); v->Visit("dispatch_tokens", &dispatch_tokens); - v->Visit("ir_prefix", &ir_prefix); + v->Visit("mod", &mod); + // `obj2info` is not visited + // `defined_names` is not visited + // `common_prefix` is not visited } static constexpr const char* _type_key = "script.printer.IRDocsifier"; @@ -87,79 +166,68 @@ class IRDocsifierNode : public Object { public: /*! - * \brief Transform the input object into TDoc. - * \param obj The object to be transformed. + * \brief Define variable by name. + * \param obj The variable object. + * \param frame The frame that this variable is defined in. + * \param name_hint The hint for variable name. * - * \return The Doc object. + * \return The id doc for this variable. + * + * This function will rename the variable to avoid name conflict with other variables + * in the table. */ - template - TDoc AsDoc(const TracedObject& obj) const { - auto result = Downcast(AsDocImpl(obj)); - result->source_paths.push_back(obj.GetPath()); - return result; - } + IdDoc Define(const ObjectRef& obj, const Frame& frame, const String& name_hint); /*! - * \brief Helper method to transform object into ExprDoc. - * \param obj The object to be transformed. + * \brief Define variable by doc factory. + * \param obj The variable object. + * \param frame The frame that this variable is defined in. + * \param doc_factory The function to return an ExprDoc object for this variable. * - * \return The ExprDoc object. + * This function is a special form of `Define`. Variable is mapped to ExprDoc rather + * than IdDoc. It's useful when a variable is implicitly defined without a name, like + * the buf->data in TIR, which should be mapped to `AttrDoc(IdDoc(""), "data")`. + * + * This function takes a DocFactory instead of Doc. It's because GetVarDoc needs to + * return a new Doc object every time it's called, as the returned doc will have + * different `source_path`. Currently there isn't a good way to deep copy a TVMObject + * so VarTable needs to call a factory function to get a freshly-constructed Doc object + * every time GetVarDoc is called. */ - ExprDoc AsExprDoc(const TracedObject& obj) { return AsDoc(obj); } + void Define(const ObjectRef& obj, const Frame& frame, DocCreator doc_factory); /*! - * \brief Push a new dispatch token into the stack - * \details The top dispatch token decides which dispatch table to use - * when printing Object. This method returns a RAII guard which - * pops the token when going out of the scope. - * - * \param token The dispatch token to push. + * \brief Get the doc for variable. + * \param obj The variable object. * - * \return A RAII guard to pop dispatch token when going out of scope. + * \return The doc for variable, if it exists in the table. Otherwise it returns NullOpt. */ - WithCtx WithDispatchToken(const String& token) { - this->dispatch_tokens.push_back(token); - return WithCtx(nullptr, [this]() { this->dispatch_tokens.pop_back(); }); - } + Optional GetVarDoc(const ObjectRef& obj) const; /*! - * \brief Push a new frame the stack - * \details Frame contains the contextual information that's needed during printing, - * for example, variables in the scope. This method returns a RAII guard which - * pops the frame and call the cleanup method of frame when going out of the scope. - * - * \param frame The frame to push. 
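[For reference, a minimal sketch, not part of the diff, of the two Define overloads documented above: binding a TIR buffer by name and its implicit ->data field through the doc-creator overload. The namespace demo, function DefineBuffer, and the name hint "A" are hypothetical; only the signatures shown in this hunk are assumed.]

#include <tvm/script/printer/ir_docsifier.h>
#include <tvm/tir/buffer.h>

namespace demo {
using namespace tvm;
using namespace tvm::script::printer;

void DefineBuffer(const IRDocsifier& d, const Frame& frame, const tir::Buffer& buffer) {
  // Named definition: returns an IdDoc, renamed if the hint "A" collides
  // with a name already in `defined_names`.
  IdDoc buf = d->Define(buffer, frame, "A");
  // Factory definition: the creator builds a fresh ExprDoc on every lookup,
  // so each use site prints as `A.data` and carries its own source paths.
  d->Define(buffer->data, frame, [buf]() -> ExprDoc { return buf->Attr("data"); });
}
}  // namespace demo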
+ * \brief Check if a variable exists in the table. + * \param obj The variable object. * - * \return A RAII guard to pop frame and call the exit method of frame - * when going out of scope + * \return a boolean for whether variable exists. */ - WithCtx WithFrame(const Frame& frame) { - frame->EnterWithScope(); - this->frames.push_back(frame); - return WithCtx(nullptr, [this, pushed_frame = frame]() { - Frame last_frame = this->frames.back(); - ICHECK_EQ(last_frame, pushed_frame); - this->frames.pop_back(); - last_frame->ExitWithScope(); - }); - } - + bool IsVarDefined(const ObjectRef& obj) const; + /*! \brief Remove the variable defined */ + void RemoveVar(const ObjectRef& obj); /*! - * \brief Get the top frame with type FrameType - * \tparam FrameType The type of frame to get. + * \brief Set the common prefix information of variable usage. + * \param root The root of the AST. + * \param is_var A function that returns true if the given object is considered a variable. */ - template - Optional GetFrame() const { - for (auto it = frames.rbegin(); it != frames.rend(); ++it) { - if (const auto* f = (*it).as()) { - return GetRef(f); - } - } - return NullOpt; - } - - private: - Doc AsDocImpl(const TracedObject& obj) const; + void SetCommonPrefix(const ObjectRef& root, runtime::TypedPackedFunc is_var); + /*! + * \brief Transform the input object into TDoc. + * \param obj The object to be transformed. + * \param path The path to this object. + * + * \return The Doc object. + */ + template + inline TDoc AsDoc(const ObjectRef& obj, const ObjectPath& path) const; }; /*! @@ -167,61 +235,49 @@ class IRDocsifierNode : public Object { */ class IRDocsifier : public ObjectRef { public: + using FType = IRDocsifierFunctor; /*! * \brief Create a IRDocsifier. * \param ir_prefix The ir_prefix to use for this IRDocsifier. */ explicit IRDocsifier(Map ir_prefix); - - using FType = TracedObjectFunctor; - /*! - * \brief The registration table for IRDocsifier. - */ + /*! \brief The registration table for IRDocsifier. */ TVM_DLL static FType& vtable(); TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRDocsifier, ObjectRef, IRDocsifierNode); }; -/*! - * \brief A wrapper object to provide injection point for printer of each IR. - * - * For any IR node to be transformed by IRDocsifier, it will be wrapped by RootNodeContainer - * and be dispatched to the corresponding function first. This provides an injection point for - * each IR's printer implemention to add specialized logic, for example, pushing a special - * Frame to the IRDocsifier before doing any IR->Doc transformation. - * - * \code - * TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - * .set_dispatch("relax", [](TracedObject obj, IRDocsifier p) { - * const ObjectRef& root_node = obj.Get()->root_node; - * // For example, relax printer can create a Frame specialized to Relax here - * RelaxGeneralFrame frame; - * auto ctx = p->WithFrame(frame); - * // More specialized logic for your IR. - * return p->AsDoc(MakeTraced(root_node)); - * }); - * \endcode - */ -class RootNodeContainerNode : public Object { - public: - /*! \brief The root node to print. 
 */
-  ObjectRef root_node;
+//////////////////////// Implementation ////////////////////////

-  void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("root_node", &root_node); }
+inline void FrameNode::EnterWithScope() {
+  if (d != nullptr) {
+    d->frames.push_back(GetRef<Frame>(this));
+  }
+}

-  static constexpr const char* _type_key = "script.printer.RootNodeContainer";
-  TVM_DECLARE_FINAL_OBJECT_INFO(RootNodeContainerNode, Object);
-};
+inline void FrameNode::ExitWithScope() {
+  for (const std::function<void()>& callback : callbacks) {
+    callback();
+  }
+  callbacks.clear();
+  if (d != nullptr) {
+    d->frames.pop_back();
+  }
+}

-class RootNodeContainer : public ObjectRef {
- public:
-  /*!
-   * \brief Constructor of RootNodeContainer.
-   * \param root_node The root node to print.
-   * */
-  explicit RootNodeContainer(ObjectRef root_node);
-  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(RootNodeContainer, ObjectRef, RootNodeContainerNode);
-};
+template <class TDoc>
+inline TDoc IRDocsifierNode::AsDoc(const ObjectRef& obj, const ObjectPath& path) const {
+  if (!obj.defined()) {
+    return Downcast<TDoc>(LiteralDoc::None());
+  }
+  return Downcast<TDoc>(
+      IRDocsifier::vtable()(dispatch_tokens.back(), obj, path, GetRef<IRDocsifier>(this)));
+}
+
+inline void FrameNode::AddDispatchToken(const IRDocsifier& d, const String& token) {
+  d->dispatch_tokens.push_back(token);
+  this->AddExitCallback([doc = d.get()]() { doc->dispatch_tokens.pop_back(); });
+}

 }  // namespace printer
 }  // namespace script
diff --git a/include/tvm/script/printer/ir_docsifier_functor.h b/include/tvm/script/printer/ir_docsifier_functor.h
new file mode 100644
index 000000000000..d04d8c4d028a
--- /dev/null
+++ b/include/tvm/script/printer/ir_docsifier_functor.h
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_PRINTER_IR_DOCSIFIER_FUNCTOR_H_
+#define TVM_SCRIPT_PRINTER_IR_DOCSIFIER_FUNCTOR_H_
+
+#include <tvm/node/node.h>
+#include <tvm/runtime/logging.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+/*!
+ * \brief Dynamic dispatch functor based on ObjectPath.
+ *
+ * This functor dispatches based on the type of object and the input dispatch token.
+ */
+template <class R, class... Args>
+class IRDocsifierFunctor {
+ private:
+  using TSelf = IRDocsifierFunctor<R, Args...>;
+
+  template <class TObjectRef, class TCallable>
+  using IsDispatchFunction =
+      typename std::is_convertible<TCallable, std::function<R(TObjectRef, Args...)>>;
+
+ public:
+  /*!
+   * \brief Call the dispatch function.
+   * \param token The dispatch token.
+   * \param obj The object.
+   * \param args Other args.
+   *
+   * \return The return value of the dispatch function
+   *
+   * If the TObjectRef isn't registered with the token, it will try to find
+   * the dispatch function for TObjectRef with the default dispatch token (empty string).
+   */
+  template <class TObjectRef>
+  R operator()(const String& token, TObjectRef obj, Args... args) const {
+    uint32_t type_index = obj.defined() ? obj->type_index() : 0;
+    const runtime::PackedFunc* pf = nullptr;
+    if ((pf = LookupDispatchTable(token, type_index)) != nullptr) {
+      return (*pf)(obj, args...);
+    }
+    if ((pf = LookupDispatchTable("", type_index)) != nullptr) {
+      return (*pf)(obj, args...);
+    }
+    ICHECK(false) << "ObjectFunctor calls un-registered function on type: "
+                  << runtime::Object::TypeIndex2Key(type_index) << " (token: " << token << ")"
+                  << ". ObjectType: " << obj->GetTypeKey() << ". Object: " << obj;
+  }
+
+  /*!
+   * \brief Set the dispatch function
+   * \param token The dispatch token.
+   * \param type_index The TVM object type index for this dispatch function.
+   * \param f The dispatch function.
+   *
+   * This takes a type-erased packed function as input. It should be used
+   * through the FFI boundary, for example, when registering dispatch functions from Python.
+   */
+  TSelf& set_dispatch(String token, uint32_t type_index, runtime::PackedFunc f) {
+    std::vector<runtime::PackedFunc>* table = &dispatch_table_[token];
+    if (table->size() <= type_index) {
+      table->resize(type_index + 1, nullptr);
+    }
+    runtime::PackedFunc& slot = (*table)[type_index];
+    if (slot != nullptr) {
+      ICHECK(false) << "Dispatch for type is already registered: "
+                    << runtime::Object::TypeIndex2Key(type_index);
+    }
+    slot = f;
+    return *this;
+  }
+
+  /*!
+   * \brief Set the dispatch function
+   * \param token The dispatch token.
+   * \param f The dispatch function.
+   */
+  template <typename TObjectRef, typename TCallable,
+            typename = std::enable_if_t<IsDispatchFunction<TObjectRef, TCallable>::value>>
+  TSelf& set_dispatch(String token, TCallable f) {
+    return set_dispatch(token, TObjectRef::ContainerType::RuntimeTypeIndex(),
+                        runtime::TypedPackedFunc<R(TObjectRef, Args...)>(f));
+  }
+
+  /*!
+   * \brief Remove dispatch function
+   * \param token The dispatch token.
+   * \param type_index The TVM object type index for the dispatch function to be removed.
+   *
+   * This is useful when the dispatch function comes from another language's runtime;
+   * those functions should be removed before that language runtime shuts down.
+   */
+  void remove_dispatch(String token, uint32_t type_index) {
+    std::vector<runtime::PackedFunc>* table = &dispatch_table_[token];
+    if (table->size() <= type_index) {
+      return;
+    }
+    (*table)[type_index] = nullptr;
+  }
+
+ private:
+  /*!
+   * \brief Look up the dispatch table for the given token and type_index.
+   * \param token The dispatch token.
+   * \param type_index The TVM object type index.
+   * \return The functor if the lookup succeeds, nullptr otherwise.
+   */
+  const runtime::PackedFunc* LookupDispatchTable(const String& token, uint32_t type_index) const {
+    auto it = dispatch_table_.find(token);
+    if (it == dispatch_table_.end()) {
+      return nullptr;
+    }
+    const std::vector<runtime::PackedFunc>& tab = it->second;
+    if (type_index >= tab.size()) {
+      return nullptr;
+    }
+    const PackedFunc* f = &tab[type_index];
+    if (f->defined()) {
+      return f;
+    } else {
+      return nullptr;
+    }
+  }
+
+  /*
+   * This type alias is created to reduce the binary bloat
+   * from templates and to hide implementation details from this header
+   */
+  using DispatchTable = std::unordered_map<std::string, std::vector<runtime::PackedFunc>>;
+  /*! \brief The dispatch table.
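 *
 * A minimal registration/dispatch sketch (illustrative only; assumes the
 * instantiation IRDocsifierFunctor<Doc, ObjectPath, IRDocsifier> used by the
 * printer, and that tir::Var is a registered object type):
 *
 * \code
 *   IRDocsifier::FType& vtable = IRDocsifier::vtable();
 *   vtable.set_dispatch("tir", [](tir::Var v, ObjectPath p, IRDocsifier d) -> Doc {
 *     return IdDoc("v");
 *   });
 *   Doc doc = vtable("tir", var, path, d);  // falls back to token "" if unregistered
 * \endcode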
   */
+  DispatchTable dispatch_table_;
+};
+
+}  // namespace printer
+}  // namespace script
+}  // namespace tvm
+#endif  // TVM_SCRIPT_PRINTER_IR_DOCSIFIER_FUNCTOR_H_
diff --git a/include/tvm/script/printer/printer.h b/include/tvm/script/printer/printer.h
new file mode 100644
index 000000000000..31abd7d9ec89
--- /dev/null
+++ b/include/tvm/script/printer/printer.h
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_PRINTER_PRINTER_H_
+#define TVM_SCRIPT_PRINTER_PRINTER_H_
+
+#include <tvm/node/node.h>
+#include <tvm/script/printer/doc.h>
+
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+/*! \brief Default values in the TVMScript printer */
+struct Default {
+  /*! \brief Default data type of TIR buffers */
+  DataType buffer_dtype = DataType::Float(32);
+  /*! \brief Default data type of integer literals */
+  DataType int_dtype = DataType::Int(32);
+  /*!
+   * \brief Default data type of float literals. Right now we always print out the explicit type
+   * of floating point values, so setting it to Void means float literals are always printed
+   * with the T.float32/T.float64 wrapper.
+   */
+  DataType float_dtype = DataType::Void();
+  /*! \brief Returns a singleton of the configuration */
+  static Default* Instance();
+  static DataType& BufferDType() { return Instance()->buffer_dtype; }
+  static DataType& IntDType() { return Instance()->int_dtype; }
+  static DataType& FloatDType() { return Instance()->float_dtype; }
+};
+
+/*!
+ * \brief The entry method for TVMScript printing
+ * \param obj The object to be printed
+ * \param ir_prefix The prefix of IR nodes
+ * \param indent_spaces Number of spaces used for indentation
+ * \param print_line_numbers Whether to print line numbers
+ * \param num_context_lines Number of context lines to print around the underlined text
+ * \param path_to_underline Object path to be underlined
+ * \return The TVMScript text format
+ */
+String Script(ObjectRef obj,                                                //
+              Map<String, String> ir_prefix = {{"ir", "I"}, {"tir", "T"}},  //
+              int indent_spaces = 4,                                        //
+              bool print_line_numbers = false,                              //
+              int num_context_lines = -1,                                   //
+              Optional<ObjectPath> path_to_underline = NullOpt);

+/*!
+ * \brief Convert Doc into Python script.
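 *
 * A sketch pairing the two entry points declared in this header (illustrative
 * only; `mod` is assumed to be an IRModule and `doc` a Doc produced for it):
 *
 * \code
 *   String a = Script(mod);             // IR -> Doc -> Python text in one call
 *   String b = DocToPythonScript(doc);  // Doc -> Python text only
 * \endcode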
+ * \param doc Doc to be converted + * \param indent_spaces Number of spaces used for indentation + * \param print_line_numbers Whether to print line numbers + * \param num_context_lines Number of context lines to print around the underlined text + * \param path_to_underline Object path to be underlined + */ +String DocToPythonScript(Doc doc, // + int indent_spaces = 4, // + bool print_line_numbers = false, // + int num_context_lines = -1, // + Optional path_to_underline = NullOpt); + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_PRINTER_H_ diff --git a/include/tvm/script/printer/traced_object.h b/include/tvm/script/printer/traced_object.h deleted file mode 100644 index cb63c31cd4a5..000000000000 --- a/include/tvm/script/printer/traced_object.h +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/script/printer/traced_object.h - * Wrappers around TVM objects that also store an ObjectPath from some "root" object - * to the wrapper object. - */ - -#ifndef TVM_SCRIPT_PRINTER_TRACED_OBJECT_H_ -#define TVM_SCRIPT_PRINTER_TRACED_OBJECT_H_ - -#include -#include -#include - -#include -#include - -namespace tvm { - -template -class TracedObject; -template -class TracedMap; -template -class TracedArray; -template -class TracedOptional; -template -class TracedBasicValue; - -namespace detail { - -template ::value> -struct TracedObjectWrapperSelector; - -template -struct TracedObjectWrapperSelector { - using Type = TracedBasicValue; -}; - -template -struct TracedObjectWrapperSelector { - using Type = TracedObject; -}; - -template -struct TracedObjectWrapperSelector, true> { - using Type = TracedMap; -}; - -template -struct TracedObjectWrapperSelector, true> { - using Type = TracedArray; -}; - -template -struct TracedObjectWrapperSelector, true> { - using Type = TracedOptional; -}; - -} // namespace detail - -/*! - * \brief Traced wrapper for regular (non-container) TVM objects. - */ -template -class TracedObject { - using ObjectType = typename RefT::ContainerType; - - public: - using ObjectRefType = RefT; - - // Don't use this direcly. For convenience, call MakeTraced() instead. - explicit TracedObject(const RefT& object_ref, ObjectPath path) - : ref_(object_ref), path_(std::move(path)) {} - - // Implicit conversion from a derived reference class - template - TracedObject(const TracedObject& derived) - : ref_(derived.Get()), path_(derived.GetPath()) {} - - /*! - * \brief Get a traced wrapper for an attribute of the wrapped object. 
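 *
 * A usage sketch for GetAttr (illustrative only; `func` is assumed to be a
 * tir::PrimFunc):
 *
 * \code
 *   TracedObject<tir::PrimFunc> f = MakeTraced(func);
 *   auto body = f.GetAttr(&tir::PrimFuncNode::body);  // path becomes root -> attr "body"
 * \endcode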
- */ - template - typename detail::TracedObjectWrapperSelector::Type GetAttr(T BaseType::*member_ptr) const { - using WrapperType = typename detail::TracedObjectWrapperSelector::Type; - const ObjectType* node = static_cast(ref_.get()); - const T& attr = node->*member_ptr; - Optional attr_key = ICHECK_NOTNULL(GetAttrKeyByAddress(node, &attr)); - return WrapperType(attr, path_->Attr(attr_key)); - } - - /*! - * \brief Access the wrapped object. - */ - const RefT& Get() const { return ref_; } - - /*! - * \brief Check if the reference to the wrapped object can be converted to `RefU`. - */ - template - bool IsInstance() const { - return ref_->template IsInstance(); - } - - /*! - * \brief Same as Get().defined(). - */ - bool defined() const { return ref_.defined(); } - - /*! - * \brief Convert the wrapped reference type to a subtype. - * - * Throws an exception if IsInstance() is false. - */ - template - TracedObject Downcast() const { - return TracedObject(tvm::runtime::Downcast(ref_), path_); - } - - /*! - * \brief Convert the wrapped reference type to a subtype. - * - * Returns an empty optional if IsInstance() is false. - */ - template - TracedOptional TryDowncast() const { - if (ref_->template IsInstance()) { - return Downcast(); - } else { - return TracedOptional(NullOpt, path_); - } - } - - /*! - * \brief Get the path of the wrapped object. - */ - const ObjectPath& GetPath() const { return path_; } - - private: - RefT ref_; - ObjectPath path_; -}; - -/*! - * \brief Iterator class for TracedMap - */ -template -class TracedMapIterator { - public: - using WrappedV = typename detail::TracedObjectWrapperSelector::Type; - using MapIter = typename Map::iterator; - - using iterator_category = std::bidirectional_iterator_tag; - using difference_type = ptrdiff_t; - using value_type = const std::pair; - using pointer = value_type*; - using reference = value_type; - - explicit TracedMapIterator(MapIter iter, ObjectPath map_path) - : iter_(iter), map_path_(std::move(map_path)) {} - - bool operator==(const TracedMapIterator& other) const { return iter_ == other.iter_; } - - bool operator!=(const TracedMapIterator& other) const { return iter_ != other.iter_; } - - pointer operator->() const = delete; - - reference operator*() const { - auto kv = *iter_; - return std::make_pair(kv.first, WrappedV(kv.second, map_path_->MapValue(kv.first))); - } - - TracedMapIterator& operator++() { - ++iter_; - return *this; - } - - TracedMapIterator operator++(int) { - TracedMapIterator copy = *this; - ++(*this); - return copy; - } - - private: - MapIter iter_; - ObjectPath map_path_; -}; - -/*! - * \brief Traced wrapper for Map objects. - */ -template -class TracedMap { - public: - using WrappedV = typename detail::TracedObjectWrapperSelector::Type; - - using iterator = TracedMapIterator; - - // Don't use this direcly. For convenience, call MakeTraced() instead. - explicit TracedMap(Map map, ObjectPath path) - : map_(std::move(map)), path_(std::move(path)) {} - - /*! - * \brief Get a value by its key, wrapped in a traced wrapper. - */ - WrappedV at(const K& key) const { - auto it = map_.find(key); - ICHECK(it != map_.end()) << "No such key in Map"; - auto kv = *it; - return WrappedV(kv.second, path_->MapValue(kv.first)); - } - - /*! - * \brief Access the wrapped map object. - */ - const Map& Get() const { return map_; } - - /*! - * \brief Get the path of the wrapped object. - */ - const ObjectPath& GetPath() const { return path_; } - - /*! - * \brief Get an iterator to the first item of the map. 
- */ - iterator begin() const { return iterator(map_.begin(), path_); } - - /*! - * \brief Get an iterator to the end of the map. - */ - iterator end() const { return iterator(map_.end(), path_); } - - /*! - * \brief Returns true iff the wrapped map is empty. - */ - bool empty() const { return map_.empty(); } - - private: - Map map_; - ObjectPath path_; -}; - -/*! - * \brief Iterator class for TracedArray - */ -template -class TracedArrayIterator { - public: - using WrappedT = typename detail::TracedObjectWrapperSelector::Type; - - using difference_type = ptrdiff_t; - using value_type = WrappedT; - using pointer = WrappedT*; - using reference = WrappedT&; - using iterator_category = std::random_access_iterator_tag; - - explicit TracedArrayIterator(Array array, size_t index, ObjectPath array_path) - : array_(array), index_(index), array_path_(array_path) {} - - TracedArrayIterator& operator++() { - ++index_; - return *this; - } - TracedArrayIterator& operator--() { - --index_; - return *this; - } - TracedArrayIterator operator++(int) { - TracedArrayIterator copy = *this; - ++index_; - return copy; - } - TracedArrayIterator operator--(int) { - TracedArrayIterator copy = *this; - --index_; - return copy; - } - - TracedArrayIterator operator+(difference_type offset) const { - return TracedArrayIterator(array_, index_ + offset, array_path_); - } - - TracedArrayIterator operator-(difference_type offset) const { - return TracedArrayIterator(array_, index_ - offset, array_path_); - } - - difference_type operator-(const TracedArrayIterator& rhs) const { return index_ - rhs.index_; } - - bool operator==(TracedArrayIterator other) const { - return array_.get() == other.array_.get() && index_ == other.index_; - } - bool operator!=(TracedArrayIterator other) const { return !(*this == other); } - value_type operator*() const { return WrappedT(array_[index_], array_path_->ArrayIndex(index_)); } - - private: - Array array_; - size_t index_; - ObjectPath array_path_; -}; - -/*! - * \brief Traced wrapper for Array objects. - */ -template -class TracedArray { - public: - using WrappedT = typename detail::TracedObjectWrapperSelector::Type; - - using iterator = TracedArrayIterator; - - // Don't use this direcly. For convenience, call MakeTraced() instead. - explicit TracedArray(Array array, ObjectPath path) - : array_(std::move(array)), path_(std::move(path)) {} - - /*! - * \brief Access the wrapped array object. - */ - const Array& Get() const { return array_; } - - /*! - * \brief Get the path of the wrapped array object. - */ - const ObjectPath& GetPath() const { return path_; } - - /*! - * \brief Get an element by index, wrapped in a traced wrapper. - */ - WrappedT operator[](size_t index) const { - return WrappedT(array_[index], path_->ArrayIndex(index)); - } - - /*! - * \brief Get an iterator to the first array element. - * - * The iterator's dereference operator will automatically wrap each element in a traced wrapper. - */ - iterator begin() const { return iterator(array_, 0, path_); } - - /*! - * \brief Get an iterator to the end of the array. - * - * The iterator's dereference operator will automatically wrap each element in a traced wrapper. - */ - iterator end() const { return iterator(array_, array_.size(), path_); } - - /*! - * \brief Returns true iff the wrapped array is empty. - */ - bool empty() const { return array_.empty(); } - - /*! - * \brief Get the size of the wrapped array. - */ - size_t size() const { return array_.size(); } - - private: - Array array_; - ObjectPath path_; -}; - -/*! 
- * \brief Traced wrapper for Optional objects. - */ -template -class TracedOptional { - public: - using WrappedT = typename detail::TracedObjectWrapperSelector::Type; - - /*! - * \brief Implicit conversion from the corresponding non-optional traced wrapper. - */ - TracedOptional(const WrappedT& value) // NOLINT(runtime/explicit) - : optional_(value.Get().defined() ? value.Get() : Optional(NullOpt)), - path_(value.GetPath()) {} - - // Don't use this direcly. For convenience, call MakeTraced() instead. - explicit TracedOptional(Optional optional, ObjectPath path) - : optional_(std::move(optional)), path_(std::move(path)) {} - - /*! - * \brief Access the wrapped optional object. - */ - const Optional& Get() const { return optional_; } - - /*! - * \brief Get the path of the wrapped optional object. - */ - const ObjectPath& GetPath() const { return path_; } - - /*! - * \brief Returns true iff the object is present. - */ - bool defined() const { return optional_.defined(); } - - /*! - * \brief Returns a non-optional traced wrapper, throws if defined() is false. - */ - WrappedT value() const { return WrappedT(optional_.value(), path_); } - - /*! - * \brief Same as defined(). - */ - explicit operator bool() const { return optional_.defined(); } - - private: - Optional optional_; - ObjectPath path_; -}; - -/*! - * \brief Traced wrapper for basic values (i.e. non-TVM objects) - */ -template -class TracedBasicValue { - public: - explicit TracedBasicValue(const T& value, ObjectPath path) - : value_(value), path_(std::move(path)) {} - - /*! - * \brief Access the wrapped value. - */ - const T& Get() const { return value_; } - - /*! - * \brief Get the path of the wrapped value. - */ - const ObjectPath& GetPath() const { return path_; } - - /*! - * \brief Transform the wrapped value without changing its path. - */ - template - typename detail::TracedObjectWrapperSelector::type>::Type - ApplyFunc(F&& f) const { - return MakeTraced(f(value_), path_); - } - - private: - T value_; - ObjectPath path_; -}; - -/*! - * \brief Wrap the given root object in an appropriate traced wrapper class. - */ -template -typename detail::TracedObjectWrapperSelector::Type MakeTraced(const RefT& object) { - using WrappedT = typename detail::TracedObjectWrapperSelector::Type; - return WrappedT(object, ObjectPath::Root()); -} - -/*! - * \brief Wrap the given object with the given path in an appropriate traced wrapper class. - */ -template -typename detail::TracedObjectWrapperSelector::Type MakeTraced(const RefT& object, - ObjectPath path) { - using WrappedT = typename detail::TracedObjectWrapperSelector::Type; - return WrappedT(object, std::move(path)); -} - -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_TRACED_OBJECT_H_ diff --git a/include/tvm/script/printer/traced_object_functor.h b/include/tvm/script/printer/traced_object_functor.h deleted file mode 100644 index 8f72d139a5a5..000000000000 --- a/include/tvm/script/printer/traced_object_functor.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_TRACED_OBJECT_FUNCTOR_H_ -#define TVM_SCRIPT_PRINTER_TRACED_OBJECT_FUNCTOR_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -/* - * This type alias and the following free functions are created to reduce the binary bloat - * from template and also hide implementation details from this header - */ -using DispatchTable = std::unordered_map>; - -/*! - * \brief Get function from dispatch table. - * \param dispatch_table The dispatch table. - * \param token The dispatch token. - * \param type_index The type index of the Object type to be dispatched. - * - * \return The dispatch function. - */ -const runtime::PackedFunc& GetDispatchFunction(const DispatchTable& dispatch_table, - const String& token, uint32_t type_index); - -/*! - * \brief Set function in dispatch table. - * \param dispatch_table The dispatch table. - * \param token The dispatch token. - * \param type_index The type index of the Object type to be dispatched. - * \param f The dispatch function. - */ -void SetDispatchFunction(DispatchTable* dispatch_table, const String& token, uint32_t type_index, - runtime::PackedFunc f); - -/*! - * \brief Remove function from dispatch table. - * \param dispatch_table The dispatch table. - * \param token The dispatch token. - * \param type_index The TVM object type index for the dispatch function to be removed. - */ -void RemoveDispatchFunction(DispatchTable* dispatch_table, const String& token, - uint32_t type_index); - -constexpr const char* kDefaultDispatchToken = ""; - -/*! - * \brief Dynamic dispatch functor based on TracedObject. - * - * This functor dispatches based on the type of object ref inside the input TracedObject, - * and the input dispatch token. - */ -template -class TracedObjectFunctor { - private: - using TSelf = TracedObjectFunctor; - - template - using IsDispatchFunction = - typename std::is_convertible, Args...)>>; - - public: - /*! - * \brief Call the dispatch function. - * \param token The dispatch token. - * \param traced_object The traced object. - * \param args Other args. - * - * \return The return value of the dispatch function - * - * If the TObjectRef isn't registered with the token, it will try to find - * dispatch function for TObjectRef with kDefaultDispatchToken. - */ - template - R operator()(const String& token, TracedObject traced_object, Args... args) const { - const runtime::PackedFunc& dispatch_function = - GetDispatchFunction(dispatch_table_, token, traced_object.Get()->type_index()); - return dispatch_function(traced_object.Get(), traced_object.GetPath(), args...); - } - - /*! - * \brief Set the dispatch function - * \param token The dispatch token. - * \param type_index The TVM object type index for this dispatch function. - * \param f The dispatch function. - * - * This takes a type-erased packed function as input. It should be used - * through FFI boundary, for example, registering dispatch function from Python. 
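 *
 * A registration sketch for this type-erased overload (illustrative only;
 * `functor` and `my_packed_func` are assumed names):
 *
 * \code
 *   functor.set_dispatch("tir", tir::VarNode::RuntimeTypeIndex(), my_packed_func);
 * \endcode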
- */ - TSelf& set_dispatch(String token, uint32_t type_index, runtime::PackedFunc f) { - SetDispatchFunction(&dispatch_table_, token, type_index, std::move(f)); - return *this; - } - - /*! - * \brief Set the dispatch function - * \param token The dispatch token. - * \param f The dispatch function. - * - * The diaptch function should have signature `R(TracedObject, Args...)`. - */ - template ::value>> - TSelf& set_dispatch(String token, TCallable f) { - return set_dispatch( - token, // - TObjectRef::ContainerType::RuntimeTypeIndex(), // - runtime::TypedPackedFunc( - [f = std::move(f)](TObjectRef object, ObjectPath path, Args... args) -> R { - return f(MakeTraced(object, path), args...); - })); - } - /*! - * \brief Set the default dispatch function - * \param f The dispatch function. - * - * Default dispatch function will be used if there is no function registered - * with the requested dispatch token. - * - * Default dispatch function has an empty string as dispatch token. - */ - template ::value>> - TSelf& set_dispatch(TCallable&& f) { - return set_dispatch(kDefaultDispatchToken, std::forward(f)); - } - - /*! - * \brief Remove dispatch function - * \param token The dispatch token. - * \param type_index The TVM object type index for the dispatch function to be removed. - * - * This is useful when dispatch function comes from other language's runtime, and - * those function should be removed before that language runtime shuts down. - */ - void remove_dispatch(String token, uint32_t type_index) { - RemoveDispatchFunction(&dispatch_table_, token, type_index); - } - - private: - DispatchTable dispatch_table_; -}; - -} // namespace printer -} // namespace script -} // namespace tvm -#endif // TVM_SCRIPT_PRINTER_TRACED_OBJECT_FUNCTOR_H_ diff --git a/include/tvm/script/printer/var_table.h b/include/tvm/script/printer/var_table.h deleted file mode 100644 index 2cd9335213a3..000000000000 --- a/include/tvm/script/printer/var_table.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_VAR_TABLE_H_ -#define TVM_SCRIPT_PRINTER_VAR_TABLE_H_ - -#include -#include -#include -#include -#include - -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -/*! - * \brief Variable Table manages mapping from variable object to ExprDoc during - * the process of printing TVMScript. - * - * The value type of this map is ExprDoc rather than IdDoc or String. It's - * because variables can be implicitly defined. For example in TIR buffer (tir::Buffer), - * `buf->data` is a variable, while its representation in TVMScript should be an - * expression `x.data`, where `x` is the variable for the buffer itself. 
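 *
 * A sketch of the implicit-definition case described above (illustrative only;
 * `x_doc` is the IdDoc already defined for the buffer and `frame` the
 * enclosing Frame):
 *
 * \code
 *   vars->DefineByDoc(buf->data, [x_doc]() { return x_doc->Attr("data"); }, frame);
 *   // GetVarDoc(buf->data, path) now yields the expression doc `x.data`
 * \endcode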
- */ -class VarTableNode : public Object { - public: - void VisitAttrs(AttrVisitor*) {} - - /*! - * \brief Define variable by name. - * \param obj The variable object. - * \param name_hint The hint for variable name. - * \param object_path The object_path for the returned ExprDoc. - * \param frame The frame that this variable is defined in. - * - * \return The id doc for this variable. - * - * This function will rename the variable to avoid name conflict with other variables - * in the table. - */ - IdDoc Define(const ObjectRef& obj, const String& name_hint, const ObjectPath& object_path, - const Frame& frame); - - /*! - * \brief Define variable by name. - * \param obj The variable object. - * \param name_hint The hint for variable name. - * \param frame The frame that this variable is defined in. - * - * \return The id doc for this variable. - * - * This is a shortcut version of `Define` which accepts a traced string. - */ - IdDoc Define(const ObjectRef& obj, const TracedObject& name_hint, const Frame& frame) { - return Define(obj, name_hint.Get(), name_hint.GetPath(), frame); - } - - using DocFactory = std::function; - - /*! - * \brief Define variable by doc factory. - * \param obj The variable object. - * \param doc_factory The function to return an ExprDoc object for this variable. - * \param frame The frame that this variable is defined in. - * - * This function is a special form of `Define`. Variable is mapped to ExprDoc rather - * than IdDoc. It's useful when a variable is implicitly defined without a name, like - * the buf->data in TIR, which should be mapped to `AttrDoc(IdDoc(""), "data")`. - * - * This function takes a DocFactory instead of Doc. It's because GetVarDoc needs to - * return a new Doc object every time it's called, as the returned doc will have - * different `soruce_path`. Currently there isn't a good way to deep copy a TVMObject - * so VarTable needs to call a factory function to get a freshly-constructed Doc object - * every time GetVarDoc is called. - */ - void DefineByDoc(const ObjectRef& obj, DocFactory doc_factory, const Frame& frame); - - /*! - * \brief Get the doc for variable. - * \param obj The variable object. - * \param object_path The object path for the variable. - * - * \return The doc for variable, if it exists in the table. Otherwise it returns NullOpt. - */ - Optional GetVarDoc(const ObjectRef& obj, const ObjectPath& object_path) const; - - /*! - * \brief Get the doc for variable. - * \param obj The traced variable object. - * - * \return The doc for variable, if it exists in the table. Otherwise it returns NullOpt. - */ - template - Optional GetVarDoc(const TracedObject obj) const { - return GetVarDoc(obj.Get(), obj.GetPath()); - } - - /*! - * \brief Check if a variable exists in the table. - * \param obj The variable object. - * - * \return a boolean for whether variable exists. - */ - bool IsVarDefined(const ObjectRef& obj) const; - - static constexpr const char* _type_key = "script.printer.VarTable"; - TVM_DECLARE_FINAL_OBJECT_INFO(VarTableNode, Object); - - private: - void RemoveVar(const ObjectRef& obj); - - struct VariableInfo { - DocFactory doc_factory; - Optional name; - }; - std::unordered_map obj2info; - std::unordered_set defined_names; -}; - -/*! - * \brief Reference type of VarTableNode. - */ -class VarTable : public ObjectRef { - public: - /*! - * \brief Create an empty VarTable. 
- */ - VarTable(); - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(VarTable, ObjectRef, VarTableNode); -}; - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_VAR_TABLE_H_ diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h index 5959affafdb3..8333adc9e613 100644 --- a/include/tvm/support/with.h +++ b/include/tvm/support/with.h @@ -92,34 +92,5 @@ class With { ContextType ctx_; }; -/*! - * \brief A context type that delegates EnterWithScope and ExitWithScope - * to user-provided functions. - */ -class ContextManager { - public: - /*! - * \brief Constructor of ContextManager. - * \param f_enter The function to call when entering scope. If it's nullptr, do nothing when - * entering. - * \param f_exit The function to call when exiting scope. If it's nullptr, do nothing - * when exiting. - */ - template - explicit ContextManager(FEnter f_enter, FExit f_exit) : f_enter_(f_enter), f_exit_(f_exit) {} - - private: - void EnterWithScope() { - if (f_enter_) f_enter_(); - } - void ExitWithScope() { - if (f_exit_) f_exit_(); - } - std::function f_enter_; - std::function f_exit_; - template - friend class With; -}; - } // namespace tvm #endif // TVM_SUPPORT_WITH_H_ diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index 9b48b0ccebd1..21bc7e7a5056 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -40,6 +40,9 @@ namespace tvm { +#define TVM_TIR_REGISTER_OP(OpName) \ + TVM_REGISTER_OP("tir." OpName).set_attr("TScriptPrinterName", OpName) + // Most common operators can be overloaded by argument type(PrimExpr). // So we put them under the root namespace. // diff --git a/include/tvm/tir/op_attr_types.h b/include/tvm/tir/op_attr_types.h index 2dc174f7d2a1..858d89c2d551 100644 --- a/include/tvm/tir/op_attr_types.h +++ b/include/tvm/tir/op_attr_types.h @@ -56,6 +56,11 @@ using FLowerIntrinsic = runtime::TypedPackedFunc; */ using FLegalize = runtime::TypedPackedFunc; +/*! + * \brief The operator's name in TVMScript printer + */ +using TScriptPrinterName = String; + /*! * \brief The effect type of the call. */ diff --git a/python/tvm/script/__init__.py b/python/tvm/script/__init__.py index 21bdfa6f1691..82bb698f2773 100644 --- a/python/tvm/script/__init__.py +++ b/python/tvm/script/__init__.py @@ -15,4 +15,7 @@ # specific language governing permissions and limitations # under the License. """TVM Script APIs of TVM Python Package""" -from .parser import ir, ir_module, parse as from_source, tir +from .parser import ir, ir_module +from .parser import parse as from_source +from .parser import tir +from .printer import script diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 48b283447969..06a85fa34082 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -27,6 +27,7 @@ # isort: on import numpy as np # type: ignore + from tvm.ir import Range, Type from tvm.runtime import convert, ndarray from tvm.target import Target @@ -508,7 +509,9 @@ class axis: # pylint: disable=invalid-name @staticmethod def spatial( - dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], + binding: PrimExpr, + dtype: str = "int32", ) -> Var: """The spatial block axis defining function. 
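# A usage sketch for the axis-defining functions touched by this hunk
# (illustrative only; assumes a surrounding T.block and loop variables i, k):
#
#     vi = T.axis.spatial(128, i)
#     vk = T.axis.reduce(32, k, dtype="int64")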
@@ -534,7 +537,9 @@ def spatial( @staticmethod def reduce( - dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], + binding: PrimExpr, + dtype: str = "int32", ) -> Var: """The reduced block axis defining function. @@ -560,7 +565,9 @@ def reduce( @staticmethod def scan( - dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], + binding: PrimExpr, + dtype: str = "int32", ) -> Var: """The scanning block axis defining function. @@ -586,7 +593,9 @@ def scan( @staticmethod def opaque( - dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], + binding: PrimExpr, + dtype: str = "int32", ) -> Var: """The opaque block axis defining function. @@ -1534,34 +1543,41 @@ def target(target_config: Union[Dict, str]) -> Target: return Target(target_config) -def _op_wrapper(func): - @functools.wraps(func) - def wrapped(*args, **kwargs): - if "dtype" in kwargs: - kwargs.pop("dtype") - return func(*args, **kwargs) +class meta_var: # pylint: disable=invalid-name + """A meta variable used in TVMScript metaprogramming. It means that the value of the variable + does not appear in the final TIR, but only stays in the parser. - return wrapped + Parameters + ---------- + value: Any + The meta variable. + """ + def __init__(self, value: Any) -> None: + self.value = value -def _dtype_forward(func): + def __iter__(self): + def f(): + for i in self.value: + yield meta_var(i) + + return f() + + +# pylint: disable=invalid-name + + +def _op_wrapper(func): @functools.wraps(func) def wrapped(*args, **kwargs): if "dtype" in kwargs: - args = (kwargs.pop("dtype"),) + args + kwargs.pop("dtype") return func(*args, **kwargs) return wrapped -# pylint: disable=invalid-name - -broadcast = Broadcast -ramp = Ramp - -buffer_var = ptr abs = _op_wrapper(_tir_op.abs) # pylint: disable=redefined-builtin -fabs = abs acos = _op_wrapper(_tir_op.acos) acosh = _op_wrapper(_tir_op.acosh) address_of = _op_wrapper(_tir_op.address_of) @@ -1607,7 +1623,6 @@ def wrapped(*args, **kwargs): q_multiply_shift = _op_wrapper(_tir_op.q_multiply_shift) q_multiply_shift_per_axis = _op_wrapper(_tir_op.q_multiply_shift_per_axis) ret = _op_wrapper(_tir_op.ret) -reinterpret = _dtype_forward(_tir_op.reinterpret) round = _op_wrapper(_tir_op.round) # pylint: disable=redefined-builtin rsqrt = _op_wrapper(_tir_op.rsqrt) shift_left = _op_wrapper(_tir_op.shift_left) @@ -1631,11 +1646,6 @@ def wrapped(*args, **kwargs): call_cpacked = _op_wrapper(_tir_op.call_cpacked) call_packed_lowered = _op_wrapper(_tir_op.call_packed_lowered) call_cpacked_lowered = _op_wrapper(_tir_op.call_cpacked_lowered) -call_extern = _dtype_forward(_tir_op.call_extern) -call_intrin = _dtype_forward(_tir_op.call_intrin) -call_llvm_intrin = _dtype_forward(_tir_op.call_llvm_intrin) -call_llvm_pure_intrin = _dtype_forward(_tir_op.call_llvm_pure_intrin) -call_pure_extern = _dtype_forward(_tir_op.call_pure_extern) tvm_tuple = _op_wrapper(_tir_op.tvm_tuple) tvm_struct_set = _op_wrapper(_tir_op.tvm_struct_set) tvm_struct_get = _tir_op.tvm_struct_get @@ -1645,48 +1655,51 @@ def wrapped(*args, **kwargs): tvm_bmma_sync = _op_wrapper(_tir_op.tvm_bmma_sync) tvm_fill_fragment = _op_wrapper(_tir_op.tvm_fill_fragment) tvm_store_matrix_sync = _op_wrapper(_tir_op.tvm_store_matrix_sync) -ptx_mma = _dtype_forward(_tir_op.ptx_mma) -ptx_mma_sp = 
_dtype_forward(_tir_op.ptx_mma_sp) -ptx_ldmatrix = _dtype_forward(_tir_op.ptx_ldmatrix) -ptx_cp_async = _dtype_forward(_tir_op.ptx_cp_async) ptx_wait_group = _op_wrapper(_tir_op.ptx_wait_group) ptx_commit_group = _op_wrapper(_tir_op.ptx_commit_group) -mma_store = _dtype_forward(_tir_op.mma_store) -mma_fill = _dtype_forward(_tir_op.mma_fill) -vectorlow = _dtype_forward(_tir_op.vectorlow) -vectorhigh = _dtype_forward(_tir_op.vectorhigh) -vectorcombine = _dtype_forward(_tir_op.vectorcombine) assume = _op_wrapper(_tir_op.assume) undef = _op_wrapper(_tir_op.undef) -tvm_call_packed = call_packed -tvm_call_cpacked = call_cpacked -tvm_call_packed_lowered = call_packed_lowered -tvm_call_cpacked_lowered = call_cpacked_lowered TVMBackendAllocWorkspace = _op_wrapper(_tir_op.TVMBackendAllocWorkspace) TVMBackendFreeWorkspace = _op_wrapper(_tir_op.TVMBackendFreeWorkspace) start_profile_intrinsic = _op_wrapper(_tir_op.start_profile_intrinsic) end_profile_intrinsic = _op_wrapper(_tir_op.end_profile_intrinsic) -class meta_var: - """A meta variable used in TVMScript metaprogramming. It means that the value of the variable - does not appear in the final TIR, but only stays in the parser. +def _dtype_forward(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + if "dtype" in kwargs: + args = (kwargs.pop("dtype"),) + args + return func(*args, **kwargs) - Parameters - ---------- - value: Any - The meta variable. - """ + return wrapped - def __init__(self, value: Any) -> None: - self.value = value - def __iter__(self): - def f(): - for i in self.value: - yield meta_var(i) +reinterpret = _dtype_forward(_tir_op.reinterpret) +call_extern = _dtype_forward(_tir_op.call_extern) +call_intrin = _dtype_forward(_tir_op.call_intrin) +call_llvm_intrin = _dtype_forward(_tir_op.call_llvm_intrin) +call_llvm_pure_intrin = _dtype_forward(_tir_op.call_llvm_pure_intrin) +call_pure_extern = _dtype_forward(_tir_op.call_pure_extern) +ptx_mma = _dtype_forward(_tir_op.ptx_mma) +ptx_mma_sp = _dtype_forward(_tir_op.ptx_mma_sp) +ptx_ldmatrix = _dtype_forward(_tir_op.ptx_ldmatrix) +ptx_cp_async = _dtype_forward(_tir_op.ptx_cp_async) +mma_store = _dtype_forward(_tir_op.mma_store) +mma_fill = _dtype_forward(_tir_op.mma_fill) +vectorlow = _dtype_forward(_tir_op.vectorlow) +vectorhigh = _dtype_forward(_tir_op.vectorhigh) +vectorcombine = _dtype_forward(_tir_op.vectorcombine) - return f() + +broadcast = Broadcast +ramp = Ramp +buffer_var = ptr +fabs = abs +tvm_call_packed = call_packed +tvm_call_cpacked = call_cpacked +tvm_call_packed_lowered = call_packed_lowered +tvm_call_cpacked_lowered = call_cpacked_lowered # pylint: enable=invalid-name diff --git a/python/tvm/script/printer/__init__.py b/python/tvm/script/printer/__init__.py index d49614db0f21..25ea619a410c 100644 --- a/python/tvm/script/printer/__init__.py +++ b/python/tvm/script/printer/__init__.py @@ -16,12 +16,7 @@ # under the License. """ TVMScript Unified Printer - This package provides a set of APIs to print supported TVM IR into TVMScript in a roundtrippable way. - -https://github.com/apache/tvm-rfcs/blob/main/rfcs/0074-tvmscript-unified-printer.md """ - -from . import _ffi_api -from .entry import script +from .printer import script diff --git a/python/tvm/script/printer/entry.py b/python/tvm/script/printer/entry.py deleted file mode 100644 index c015702af09b..000000000000 --- a/python/tvm/script/printer/entry.py +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This file contains the entry point of TVMScript Unified Printer. -""" - -from typing import Dict, Optional - -from tvm.runtime import Object, ObjectPath - -from . import _ffi_api - - -def script( # pylint: disable=too-many-arguments - root_node: Object, - ir_name: str, - ir_prefix: Dict[str, str], - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: int = -1, - path_to_underline: Optional[ObjectPath] = None, -) -> str: - """ - Print IR graph as TVMScript code - - Parameters - ---------- - root_node : Object - The root node to print. - ir_name : str - The dispatch token of the target IR, e.g., "tir", "relax". - ir_prefix : Dict[str, str] - The symbol name for TVMScript IR namespaces. For example, - {"tir": "T"}. - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - - Returns - ------- - script : str - The TVMScript code of the root_node - """ - return _ffi_api.Script( # type: ignore # pylint: disable=no-member - root_node, - ir_name, - ir_prefix, - indent_spaces, - print_line_numbers, - num_context_lines, - path_to_underline, - ) diff --git a/python/tvm/script/printer/frame.py b/python/tvm/script/printer/frame.py deleted file mode 100644 index c967382b8b5d..000000000000 --- a/python/tvm/script/printer/frame.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Frame is the core data structure for semantic information when printing -IR graph into TVMScript code. -""" - -from typing import Callable, Sequence - -from tvm._ffi import register_object -from tvm.runtime import Object -from tvm.script.printer.doc import StmtDoc - -from . import _ffi_api - - -class Frame(Object): - """ - Frame is the core data structure for semantic information - when printing IR graph into TVMScript code. 
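    (A usage sketch, illustrative only: a concrete subclass such as
     VarDefFrame is used as a context manager, and exit callbacks run when
     the scope closes:

         with VarDefFrame() as frame:
             frame.add_exit_callback(lambda: print("left scope"))
    )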
- - Frame base class manages a list of callbacks to be executed - when frame goes out of scope. - """ - - def add_exit_callback(self, callback: Callable[[], None]) -> None: - """ - Adds a callback function to be executed when frame goes out of scope. - - Parameters - ---------- - callback : Callable[[], None] - The callback function. - """ - _ffi_api.FrameAddExitCallback(self, callback) # type: ignore # pylint: disable=no-member - - def __enter__(self): - _ffi_api.FrameEnterWithScope(self) # type: ignore # pylint: disable=no-member - return self - - def __exit__(self, *exception_info): - _ffi_api.FrameExitWithScope(self) # type: ignore # pylint: disable=no-member - - -@register_object("script.printer.MetadataFrame") -class MetadataFrame(Frame): - """ - MetadataFrame contains information like contant parameter array. - """ - - metadata: Sequence[Object] - - def __init__(self): - self.__init_handle_by_constructor__(_ffi_api.MetadataFrame) # type: ignore # pylint: disable=no-member - - -@register_object("script.printer.VarDefFrame") -class VarDefFrame(Frame): - """ - VarDefFrame contains information about the free variables that needs to - be defined at the beginning of the printed snippet. - """ - - stmts: Sequence[StmtDoc] - - def __init__(self): - self.__init_handle_by_constructor__(_ffi_api.VarDefFrame) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/script/printer/ir_docsifier.py b/python/tvm/script/printer/ir_docsifier.py deleted file mode 100644 index c5ba8a498b1e..000000000000 --- a/python/tvm/script/printer/ir_docsifier.py +++ /dev/null @@ -1,245 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -IRDocsifier is the top-level interface in the process of transforming -IR graph into Doc tree, during printing IR graph as TVMScript code. -""" - -import atexit -from contextlib import ExitStack, contextmanager -from typing import Callable, Dict, Generator, Mapping, Optional, Sequence, Set, Tuple, Type, TypeVar - -from tvm._ffi import get_object_type_index, register_object -from tvm.runtime import Object, ObjectPath - -from . import _ffi_api -from .doc import Doc -from .frame import Frame -from .var_table import VarTable - -_REGISTERED_TYPES: Set[Tuple[str, int]] = set() # {(dispatch_token, type_index)} - - -def _cleanup_dispatch_function(): - for dispatch_token, type_index in _REGISTERED_TYPES: - _ffi_api.IRDocsifierRemoveDispatch(dispatch_token, type_index) # type: ignore # pylint: disable=no-member - - -_CLEANUP_REGISTERED = False - - -def _ensure_cleanup_function_registered(): - """ - Add a cleanup function to be called on interpreter termination, - to remove all dispatch functions registered on the Python side. - - Without cleaning up those dispatch functions, program will segfault - on termination. 
It's because dispatch functions are referenced from the - static memory of libtvm, thus they will be cleaned up at the very end, - making calls to Py_DecRef after Python interpreter terminates. - """ - global _CLEANUP_REGISTERED # pylint: disable=global-statement - - if not _CLEANUP_REGISTERED: - atexit.register(_cleanup_dispatch_function) - _CLEANUP_REGISTERED = True - - -@register_object("script.printer.RootNodeContainer") -class RootNodeContainer(Object): - """ - A wrapper object to provide injection point for printer of each IR. - - This class shouldn't be used directly. `IRDocsifier.set_root_dispatch` - should be used instead. - """ - - root_node: Object - - def __init__(self, root_node: Object): - self.__init_handle_by_constructor__(_ffi_api.RootNodeContainer, root_node) # type: ignore # pylint: disable=no-member - - -@register_object("script.printer.IRDocsifier") -class IRDocsifier(Object): - """ - IRDocsifier is the top-level interface in the IR->Doc process. - - It provides methods to convert IR node object to Doc, operate on Frame - objects and change dispatch tokens. - """ - - ir_prefix: Mapping[str, str] - vars: VarTable - frames: Sequence[Frame] - dispatch_tokens: Sequence[str] - - def __init__(self, ir_prefix: Dict[str, str]): - """ - Create a new IRDocsifier. - - Parameters - ---------- - ir_prefix : Dict[str, str] - The ir prefix to use. Key is the IR dispatch token and - value is the name of identifier for this IR's namespace in TVMScript. - """ - self.__init_handle_by_constructor__(_ffi_api.IRDocsifier, ir_prefix) # type: ignore # pylint: disable=no-member - - _TObject = TypeVar("_TObject", bound=Object) - - @classmethod - def set_dispatch( - cls, - node_type: Type[_TObject], - dispatch_function: Callable[[_TObject, ObjectPath, "IRDocsifier"], Doc], - dispatch_token: str = "", - ) -> None: - """ - Set the dispatch function to transform a particular IR node type to Doc - - Parameters - ---------- - node_type : Type[_TObject] - The type of object to dispatch on. - dispatch_function : Callable[[_TObject, ObjectPath, "IRDocsifier"], Doc] - The dispatch function. It's called to transform IR node object to Doc. - dispatch_token : str - Function will only be called when this dispatch_token is the same as the one - on the top of IRDocsifier's dispatch_tokens stack. An empty dispatch token - means registering as default dispatch function, which will be called when - there is no dispatch function registered with the current dispatch token. - """ - type_index = get_object_type_index(node_type) - if type_index is None: - raise TypeError(f"{type(node_type)} is not a registered TVM object type.") - - _ensure_cleanup_function_registered() - _ffi_api.IRDocsifierSetDispatch( # type: ignore # pylint: disable=no-member - dispatch_token, type_index, dispatch_function - ) - _REGISTERED_TYPES.add((dispatch_token, type_index)) - - @classmethod - def set_root_dispatch( - cls, dispatch_token: str, root_dispatch_function: Callable[[Object, "IRDocsifier"], Doc] - ) -> None: - """ - Set the root dispatch function for an IR. - - The root dispatch function will be called with the root node of an IR graph - that's being transformed to Doc. This provides an injection point for - each IR's printer implemention to add specialized logic, for example, - pushing a special Frame to the IRDocsifier before doing actual IR->Doc - transformation. 
- - The simplest root dispatch function is - ``` - def f(obj, ir_docsifier) - return ir_docsifier.as_doc(obj, ObjectPath.root()) - ``` - - Parameters - ---------- - root_dispatch_function : Callable[[_TObject, "IRDocsifier"], Doc] - The root dispatch function. It's called with the root node to be printed. - dispatch_token : str - The dispatch token of the IR that root_dispatch_funnction applies to. - """ - - def dispatch_function(obj: RootNodeContainer, _, ir_docsifier): - return root_dispatch_function(obj.root_node, ir_docsifier) - - cls.set_dispatch(RootNodeContainer, dispatch_function, dispatch_token) - - def as_doc(self, obj: Object, object_path: ObjectPath) -> Doc: - """ - Transform the input object into Doc. - - Parameters - ---------- - obj : Object - The IR node object. - object_path : ObjectPath - The object path of this object. It's used for locating diagnostic message. - - Returns - ------- - doc : Doc - The doc for this object. - """ - return _ffi_api.IRDocsifierAsDoc(self, obj, object_path) # type: ignore # pylint: disable=no-member - - def get_frame(self, frame_type: Type[Frame]) -> Optional[Frame]: - """ - Get the top frame with type `frame_type`. - - Parameters - ---------- - frame_type : Type[Frame] - The target frame type. - - Returns - ------- - frame : Optional[Frame] - The frame if found, otherwise None. - """ - for i in range(len(self.frames) - 1, -1, -1): - if isinstance(self.frames[i], frame_type): - return self.frames[i] - return None - - @contextmanager - def dispatch_token(self, token: str): - """ - Push a new dispatch token to the stack. - - Parameters - ---------- - token : str - The token to push. - - Returns - ------- - A context manager that pops this dispatch token when exits. - """ - with ExitStack() as stack: - _ffi_api.IRDocsifierPushDispatchToken(self, token) # type: ignore # pylint: disable=no-member - stack.callback(_ffi_api.IRDocsifierPopDispatchToken, self) # type: ignore # pylint: disable=no-member - yield - - _TFrame = TypeVar("_TFrame", bound=Frame) - - @contextmanager - def frame(self, frame: _TFrame) -> Generator[_TFrame, None, None]: - """ - Push a new frame to the stack. - - Parameters - ---------- - frame : Frame - The frame to push. - - Returns - ------- - A context manager that pops this frame when exits. - """ - with ExitStack() as stack: - stack.enter_context(frame) - _ffi_api.IRDocsifierPushFrame(self, frame) # type: ignore # pylint: disable=no-member - stack.callback(_ffi_api.IRDocsifierPopFrame, self) # type: ignore # pylint: disable=no-member - yield frame diff --git a/python/tvm/script/printer/printer.py b/python/tvm/script/printer/printer.py new file mode 100644 index 000000000000..120ef03f57d7 --- /dev/null +++ b/python/tvm/script/printer/printer.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The printer interface"""
+
+from typing import Mapping, Optional
+
+from tvm.runtime.object_path import ObjectPath
+
+from . import _ffi_api
+
+
+def script(
+    obj,
+    ir_prefix: Optional[Mapping[str, str]] = None,
+    indent_space: int = 4,
+    print_line_number: bool = False,
+    num_context_lines: int = -1,
+    path_to_underline: Optional[ObjectPath] = None,
+):
+    """Print a TVM IR as TVMScript text format.
+
+    Parameters
+    ----------
+    obj : object
+        A TVM object representing TVM IR
+    ir_prefix : Optional[Mapping[str, str]]
+        A mapping from IR type to the prefix of the script.
+        Defaults to {"ir": "I", "tir": "T"}
+    indent_space : int = 4
+        The number of spaces to indent
+    print_line_number : bool = False
+        Whether to print line numbers
+    num_context_lines : int = -1
+        The number of context lines to print. -1 means all lines.
+    path_to_underline : Optional[ObjectPath]
+        The path to underline in the script.
+
+    Returns
+    -------
+    script : str
+        The TVMScript text format
+    """
+    if ir_prefix is None:
+        ir_prefix = {
+            "ir": "I",
+            "tir": "T",
+        }
+    return _ffi_api.Script(  # type: ignore # pylint: disable=no-member
+        obj, ir_prefix, indent_space, print_line_number, num_context_lines, path_to_underline
+    )
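An aside for readers, not part of the patch itself: a minimal usage sketch of
the new `script` entry point added above. It assumes `func` is any TVM IR
object already in scope (e.g. a `tvm.tir.PrimFunc`) and imports `script`
directly from the module this hunk creates; the keyword values shown simply
spell out the defaults.

    from tvm.script.printer.printer import script

    # `func` is a hypothetical stand-in for any TVM IR object,
    # e.g. a tvm.tir.PrimFunc.
    text = script(func, indent_space=4, print_line_number=False)
    print(text)

diff --git a/python/tvm/script/printer/var_table.py b/python/tvm/script/printer/var_table.py
deleted file mode 100644
index ea1fa41b3210..000000000000
--- a/python/tvm/script/printer/var_table.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Functions to print doc into text format"""
-
-from typing import Callable, Optional
-
-from tvm._ffi import register_object
-from tvm.runtime import Object, ObjectPath
-
-from . import _ffi_api
-from .doc import ExprDoc, IdDoc
-from .frame import Frame
-
-
-@register_object("script.printer.VarTable")
-class VarTable(Object):
-    """
-    Variable Table manages mapping from variable object to ExprDoc during
-    the process of printing TVMScript.
-    """
-
-    def __init__(self):
-        """
-        Create an empty VarTable.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.VarTable)  # type: ignore # pylint: disable=no-member
-
-    def define(self, obj: Object, name_hint: str, object_path: ObjectPath, frame: Frame) -> IdDoc:
-        """
-        Define a variable by name.
-
-        Parameters
-        ----------
-        obj : Object
-            The variable object.
-        name_hint : str
-            The hint for variable name.
-        object_path : ObjectPath
-            The object path to be associated with the returned ExprDoc.
-        frame : Frame
-            Then frame that this variable is defined in.
-
-        Returns
-        -------
-        doc : IdDoc
-            The doc for this variable.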
- """ - return _ffi_api.VarTableDefine(self, obj, name_hint, object_path, frame) # type: ignore # pylint: disable=no-member - - def define_by_doc(self, obj: Object, doc_factory: Callable[[], ExprDoc], frame: Frame) -> None: - """ - Define a variable by ExprDoc. - - Parameters - ---------- - obj : Object - The variable object. - doc_factory : Callable[[], ExprDoc] - The hint for variable name. - frame : Frame - Then frame that this variable is defined in. - - Returns - ------- - None - """ - _ffi_api.VarTableDefineByDoc(self, obj, doc_factory, frame) # type: ignore # pylint: disable=no-member - - def get_var_doc(self, obj: Object, object_path: ObjectPath) -> Optional[ExprDoc]: - """ - Get the doc for a variable. - - Parameters - ---------- - obj : Object - The variable object. - object_path : ObjectPath - The object path to be associated with the returned ExprDoc. - - Returns - ------- - doc : ExprDoc - The doc for this variable. - """ - return _ffi_api.VarTableGetVarDoc(self, obj, object_path) # type: ignore # pylint: disable=no-member - - def is_var_defined(self, obj: Object) -> bool: - """ - Check whether a variable is defined. - - Parameters - ---------- - obj : Object - The variable object. - - Returns - ------- - is_defined : bool - Whether the variable is defined. - """ - return _ffi_api.VarTableIsVarDefined(self, obj) # type: ignore # pylint: disable=no-member - - def __contains__(self, obj: Object) -> bool: - return self.is_var_defined(obj) diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc index 1ca7ced8e8a7..f41b40c92cc9 100644 --- a/src/script/printer/doc.cc +++ b/src/script/printer/doc.cc @@ -27,18 +27,12 @@ namespace printer { ExprDoc ExprDocNode::Attr(String attr) const { return AttrAccessDoc(GetRef(this), attr); } -ExprDoc ExprDocNode::Attr(TracedObject attr) const { - auto doc = AttrAccessDoc(GetRef(this), attr.Get()); - doc->source_paths.push_back(attr.GetPath()); - return std::move(doc); -} - ExprDoc ExprDocNode::operator[](Array indices) const { return IndexDoc(GetRef(this), indices); } ExprDoc ExprDocNode::Call(Array args) const { - return CallDoc(GetRef(this), args, {}, {}); + return CallDoc(GetRef(this), args, Array(), Array()); } ExprDoc ExprDocNode::Call(Array args, Array kwargs_keys, @@ -258,7 +252,7 @@ TVM_REGISTER_GLOBAL("script.printer.StmtBlockDoc").set_body_typed([](Array(LiteralDoc::None); TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt") - .set_body_typed(LiteralDoc::Int); + .set_body_typed(LiteralDoc::Int); TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean") .set_body_typed(LiteralDoc::Boolean); TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat") diff --git a/src/script/printer/base_doc_printer.cc b/src/script/printer/doc_printer/base_doc_printer.cc similarity index 100% rename from src/script/printer/base_doc_printer.cc rename to src/script/printer/doc_printer/base_doc_printer.cc diff --git a/src/script/printer/base_doc_printer.h b/src/script/printer/doc_printer/base_doc_printer.h similarity index 97% rename from src/script/printer/base_doc_printer.h rename to src/script/printer/doc_printer/base_doc_printer.h index f3fb24d946e1..db1d733d96ad 100644 --- a/src/script/printer/base_doc_printer.h +++ b/src/script/printer/doc_printer/base_doc_printer.h @@ -16,11 +16,10 @@ * specific language governing permissions and limitations * under the License. 
*/ -#ifndef TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ -#define TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ +#ifndef TVM_SCRIPT_PRINTER_DOC_PRINTER_BASE_DOC_PRINTER_H_ +#define TVM_SCRIPT_PRINTER_DOC_PRINTER_BASE_DOC_PRINTER_H_ #include -#include #include #include @@ -287,4 +286,4 @@ class DocPrinter { } // namespace script } // namespace tvm -#endif // TVM_SCRIPT_PRINTER_BASE_DOC_PRINTER_H_ +#endif // TVM_SCRIPT_PRINTER_DOC_PRINTER_BASE_DOC_PRINTER_H_ diff --git a/src/script/printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc similarity index 98% rename from src/script/printer/python_doc_printer.cc rename to src/script/printer/doc_printer/python_doc_printer.cc index 753f907c423c..6851baf63866 100644 --- a/src/script/printer/python_doc_printer.cc +++ b/src/script/printer/doc_printer/python_doc_printer.cc @@ -21,10 +21,11 @@ #include #include +#include #include -#include "../../support/str_escape.h" -#include "../../support/utils.h" +#include "../../../support/str_escape.h" +#include "../../../support/utils.h" #include "./base_doc_printer.h" namespace tvm { @@ -294,7 +295,11 @@ void PythonDocPrinter::PrintTypedDoc(const LiteralDoc& doc) { } else if (const auto* float_imm = value.as()) { // TODO(yelite): Make float number printing roundtrippable output_.precision(17); - output_ << float_imm->value; + if (std::isinf(float_imm->value) || std::isnan(float_imm->value)) { + output_ << '"' << float_imm->value << '"'; + } else { + output_ << float_imm->value; + } } else if (const auto* string_obj = value.as()) { output_ << "\"" << support::StrEscape(string_obj->data, string_obj->size) << "\""; } else { diff --git a/src/script/printer/frame.cc b/src/script/printer/frame.cc deleted file mode 100644 index b342c7c886c7..000000000000 --- a/src/script/printer/frame.cc +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -MetadataFrame::MetadataFrame() : MetadataFrame(make_object()) {} - -VarDefFrame::VarDefFrame() : VarDefFrame(make_object()) {} - -TVM_REGISTER_NODE_TYPE(FrameNode); -TVM_REGISTER_GLOBAL("script.printer.FrameAddExitCallback") - .set_body_typed([](Frame frame, runtime::TypedPackedFunc callback) { - frame->AddExitCallback(callback); - }); -TVM_REGISTER_GLOBAL("script.printer.FrameEnterWithScope") - .set_body_method(&FrameNode::EnterWithScope); -TVM_REGISTER_GLOBAL("script.printer.FrameExitWithScope") - .set_body_method(&FrameNode::ExitWithScope); - -TVM_REGISTER_NODE_TYPE(MetadataFrameNode); -TVM_REGISTER_GLOBAL("script.printer.MetadataFrame").set_body_typed([]() { - return MetadataFrame(); -}); - -TVM_REGISTER_NODE_TYPE(VarDefFrameNode); -TVM_REGISTER_GLOBAL("script.printer.VarDefFrame").set_body_typed([]() { return VarDefFrame(); }); - -} // namespace printer -} // namespace script -} // namespace tvm diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc new file mode 100644 index 000000000000..c4ecf92e9116 --- /dev/null +++ b/src/script/printer/ir/ir.cc @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+#include "./utils.h"
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+TVM_REGISTER_NODE_TYPE(IRFrameNode);
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<IRModule>("", [](IRModule mod, ObjectPath p, IRDocsifier d) -> Doc {
+      std::vector<std::pair<GlobalVar, BaseFunc>> functions{mod->functions.begin(),
+                                                            mod->functions.end()};
+      // print "main" first
+      std::sort(functions.begin(), functions.end(), [](const auto& lhs, const auto& rhs) {
+        String lhs_name = lhs.first->name_hint;
+        String rhs_name = rhs.first->name_hint;
+        if (lhs_name == "main") {
+          lhs_name = "";
+        }
+        if (rhs_name == "main") {
+          rhs_name = "";
+        }
+        return lhs_name < rhs_name;
+      });
+      ICHECK(!d->mod.defined());
+      d->mod = mod;
+      {
+        With<IRFrame> f(d);
+        (*f)->AddDispatchToken(d, "ir");
+        for (const auto& kv : functions) {
+          GlobalVar gv = kv.first;
+          BaseFunc func = kv.second;
+          (*f)->stmts.push_back(d->AsDoc<StmtDoc>(func, p->Attr("functions")->MapValue(gv)));
+        }
+        return ClassDoc(IdDoc("Module"), {IR(d)}, (*f)->stmts);
+      }
+    });
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<DictAttrs>("", [](DictAttrs attrs, ObjectPath p, IRDocsifier d) -> Doc {
+      return d->AsDoc<Doc>(attrs->dict, p->Attr("dict"));
+    });
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<GlobalVar>("", [](GlobalVar gv, ObjectPath p, IRDocsifier d) -> Doc {
+      return IdDoc("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint)});
+    });
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<Op>("", [](Op op, ObjectPath p, IRDocsifier d) -> Doc {
+      return IdDoc("Op")->Call({LiteralDoc::Str(op->name)});
+    });
+
+}  // namespace printer
+}  // namespace script
+}  // namespace tvm
diff --git a/src/script/printer/ir/misc.cc b/src/script/printer/ir/misc.cc
new file mode 100644
index 000000000000..bd2792167194
--- /dev/null
+++ b/src/script/printer/ir/misc.cc
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "./utils.h"
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<String>("", [](String s, ObjectPath p, IRDocsifier d) -> Doc {
+      return LiteralDoc::Str(s);
+    });
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<Array<ObjectRef>>(  //
+        "", [](Array<ObjectRef> array, ObjectPath p, IRDocsifier d) -> Doc {
+          int n = array.size();
+          Array<ExprDoc> results;
+          results.reserve(n);
+          for (int i = 0; i < n; ++i) {
+            results.push_back(d->AsDoc<ExprDoc>(array[i], p->ArrayIndex(i)));
+          }
+          return ListDoc(results);
+        });
+
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
+    .set_dispatch<Map<ObjectRef, ObjectRef>>(  //
+        "", [](Map<ObjectRef, ObjectRef> dict, ObjectPath p, IRDocsifier d) -> Doc {
+          using POO = std::pair<ObjectRef, ObjectRef>;
+          std::vector<POO> items{dict.begin(), dict.end()};
+          bool is_str_map = true;
+          for (const auto& kv : items) {
+            if (!kv.first.as<runtime::StringObj>()) {
+              is_str_map = false;
+              break;
+            }
+          }
+          if (is_str_map) {
+            std::sort(items.begin(), items.end(), [](const POO& lhs, const POO& rhs) {
+              return Downcast<String>(lhs.first) < Downcast<String>(rhs.first);
+            });
+          } else {
+            std::sort(items.begin(), items.end(), [](const POO& lhs, const POO& rhs) {
+              return lhs.first.get() < rhs.first.get();
+            });
+          }
+          int n = dict.size();
+          Array<ExprDoc> ks;
+          Array<ExprDoc> vs;
+          ks.reserve(n);
+          vs.reserve(n);
+          for (int i = 0; i < n; ++i) {
+            ks.push_back(d->AsDoc<ExprDoc>(items[i].first, p->MissingMapEntry()));
+            vs.push_back(d->AsDoc<ExprDoc>(items[i].second, p->MapValue(items[i].first)));
+          }
+          return DictDoc(ks, vs);
+        });
+
+}  // namespace printer
+}  // namespace script
+}  // namespace tvm
diff --git a/src/script/printer/ir/utils.h b/src/script/printer/ir/utils.h
new file mode 100644
index 000000000000..4065b895c1bb
--- /dev/null
+++ b/src/script/printer/ir/utils.h
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_SCRIPT_PRINTER_IR_UTILS_H_
+#define TVM_SCRIPT_PRINTER_IR_UTILS_H_
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace tvm {
+namespace script {
+namespace printer {
+
+inline ExprDoc IR(const IRDocsifier& d) { return IdDoc("tvm")->Attr("script"); }
+
+class IRFrameNode : public FrameNode {
+ public:
+  void VisitAttrs(AttrVisitor* v) { FrameNode::VisitAttrs(v); }
+
+  static constexpr const char* _type_key = "script.printer.IRFrame";
+  TVM_DECLARE_FINAL_OBJECT_INFO(IRFrameNode, FrameNode);
+};
+
+class IRFrame : public Frame {
+ public:
+  explicit IRFrame(const IRDocsifier& d) {
+    ObjectPtr<IRFrameNode> n = make_object<IRFrameNode>();
+    n->stmts.clear();
+    n->d = d.get();
+    data_ = std::move(n);
+  }
+
+  TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRFrame, Frame, IRFrameNode);
+};
+
+}  // namespace printer
+}  // namespace script
+}  // namespace tvm
+
+#endif  // TVM_SCRIPT_PRINTER_IR_UTILS_H_
diff --git a/src/script/printer/ir_docsifier.cc b/src/script/printer/ir_docsifier.cc
index 7f032ec50269..8584f360312f 100644
--- a/src/script/printer/ir_docsifier.cc
+++ b/src/script/printer/ir_docsifier.cc
@@ -20,21 +20,136 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 
 namespace tvm {
 namespace script {
 namespace printer {
 
-Doc IRDocsifierNode::AsDocImpl(const TracedObject<ObjectRef>& obj) const {
-  return IRDocsifier::vtable()(dispatch_tokens.back(), obj, GetRef<IRDocsifier>(this));
+String GenerateUniqueName(std::string name_hint, std::unordered_set<String>* defined_names) {
+  for (char& c : name_hint) {
+    if (c != '_' && !std::isalnum(c)) {
+      c = '_';
+    }
+  }
+  std::string name = name_hint;
+  for (int i = 1; !defined_names->insert(name).second; ++i) {
+    name = name_hint + "_" + std::to_string(i);
+  }
+  return name;
+}
+
+IdDoc IRDocsifierNode::Define(const ObjectRef& obj, const Frame& frame, const String& name_hint) {
+  String name = GenerateUniqueName(name_hint, &this->defined_names);
+  DocCreator doc_factory = [name]() { return IdDoc(name); };
+  auto result = obj2info.insert({obj, VariableInfo{std::move(doc_factory), name}});
+  ICHECK(result.second) << "Duplicated object: " << obj;
+  IdDoc def_doc(name);
+  frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); });
+  return def_doc;
+}
+
+void IRDocsifierNode::Define(const ObjectRef& obj, const Frame& frame, DocCreator doc_factory) {
+  ICHECK(obj2info.find(obj) == obj2info.end()) << "Duplicated object: " << obj;
+  ICHECK(!doc_factory()->IsInstance<IdDocNode>())
+      << "IRDocsifierNode::Define cannot be used for a variable that's mapped to IdDoc.";
+  obj2info.insert({obj, VariableInfo{std::move(doc_factory), NullOpt}});
+  frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); });
+}
+
+Optional<ExprDoc> IRDocsifierNode::GetVarDoc(const ObjectRef& obj) const {
+  auto it = obj2info.find(obj);
+  if (it == obj2info.end()) {
+    return NullOpt;
+  }
+  return it->second.creator();
+}
+
+bool IRDocsifierNode::IsVarDefined(const ObjectRef& obj) const { return obj2info.count(obj); }
+
+void IRDocsifierNode::RemoveVar(const ObjectRef& obj) {
+  auto it = obj2info.find(obj);
+  ICHECK(it != obj2info.end()) << "No such object: " << obj;
+  if (it->second.name.defined()) {
+    defined_names.erase(it->second.name.value());
+  }
+  obj2info.erase(it);
+}
+
+void IRDocsifierNode::SetCommonPrefix(const ObjectRef& root,
+                                      runtime::TypedPackedFunc<bool(ObjectRef)> is_var) {
+  class Visitor : public AttrVisitor {
+   public:
+    inline void operator()(ObjectRef obj) { Visit("", &obj); }
+
+   private:
+    void Visit(const char* key, double* value) final {}
+    void Visit(const char* key, int64_t* value) final {}
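+    // NOTE (illustrative aside, not from the original change): the empty
+    // overloads in this group make the visitor skip POD-typed fields; only
+    // ObjectRef-typed fields can lead to variables. The ObjectRef overload
+    // below does the real work, maintaining `stack_` as the chain of
+    // ancestor objects while it recurses through arrays, maps and reflected
+    // attributes, and HandleVar intersects that chain into `common_prefix`.
+    // E.g. if a var v is reached via both [func, block_a, v] and
+    // [func, block_b, v], common_prefix[v] ends up as [func], i.e. v is
+    // defined at the function level.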
+ void Visit(const char* key, uint64_t* value) final {} + void Visit(const char* key, int* value) final {} + void Visit(const char* key, bool* value) final {} + void Visit(const char* key, std::string* value) final {} + void Visit(const char* key, void** value) final {} + void Visit(const char* key, DataType* value) final {} + void Visit(const char* key, runtime::NDArray* value) final {} + void Visit(const char* key, ObjectRef* value) final { + const Object* obj = value->get(); + if (obj == nullptr) { + return; + } + stack_.push_back(obj); + if (obj->IsInstance()) { + const ArrayNode* array = static_cast(obj); + for (ObjectRef element : *array) { + this->Visit("", &element); + } + } else if (obj->IsInstance()) { + const MapNode* map = static_cast(obj); + for (std::pair kv : *map) { + this->Visit("", &kv.first); + this->Visit("", &kv.second); + } + } else { + vtable_->VisitAttrs(const_cast(obj), this); + } + if (is_var(GetRef(obj))) { + HandleVar(obj); + } + stack_.pop_back(); + } + + void HandleVar(const Object* var) { + if (common_prefix.count(var) == 0) { + common_prefix[var] = stack_; + return; + } + std::vector& a = common_prefix[var]; + std::vector& b = stack_; + int n = std::min(a.size(), b.size()); + for (int i = 0; i < n; ++i) { + if (a[i] != b[i]) { + a.resize(i); + break; + } + } + } + + ReflectionVTable* vtable_ = ReflectionVTable::Global(); + std::vector stack_; + + public: + runtime::TypedPackedFunc is_var; + std::unordered_map> common_prefix; + }; + Visitor visitor; + visitor.is_var = is_var; + visitor(root); + this->common_prefix = std::move(visitor.common_prefix); } IRDocsifier::IRDocsifier(Map ir_prefix) { auto n = make_object(); n->ir_prefix = std::move(ir_prefix); - n->dispatch_tokens.push_back(kDefaultDispatchToken); + n->dispatch_tokens.push_back(""); data_ = std::move(n); } @@ -43,65 +158,8 @@ IRDocsifier::FType& IRDocsifier::vtable() { return inst; } -RootNodeContainer::RootNodeContainer(ObjectRef root_node) { - auto n = make_object(); - n->root_node = std::move(root_node); - data_ = std::move(n); -} - -// Add a default dispatch for the RootNodeContainer to throw error. -// To add implementation for a new IR, RootNodeContainer needs to be -// registered under the dispatch token of that IR, like: -// \code -// TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) -// .set_dispatch("relax", [](TracedObject obj, IRDocsifier p) { -// const ObjectRef& root_node = obj.Get()->root_node; -// \\ More specialized logic for your IR. 
-// return p->AsDoc(MakeTraced(root_node)); -// }); -// \endcode -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch([](TracedObject obj, IRDocsifier p) -> Doc { - String top_dispatch_token = p->dispatch_tokens.back(); - ICHECK_NE(top_dispatch_token, ""); - ICHECK(false) << "Printing IR " << top_dispatch_token << " is not implemented."; - throw; - }); - +TVM_REGISTER_NODE_TYPE(FrameNode); TVM_REGISTER_NODE_TYPE(IRDocsifierNode); -TVM_REGISTER_GLOBAL("script.printer.IRDocsifier").set_body_typed([](Map ir_prefix) { - return IRDocsifier(ir_prefix); -}); -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierAsDoc") - .set_body_typed([](IRDocsifier p, ObjectRef obj, ObjectPath obj_path) { - return p->AsDoc(MakeTraced(obj, obj_path)); - }); - -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierPushDispatchToken") - .set_body_typed([](IRDocsifier p, String token) { p->dispatch_tokens.push_back(token); }); -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierPopDispatchToken").set_body_typed([](IRDocsifier p) { - p->dispatch_tokens.pop_back(); -}); - -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierPushFrame") - .set_body_typed([](IRDocsifier p, Frame frame) { p->frames.push_back(frame); }); -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierPopFrame").set_body_typed([](IRDocsifier p) { - p->frames.pop_back(); -}); - -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierSetDispatch") - .set_body_typed([](String token, uint64_t type_index, runtime::PackedFunc f) { - IRDocsifier::vtable().set_dispatch(token, type_index, std::move(f)); - }); -TVM_REGISTER_GLOBAL("script.printer.IRDocsifierRemoveDispatch") - .set_body_typed([](String token, uint64_t type_index) { - IRDocsifier::vtable().remove_dispatch(token, type_index); - }); - -TVM_REGISTER_NODE_TYPE(RootNodeContainerNode); -TVM_REGISTER_GLOBAL("script.printer.RootNodeContainer").set_body_typed([](ObjectRef root_node) { - return RootNodeContainer(root_node); -}); } // namespace printer } // namespace script diff --git a/src/script/printer.cc b/src/script/printer/printer.cc similarity index 57% rename from src/script/printer.cc rename to src/script/printer/printer.cc index 051b774ba6ac..47fd0b89b09e 100644 --- a/src/script/printer.cc +++ b/src/script/printer/printer.cc @@ -16,38 +16,28 @@ * specific language governing permissions and limitations * under the License. 
*/ - #include -#include -#include -#include -#include -#include +#include namespace tvm { namespace script { namespace printer { -String Script( // - const ObjectRef& root_node, // - String ir_name, // - Map ir_prefix, // - int indent_spaces, // - bool print_line_numbers, // - int num_context_lines, // - Optional path_to_underline // -) { - IRDocsifier ir_docsifier(ir_prefix); - - auto dispatch_ctx = ir_docsifier->WithDispatchToken(ir_name); - - Doc doc = ir_docsifier->AsDoc(MakeTraced(RootNodeContainer(root_node))); - +String Script(ObjectRef obj, Map ir_prefix, int indent_spaces, + bool print_line_numbers, int num_context_lines, + Optional path_to_underline) { + IRDocsifier d(ir_prefix); + Doc doc = d->AsDoc(obj, ObjectPath::Root()); return DocToPythonScript(doc, indent_spaces, print_line_numbers, num_context_lines, path_to_underline); } -TVM_REGISTER_GLOBAL("script.printer.Script").set_body_typed(&Script); +Default* Default::Instance() { + static Default inst; + return &inst; +} + +TVM_REGISTER_GLOBAL("script.printer.Script").set_body_typed(Script); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc new file mode 100644 index 000000000000..f6dbf616a5a3 --- /dev/null +++ b/src/script/printer/tir/block.cc @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // + Optional opt_realize, Optional opt_realize_p) { + With frame(d, block); + ICHECK_EQ(opt_realize.defined(), opt_realize_p.defined()); + const tir::BlockRealizeNode* realize = opt_realize.value().get(); + const ObjectPathNode* realize_p = opt_realize_p.get(); + // Step 1. 
Handle block var and block bindings + int n_vars = block->iter_vars.size(); + for (int i = 0; i < n_vars; ++i) { + tir::IterVar iter_var = block->iter_vars[i]; + ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); + ExprDoc rhs = TIR(d)->Attr("axis"); + if (iter_var->iter_type == tir::IterVarType::kDataPar) { + rhs = rhs->Attr("spatial"); + } else if (iter_var->iter_type == tir::IterVarType::kCommReduce) { + rhs = rhs->Attr("reduce"); + } else if (iter_var->iter_type == tir::IterVarType::kOrdered) { + rhs = rhs->Attr("scan"); + } else if (iter_var->iter_type == tir::IterVarType::kOpaque) { + rhs = rhs->Attr("opaque"); + } else { + LOG(FATAL) << "ValueError: Unknown IterVarType in block signature: " + << tir::IterVarType2String(iter_var->iter_type); + } + ExprDoc dom{nullptr}; + if (tir::is_zero(iter_var->dom->min)) { + ExprDoc extent = d->AsDoc(iter_var->dom->extent, // + iter_var_p->Attr("dom")->Attr("extent")); + dom = extent; + } else { + ExprDoc min = d->AsDoc(iter_var->dom->min, iter_var_p->Attr("dom")->Attr("min")); + ExprDoc max = d->AsDoc(iter_var->dom->min + iter_var->dom->extent, + iter_var_p->Attr("dom")->Attr("extent")); + dom = TupleDoc({min, max}); + } + if (realize) { + ExprDoc binding = d->AsDoc(realize->iter_values[i], // + realize_p->Attr("iter_values")->ArrayIndex(i)); + rhs = rhs->Call({dom, binding}); + } else { + rhs = rhs->Call({dom}); + } + (*frame)->stmts.push_back(AssignDoc(DefineVar(iter_var->var, *frame, d), rhs, NullOpt)); + } + // Step 2. Handle block predicate + if (realize) { + ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool()); + if (!tir::is_one(realize->predicate)) { + (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("where")->Call( + {d->AsDoc(realize->predicate, realize_p->Attr("predicate"))}))); + } + } + // Step 3. Handle block read/write regions + { + Array reads; + for (int i = 0, n = block->reads.size(); i < n; ++i) { + reads.push_back(d->AsDoc(block->reads[i], block_p->Attr("reads")->ArrayIndex(i))); + } + (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("reads")->Call(reads))); + Array writes; + for (int i = 0, n = block->writes.size(); i < n; ++i) { + writes.push_back(d->AsDoc(block->writes[i], block_p->Attr("writes")->ArrayIndex(i))); + } + (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("writes")->Call(writes))); + } + // Step 4. Handle block attributes + if (!block->annotations.empty()) { + (*frame)->stmts.push_back(ExprStmtDoc( + TIR(d) + ->Attr("block_attr") + ->Call({d->AsDoc(block->annotations, block_p->Attr("annotations"))}))); + } + // Step 5. Handle `alloc_buffer` + for (int i = 0, n = block->alloc_buffers.size(); i < n; ++i) { + tir::Buffer buffer = block->alloc_buffers[i]; + ObjectPath buffer_p = block_p->Attr("alloc_buffers")->ArrayIndex(i); + IdDoc lhs = DefineBuffer(buffer, *frame, d); + ExprDoc rhs = BufferDecl(buffer, "alloc_buffer", {}, buffer_p, *frame, d); + (*frame)->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + // Step 6. Handle `match_buffer` + for (int i = 0, n = block->match_buffers.size(); i < n; ++i) { + tir::MatchBufferRegion buffer_region = block->match_buffers[i]; + ObjectPath buffer_region_p = block_p->Attr("match_buffers")->ArrayIndex(i); + StmtDoc doc = d->AsDoc(buffer_region, buffer_region_p); + (*frame)->stmts.push_back(doc); + } + // Step 7. 
Handle init block + if (block->init.defined()) { + tir::Stmt init = block->init.value(); + With init_frame(d, init); + AsDocBody(init, block_p->Attr("init"), init_frame->get(), d); + (*frame)->stmts.push_back( + ScopeDoc(NullOpt, TIR(d)->Attr("init")->Call({}), (*init_frame)->stmts)); + } + // Step 8. Handle block body + AsDocBody(block->body, block_p->Attr("body"), frame->get(), d); + return ScopeDoc(NullOpt, TIR(d)->Attr("block")->Call({LiteralDoc::Str(block->name_hint)}), + (*frame)->stmts); +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::BlockRealize realize, ObjectPath p, IRDocsifier d) -> Doc { + return PrintBlock(d, realize->block, p->Attr("block"), realize, p); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Block block, ObjectPath p, IRDocsifier d) -> Doc { + return PrintBlock(d, block, p, NullOpt, NullOpt); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::MatchBufferRegion stmt, ObjectPath p, IRDocsifier d) -> Doc { + Frame frame = d->frames.back(); + ExprDoc lhs = DefineBuffer(stmt->buffer, frame, d); + ExprDoc src_buffer = d->AsDoc(stmt->source, p->Attr("source")); + ExprDoc rhs = BufferDecl(stmt->buffer, "match_buffer", {src_buffer}, p->Attr("buffer"), + d->frames.back(), d); + return AssignDoc(lhs, rhs, NullOpt); + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc new file mode 100644 index 000000000000..3e1d71af4acd --- /dev/null +++ b/src/script/printer/tir/buffer.cc @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include + +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +Map BufferAttrs(const tir::Buffer& buffer, const ObjectPath& p, const Frame& frame, + const IRDocsifier& d) { + Map kwargs; + auto implicit_var_def = [&](const PrimExpr& e, const ObjectPath& p, const String& key) { + if (Optional doc = d->GetVarDoc(e)) { + kwargs.Set(key, doc.value()); + return false; + } + if (e->IsInstance()) { + d->Define(e, frame, [=]() { return d->AsDoc(buffer, p)->Attr(key); }); + return true; + } + kwargs.Set(key, d->AsDoc(e, p)); + return false; + }; + auto array_out_line_var_def = [&](const Array& array, const ObjectPath& p, + const String& key) { + int n = array.size(); + Array results; + results.reserve(n); + for (int i = 0; i < n; ++i) { + PrimExpr s = array[i]; + ObjectPath s_path = p->ArrayIndex(i); + // Add out-of-line definition for a new Var in shape + results.push_back(d->AsDoc(s, s_path)); + } + kwargs.Set(key, TupleDoc(results)); + }; + // Step 1. Handle `buffer.shape` + array_out_line_var_def(buffer->shape, p->Attr("shape"), "shape"); + // Step 2. 
Handle `buffer.dtype` + if (buffer->dtype != Default::BufferDType()) { + kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype)); + } + // Step 3. Handle `buffer.data` + implicit_var_def(buffer->data, p->Attr("data"), "data"); + // Step 4. Handle `buffer.strides` + if (!buffer->strides.empty()) { + array_out_line_var_def(buffer->strides, p->Attr("strides"), "strides"); + } + // Step 5. Handle `buffer.elem_offset` + bool needs_print_factor = false; + if (const auto* int_imm = buffer->elem_offset.as()) { + if (int_imm->value != 0) { + kwargs.Set("elem_offset", d->AsDoc(buffer->elem_offset, p->Attr("elem_offset"))); + } + } else { + needs_print_factor = + implicit_var_def(buffer->elem_offset, p->Attr("elem_offset"), "elem_offset"); + } + // Step 6. Handle `buffer.scope` + { + String scope = buffer.scope(); + if (scope != "global") { + kwargs.Set("scope", LiteralDoc::Str(scope)); + } + } + // Step 7. Handle `buffer.data_alignment` + if (buffer->data_alignment != runtime::kAllocAlignment) { + kwargs.Set("align", LiteralDoc::Int(buffer->data_alignment)); + } + // Step 8. Handle `buffer.offset_factor` + if (needs_print_factor || buffer->offset_factor != 1) { + kwargs.Set("offset_factor", LiteralDoc::Int(buffer->offset_factor)); + } + // Step 9. Handle `buffer.buffer_type` + if (buffer->buffer_type != tir::BufferType::kDefault) { + kwargs.Set("type", LiteralDoc::Str("auto")); + } + // Step 10. Handle `buffer.axis_separator` + if (!buffer->axis_separators.empty()) { + kwargs.Set("axis_separators", + d->AsDoc(buffer->axis_separators, p->Attr("axis_separators"))); + } + return kwargs; +} + +ExprDoc BufferCall(const ExprDoc& prefix, const Map& attrs, Array args) { + Array kwargs_keys; + Array kwargs_values; + for (String s : {"shape", "dtype"}) { + if (Optional doc = attrs.Get(s)) { + args.push_back(doc.value()); + } + } + for (String s : {"data", "strides", "elem_offset", "scope", "align", "offset_factor", "type", + "axis_separators"}) { + if (Optional doc = attrs.Get(s)) { + kwargs_keys.push_back(s); + kwargs_values.push_back(doc.value()); + } + } + return prefix->Call(args, kwargs_keys, kwargs_values); +} + +ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array& args, + const ObjectPath& p, const Frame& frame, const IRDocsifier& d) { + return BufferCall(/*prefix=*/TIR(d)->Attr(method), + /*attrs=*/BufferAttrs(buffer, p, frame, d), + /*args=*/args); +} + +Doc BufferIndex(const PrimExpr& index, const ObjectPath& p, const IRDocsifier& d) { + if (const auto* ramp = index.as()) { + if (const auto* stride = ramp->stride.as()) { + ExprDoc start = d->AsDoc(ramp->base, p->Attr("base")); + ExprDoc stop = d->AsDoc(ramp->base + ramp->lanes * ramp->stride, p->Attr("lanes")); + Optional step = NullOpt; + if (stride->value != 1) { + step = d->AsDoc(ramp->stride, p->Attr("stride")); + } + return SliceDoc(start, stop, step); + } + } + return d->AsDoc(index, p); +} + +ExprDoc BufferIndices(const tir::Buffer& buffer, const Array& indices, + const ObjectPath& p, const IRDocsifier& d) { + int n = indices.size(); + Array indices_doc; + indices_doc.reserve(n); + for (int i = 0; i < n; ++i) { + indices_doc.push_back(BufferIndex(indices[i], p->Attr("indices")->ArrayIndex(i), d)); + } + return d->AsDoc(buffer, p->Attr("buffer"))[indices_doc]; +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::BufferRegion buffer_region, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc prefix = d->AsDoc(buffer_region->buffer, p->Attr("buffer")); + p = p->Attr("region"); + Array region = 
buffer_region->region; + int n = region.size(); + Array indices; + indices.reserve(n); + for (int i = 0; i < n; ++i) { + Range range = region[i]; + ExprDoc min = d->AsDoc(range->min, p->ArrayIndex(i)->Attr("min")); + if (tir::is_one(range->extent)) { + indices.push_back(min); + } else { + ExprDoc max = + d->AsDoc(range->min + range->extent, p->ArrayIndex(i)->Attr("extent")); + indices.push_back(SliceDoc(min, max, NullOpt)); + } + } + return prefix[indices]; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::BufferStore store, ObjectPath p, IRDocsifier d) -> Doc { + return AssignDoc(/*lhs=*/BufferIndices(store->buffer, store->indices, p, d), + /*rhs=*/d->AsDoc(store->value, p->Attr("value")), NullOpt); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::BufferLoad load, ObjectPath p, IRDocsifier d) -> Doc { + return BufferIndices(load->buffer, load->indices, p, d); + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/expr.cc b/src/script/printer/tir/expr.cc new file mode 100644 index 000000000000..f9b4eb621447 --- /dev/null +++ b/src/script/printer/tir/expr.cc @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include + +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +Doc PrintVar(const tir::Var& var, const ObjectPath& p, const IRDocsifier& d) { + if (!d->IsVarDefined(var)) { + if (Optional opt_f = FindLowestVarDef(var, d)) { + ExprDoc lhs = DefineVar(var, opt_f.value(), d); + Type type = var->type_annotation; + if (const auto* ptr_type = type.as()) { + ICHECK(ptr_type->element_type->IsInstance()); + ExprDoc rhs = d->AsDoc(type, p->Attr("type_annotation")); + opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } else { + ExprDoc rhs = TIR(d)->Attr("var")->Call({LiteralDoc::DataType(var->dtype)}); + opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + } + } + if (Optional doc = d->GetVarDoc(var)) { + return doc.value(); + } + LOG(FATAL) << "IndexError: Variable is not defined in the environment: " << var; +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // + .set_dispatch("", [](tir::Var var, ObjectPath p, IRDocsifier d) -> Doc { + return PrintVar(var, p, d); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // + .set_dispatch("", [](tir::SizeVar var, ObjectPath p, IRDocsifier d) -> Doc { + return PrintVar(var, p, d); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::IterVar var, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d) + ->Attr("iter_var") + ->Call({ + d->AsDoc(var->var, p->Attr("var")), + d->AsDoc(var->dom, p->Attr("dom")), + LiteralDoc::Str(IterVarType2String(var->iter_type)), + LiteralDoc::Str(var->thread_tag), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // + .set_dispatch("", [](tir::Buffer buffer, ObjectPath p, IRDocsifier d) -> Doc { + if (!d->IsVarDefined(buffer)) { + if (Optional opt_f = FindLowestVarDef(buffer, d)) { + ExprDoc lhs = DefineBuffer(buffer, opt_f.value(), d); + ExprDoc rhs = BufferDecl(buffer, "buffer_decl", // TODO(@junrushao): name confusing + {}, p, opt_f.value(), d); + opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + } + if (Optional doc = d->GetVarDoc(buffer)) { + return doc.value(); + } + LOG(FATAL) << "IndexError: Buffer is not defined in the environment: " << buffer; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Not node, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc a = d->AsDoc(node->a, p->Attr("a")); + if (a->IsInstance()) { + return TIR(d)->Attr("Not")->Call({a}); + } + return OperationDoc(OperationDocNode::Kind::kNot, {a}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::StringImm s, ObjectPath p, IRDocsifier d) -> Doc { + return d->AsDoc(s->value, p->Attr("value")); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Cast cast, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc dtype = LiteralDoc::DataType(cast->dtype); + ExprDoc value = d->AsDoc(cast->value, p->Attr("value")); + return TIR(d)->Attr("Cast")->Call({dtype, value}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Select select, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("Select")->Call({ + d->AsDoc(select->condition, p->Attr("condition")), + d->AsDoc(select->true_value, p->Attr("true_value")), + d->AsDoc(select->false_value, p->Attr("false_value")), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Ramp ramp, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("Ramp")->Call({ + d->AsDoc(ramp->base, p->Attr("base")), + d->AsDoc(ramp->stride, p->Attr("stride")), + 
LiteralDoc::Int(ramp->lanes), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Broadcast bc, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d) + ->Attr("Broadcast") + ->Call({ + d->AsDoc(bc->value, p->Attr("value")), + LiteralDoc::Int(bc->lanes), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::Shuffle shuffle, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("Shuffle")->Call({ + d->AsDoc(shuffle->vectors, p->Attr("vectors")), + d->AsDoc(shuffle->indices, p->Attr("indices")), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::CommReducer r, ObjectPath p, IRDocsifier d) -> Doc { + ICHECK_EQ(r->lhs.size(), r->rhs.size()); + LambdaDoc lambda{nullptr}; + { + With f(d, r); + int n_vars = r->lhs.size(); + Array vars; + vars.reserve(n_vars + n_vars); + for (int i = 0; i < n_vars; ++i) { + vars.push_back(DefineVar(r->lhs[i], *f, d)); + } + for (int i = 0; i < n_vars; ++i) { + vars.push_back(DefineVar(r->rhs[i], *f, d)); + } + int n_results = r->result.size(); + Array results; + results.reserve(n_results); + for (int i = 0; i < n_results; ++i) { + results.push_back(d->AsDoc(r->result[i], p->Attr("result")->ArrayIndex(i))); + } + if (results.size() == 1) { + lambda = LambdaDoc(vars, results[0]); + } else { + lambda = LambdaDoc(vars, TupleDoc(results)); + } + } + ExprDoc id = d->AsDoc(r->identity_element, p->Attr("identity_element")); + return TIR(d)->Attr("comm_reducer")->Call({lambda, id}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Let let, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("let")->Call({ + d->AsDoc(let->var, p->Attr("var")), + d->AsDoc(let->value, p->Attr("value")), + d->AsDoc(let->body, p->Attr("body")), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Call call, ObjectPath p, IRDocsifier d) -> Doc { + static const OpAttrMap& op_names = + Op::GetAttrMap("TScriptPrinterName"); + static const std::unordered_set dtype_first_arg = { + tir::builtin::reinterpret().get(), + tir::builtin::call_extern().get(), + tir::builtin::call_llvm_intrin().get(), // + tir::builtin::call_llvm_pure_intrin().get(), // + tir::builtin::call_pure_extern().get(), // + tir::builtin::ptx_mma().get(), + tir::builtin::ptx_mma_sp().get(), + tir::builtin::ptx_ldmatrix().get(), + tir::builtin::ptx_cp_async().get(), + tir::builtin::mma_store().get(), + tir::builtin::mma_fill().get(), + tir::builtin::vectorlow().get(), + tir::builtin::vectorhigh().get(), + tir::builtin::vectorcombine().get(), + Op::Get("tir.type_annotation").get(), + }; + static const std::unordered_set dtype_last_arg = { + tir::builtin::tvm_struct_get().get(), + }; + ExprDoc prefix{nullptr}; + if (const auto* op = call->op.as()) { + String name = op_names[GetRef(op)]; + prefix = TIR(d)->Attr(name); + } else if (const auto* gv = call->op.as()) { + prefix = LiteralDoc::Str(gv->name_hint); + } else { + LOG(FATAL) << "call: " << call; + } + Array args; + int n_args = call->args.size(); + args.reserve(n_args + 1); + if (dtype_first_arg.count(call->op.get())) { + args.push_back(LiteralDoc::DataType(call->dtype)); + } + for (int i = 0; i < n_args; ++i) { + args.push_back(d->AsDoc(call->args[i], p->Attr("args")->ArrayIndex(i))); + } + if (dtype_last_arg.count(call->op.get())) { + args.push_back(LiteralDoc::DataType(call->dtype)); + } + return prefix->Call(args); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", 
[](tir::Any any, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("Any")->Call({}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Reduce r, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: Reduce should never exist in TIR: " << r; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::ProducerLoad load, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: ProducerLoad should never exist in TIR: " << load; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Load load, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: Load has been deprecated for BufferLoad: " << load; + }); + +#define TVM_SCRIPT_PRINTER_DEF_BINARY(NodeType, OpString) \ + TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) \ + .set_dispatch("", \ + [](tir::NodeType node, ObjectPath p, IRDocsifier d) -> Doc { \ + ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ + ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ + return TIR(d)->Attr(OpString)->Call({a, b}); \ + }); + +#define TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(NodeType, OpString, OpKind) \ + TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) \ + .set_dispatch( \ + "", [](tir::NodeType node, ObjectPath p, IRDocsifier d) -> Doc { \ + ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ + ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ + if (a->IsInstance() && b->IsInstance()) { \ + return TIR(d)->Attr(OpString)->Call({a, b}); \ + } \ + return OperationDoc(OperationDocNode::Kind::OpKind, {a, b}); \ + }); + +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(Add, "Add", kAdd); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(Sub, "Sub", kSub); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(Mul, "Mul", kMult); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(Div, "Div", kDiv); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(FloorDiv, "FloorDiv", kFloorDiv); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(FloorMod, "FloorMod", kMod); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(LT, "LT", kLt); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(LE, "LE", kLtE); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(EQ, "EQ", kEq); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(NE, "NE", kNotEq); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(GT, "GT", kGt); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(GE, "GE", kGtE); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(And, "And", kAnd); +TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(Or, "Or", kOr); + +TVM_SCRIPT_PRINTER_DEF_BINARY(Mod, "truncmod"); +TVM_SCRIPT_PRINTER_DEF_BINARY(Min, "min"); +TVM_SCRIPT_PRINTER_DEF_BINARY(Max, "max"); + +#undef TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR +#undef TVM_SCRIPT_PRINTER_DEF_BINARY + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/for_loop.cc b/src/script/printer/tir/for_loop.cc new file mode 100644 index 000000000000..6a375935bd79 --- /dev/null +++ b/src/script/printer/tir/for_loop.cc @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::For loop, ObjectPath p, IRDocsifier d) -> Doc { + // Step 1. Check syntactic sugar: `T.grid` + std::vector grid; + std::unordered_set grid_loop_vars; + auto f_var_dep = [&grid_loop_vars](const PrimExpr& e) -> bool { + return tir::UsesVar(e, [&grid_loop_vars](const tir::VarNode* v) -> bool { // + return grid_loop_vars.count(v); + }); + }; + for (const tir::ForNode* l = loop.get(); l != nullptr; l = l->body.as()) { + ICHECK(l->loop_var->dtype == l->min->dtype); + ICHECK(l->loop_var->dtype == l->extent->dtype); + if (l->kind != tir::ForKind::kSerial || // + !tir::is_zero(l->min) || // + !l->annotations.empty() || // + f_var_dep(l->extent)) { + break; + } + grid.push_back(l); + grid_loop_vars.insert(l->loop_var.get()); + } + With f(d, loop); + // Step 2. Construct `T.grid` + if (grid.size() > 1) { + int n = grid.size(); + Array lhs; + Array rhs; + lhs.reserve(n); + rhs.reserve(n); + for (int i = 0; i < n; ++i) { + const tir::ForNode* loop = grid[i]; + lhs.push_back(DefineVar(loop->loop_var, *f, d)); + rhs.push_back(d->AsDoc(loop->extent, p->Attr("extent"))); + p = p->Attr("body"); + } + AsDocBody(grid.back()->body, p, (*f).get(), d); + return ForDoc(TupleDoc(lhs), TIR(d)->Attr("grid")->Call(rhs), (*f)->stmts); + } + // Step 3. 
If not `T.grid`, print loop kind accordingly + IdDoc lhs = DefineVar(loop->loop_var, *f, d); + Optional min = NullOpt; + Optional max = NullOpt; + Optional annotations = NullOpt; + Optional thread = NullOpt; + if (tir::is_zero(loop->min)) { + max = d->AsDoc(loop->extent, p->Attr("extent")); + } else { + min = d->AsDoc(loop->min, p->Attr("min")); + max = d->AsDoc(loop->min + loop->extent, p->Attr("extent")); + } + if (!loop->annotations.empty()) { + annotations = d->AsDoc(loop->annotations, p->Attr("annotations")); + } + ExprDoc prefix = TIR(d); + if (loop->kind == tir::ForKind::kSerial) { + if (loop->annotations.empty()) { + prefix = IdDoc("range"); + } else { + prefix = prefix->Attr("serial"); + } + } else if (loop->kind == tir::ForKind::kParallel) { + prefix = prefix->Attr("parallel"); + } else if (loop->kind == tir::ForKind::kUnrolled) { + prefix = prefix->Attr("unroll"); + } else if (loop->kind == tir::ForKind::kVectorized) { + prefix = prefix->Attr("vectorized"); + } else if (loop->kind == tir::ForKind::kThreadBinding) { + prefix = prefix->Attr("thread_binding"); + thread = LiteralDoc::Str(loop->thread_binding.value()->thread_tag); + } else { + LOG(FATAL) << "ValueError: Unknown ForKind: " << tir::ForKind2String(loop->kind); + } + Array args; + Array kwargs_keys; + Array kwargs_values; + if (min.defined()) { + args.push_back(min.value()); + } + if (max.defined()) { + args.push_back(max.value()); + } + if (thread.defined()) { + kwargs_keys.push_back("thread"); + kwargs_values.push_back(thread.value()); + } + if (annotations.defined()) { + kwargs_keys.push_back("annotations"); + kwargs_values.push_back(annotations.value()); + } + ExprDoc rhs = prefix->Call(args, kwargs_keys, kwargs_values); + AsDocBody(loop->body, p, (*f).get(), d); + return ForDoc(lhs, rhs, (*f)->stmts); + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc new file mode 100644 index 000000000000..d47a60209e43 --- /dev/null +++ b/src/script/printer/tir/function.cc @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +String FindFunctionName(const IRDocsifier& d, const tir::PrimFunc& f) { + if (!d->mod.defined()) { + return "main"; + } + for (const auto& kv : d->mod.value()->functions) { + if (kv.second.same_as(f)) { + return kv.first->name_hint; + } + } + return "main"; +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::PrimFunc func, ObjectPath p, IRDocsifier d) -> Doc { + d->SetCommonPrefix(func, [](const ObjectRef& obj) { + return obj->IsInstance() || obj->IsInstance(); + }); + With frame(d, func); + (*frame)->AddDispatchToken(d, "tir"); + int n_args = func->params.size(); + // Step 1. Handle `func->params` + Array args; + args.reserve(n_args); + for (int i = 0; i < n_args; ++i) { + tir::Var var = func->params[i]; + ObjectPath var_p = p->Attr("params")->ArrayIndex(i); + ExprDoc a = d->AsDoc(var->type_annotation, var_p->Attr("type_annotation")); + args.push_back(AssignDoc(DefineVar(var, *frame, d), NullOpt, a)); + } + // Step 2. Handle `func->attrs` + if (func->attrs.defined() && !func->attrs->dict.empty()) { + (*frame)->stmts.push_back( + ExprStmtDoc(TIR(d) + ->Attr("func_attr") // + ->Call({d->AsDoc(func->attrs, p->Attr("attrs"))}))); + } + // Step 3. Handle `func->buffer_map` + for (int i = 0; i < n_args; ++i) { + tir::Var param = func->params[i]; + if (func->buffer_map.count(param)) { + tir::Buffer buffer = func->buffer_map[param]; + ExprDoc param = args[i]->lhs; + ObjectPath buffer_p = p->Attr("buffer_map")->MapValue(param); + ExprDoc lhs = + DefineBuffer(buffer, *frame, d); // TODO(@junrushao): switch `lhs` and `rhs` + ExprDoc rhs = BufferDecl(buffer, "match_buffer", {param}, buffer_p, *frame, d); + (*frame)->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + } + // Step 4. Handle `func->body` + AsDocBody(func->body, p->Attr("body"), frame->get(), d); + return FunctionDoc( + /*name=*/IdDoc(FindFunctionName(d, func)), + /*args=*/args, + /*decorators=*/{TIR(d)->Attr("prim_func")}, + /*return_type=*/d->AsDoc(func->ret_type, p->Attr("ret_type")), + /*body=*/(*frame)->stmts); + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc new file mode 100644 index 000000000000..f4e3762fc022 --- /dev/null +++ b/src/script/printer/tir/ir.cc @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include + +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +TVM_REGISTER_NODE_TYPE(TIRFrameNode); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](IntImm imm, ObjectPath p, IRDocsifier d) -> Doc { + DataType dtype = imm->dtype; + if (dtype == Default::IntDType()) { + return LiteralDoc::Int(imm->value); + } else if (dtype == DataType::Bool()) { + return LiteralDoc::Boolean(imm->value); + } else { + return TIR(d) // + ->Attr(runtime::DLDataType2String(dtype)) + ->Call({LiteralDoc::Int(imm->value)}); + } + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](FloatImm imm, ObjectPath p, IRDocsifier d) -> Doc { + DataType dtype = imm->dtype; + if (dtype == Default::FloatDType()) { + return LiteralDoc::Float(imm->value); + } else { + return TIR(d) + ->Attr(runtime::DLDataType2String(dtype)) + ->Call({LiteralDoc::Float(imm->value)}); + } + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](Range range, ObjectPath p, IRDocsifier d) -> Doc { + return TIR(d)->Attr("Range")->Call({ + d->AsDoc(range->min, p->Attr("min")), + d->AsDoc(range->extent, p->Attr("extent")), + }); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](PrimType ty, ObjectPath p, IRDocsifier d) -> Doc { + std::string dtype = ty->dtype.is_void() ? "void" : runtime::DLDataType2String(ty->dtype); + return TIR(d)->Attr(dtype); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](PointerType ty, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc element_type = d->AsDoc(ty->element_type, p->Attr("element_type")); + if (ty->storage_scope == "") { + return TIR(d)->Attr("Ptr")->Call({element_type}); + } else { + return TIR(d)->Attr("Ptr")->Call({element_type, LiteralDoc::Str(ty->storage_scope)}); + } + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](TupleType ty, ObjectPath p, IRDocsifier d) -> Doc { + if (ty->fields.empty()) { + return LiteralDoc::None(); + } + return TIR(d) // + ->Attr("Tuple") + ->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](Target target, ObjectPath p, IRDocsifier d) -> Doc { + Map config = target->Export(); + return TIR(d)->Attr("target")->Call({d->AsDoc(config, p)}); + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc new file mode 100644 index 000000000000..03e5657d24b7 --- /dev/null +++ b/src/script/printer/tir/stmt.cc @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
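The IntImm/FloatImm dispatches above implement a simple dtype-suffixing rule: a literal of the default dtype prints bare, booleans print as Python booleans, and everything else is wrapped in a `T.<dtype>(...)` call. A minimal sketch under those assumptions (`PrintIntImm` is a hypothetical stand-in over plain strings, not part of this patch):

    #include <cstdint>
    #include <string>

    // Prints an integer immediate the way the dispatch above does: bare for
    // the default dtype, True/False for bool, T.<dtype>(value) otherwise.
    std::string PrintIntImm(int64_t value, const std::string& dtype,
                            const std::string& default_dtype) {
      if (dtype == default_dtype) return std::to_string(value);  // e.g. "5"
      if (dtype == "bool") return value ? "True" : "False";
      return "T." + dtype + "(" + std::to_string(value) + ")";   // e.g. "T.int8(5)"
    }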
+ */ +#include "../../../tir/transforms/ir_utils.h" +#include "./utils.h" + +namespace tvm { +namespace script { +namespace printer { + +Doc DoConciseScoping(const Optional& lhs, const ExprDoc& rhs, Array* stmts, + bool concise_scoping) { + if (concise_scoping) { + if (lhs.defined()) { + stmts->insert(stmts->begin(), AssignDoc(lhs.value(), rhs, NullOpt)); + } else { + stmts->insert(stmts->begin(), ExprStmtDoc(rhs)); + } + return StmtBlockDoc(*stmts); + } else { + return ScopeDoc(lhs, rhs, *stmts); + } +} + +bool AllowConciseScoping(const IRDocsifier& d) { + ICHECK(!d->frames.empty()); + if (const auto* f = d->frames.back().as()) { + return f->allow_concise_scoping; + } + LOG(FATAL) << "NotImplementedError: fragment printing"; +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::Evaluate eval, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc value = d->AsDoc(eval->value, p->Attr("value")); + if (eval->value->IsInstance()) { + return ExprStmtDoc(value); + } + return ExprStmtDoc(TIR(d)->Attr("evaluate")->Call({value})); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::LetStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + ExprDoc rhs = d->AsDoc(stmt->value, p->Attr("value")); + With f(d, stmt); + ExprDoc lhs = d->IsVarDefined(stmt->var) ? d->GetVarDoc(stmt->var).value() + : DefineVar(stmt->var, *f, d); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + Array* stmts = &(*f)->stmts; + if (concise) { + Type type = stmt->var->type_annotation; + Optional type_doc = + d->AsDoc(type, p->Attr("var")->Attr("type_annotation")); + if (const auto* tuple_type = type.as()) { + if (tuple_type->fields.empty()) { + type_doc = NullOpt; + } + } + stmts->insert(stmts->begin(), AssignDoc(lhs, rhs, type_doc)); + return StmtBlockDoc(*stmts); + } else { + rhs = TIR(d)->Attr("let")->Call({lhs, rhs}); + return ScopeDoc(NullOpt, rhs, *stmts); + } + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::AssertStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + ExprDoc cond = d->AsDoc(stmt->condition, p->Attr("condition")); + ExprDoc msg = d->AsDoc(stmt->message, p->Attr("message")); + With f(d, stmt); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + if (concise) { + Array* stmts = &(*f)->stmts; + stmts->insert(stmts->begin(), AssertDoc(cond, msg)); + return StmtBlockDoc(*stmts); + } + return ScopeDoc(NullOpt, TIR(d)->Attr("Assert")->Call({cond, msg}), (*f)->stmts); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::While stmt, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc cond = d->AsDoc(stmt->condition, p->Attr("condition")); + With f(d, stmt); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return WhileDoc(cond, (*f)->stmts); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::DeclBuffer stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + ExprDoc rhs = + BufferDecl(stmt->buffer, "decl_buffer", {}, p->Attr("buffer"), d->frames.back(), d); + With f(d, stmt); + ExprDoc lhs = DefineBuffer(stmt->buffer, *f, d); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::IfThenElse stmt, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc cond = d->AsDoc(stmt->condition, p->Attr("condition")); + Array 
then_branch; + Array else_branch; + if (stmt->then_case.defined()) { + With f(d, stmt->then_case); + AsDocBody(stmt->then_case, p->Attr("then_case"), f->get(), d); + then_branch = (*f)->stmts; + } + if (stmt->else_case.defined()) { + With f(d, stmt->else_case); + AsDocBody(stmt->else_case.value(), p->Attr("else_case"), f->get(), d); + else_branch = (*f)->stmts; + } + return IfDoc(cond, then_branch, else_branch); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](tir::SeqStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { + // TODO(@junrushao): revisit for fragment printing + With f(d, stmt); + AsDocBody(stmt, p, f->get(), d); + return StmtBlockDoc((*f)->stmts); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::Prefetch stmt, ObjectPath p, IRDocsifier d) -> Doc { + return ExprStmtDoc(TIR(d) + ->Attr("prefetch") + ->Call({ + d->AsDoc(stmt->buffer, p->Attr("buffer")), + d->AsDoc(stmt->bounds, p->Attr("bounds")), + })); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::Allocate stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + String storage_scope = tir::GetPtrStorageScope(stmt->buffer_var); + Array args; + Array kwargs_keys; + Array kwargs_values; + args.push_back(d->AsDoc(stmt->extents, p->Attr("extents"))); + args.push_back(LiteralDoc::DataType(stmt->dtype)); + args.push_back(LiteralDoc::Str(storage_scope)); + if (!tir::is_one(stmt->condition)) { + args.push_back(d->AsDoc(stmt->condition, p->Attr("condition"))); + } + if (!stmt->annotations.empty()) { + kwargs_keys.push_back("annotations"); + kwargs_values.push_back(d->AsDoc(stmt->annotations, p->Attr("annotations"))); + } + ExprDoc lhs = DefineVar(stmt->buffer_var, d->frames.back(), d); + With f(d, stmt); + ExprDoc rhs = TIR(d)->Attr("allocate")->Call(args, kwargs_keys, kwargs_values); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); + }); + +template +ExprDoc PrintNDArray(::tvm::runtime::NDArray arr) { + // FIXME(@junrushao): this is a hack and can be wrong in most of the cases + constexpr int NUM_PRINT = 200; + int ndim = arr->ndim; + int tot_dim = 1; + for (int i = 0; i < ndim; i++) { + tot_dim *= arr->shape[i]; + } + Array result; + T* data_ptr = reinterpret_cast(arr->data); + runtime::DataType dtype = arr.DataType(); + for (int i = 0; i < tot_dim; i++) { + if (dtype.is_float()) { + result.push_back(LiteralDoc::Float(data_ptr[i])); + } else { + result.push_back(LiteralDoc::Int(data_ptr[i])); + } + if (i == NUM_PRINT) { + break; + } + } + return ListDoc(result); +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::AllocateConst stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + String storage_scope = tir::GetPtrStorageScope(stmt->buffer_var); + Array args; + Array kwargs_keys; + Array kwargs_values; + ExprDoc data_doc{nullptr}; + if (stmt->dtype.is_int()) { + if (stmt->dtype.bits() == 8) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 16) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 32) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 64) { + data_doc = PrintNDArray(stmt->data.value()); + } else { + LOG(FATAL) << "DataType not supported"; + } + } else if (stmt->dtype.is_uint()) { + if (stmt->dtype.bits() == 8) { + data_doc = PrintNDArray(stmt->data.value()); + } else if 
(stmt->dtype.bits() == 16) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 32) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 64) { + data_doc = PrintNDArray(stmt->data.value()); + } else { + LOG(FATAL) << "DataType not supported"; + } + } else if (stmt->dtype.is_float()) { + if (stmt->dtype.bits() == 16) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 32) { + data_doc = PrintNDArray(stmt->data.value()); + } else if (stmt->dtype.bits() == 64) { + data_doc = PrintNDArray(stmt->data.value()); + } else { + LOG(FATAL) << "DataType not supported"; + } + } else { + LOG(FATAL) << "DataType not supported"; + } + args.push_back(data_doc); + args.push_back(LiteralDoc::DataType(stmt->dtype)); + args.push_back(d->AsDoc(stmt->extents, p->Attr("extents"))); + ExprDoc rhs = TIR(d)->Attr("allocate_const")->Call(args, kwargs_keys, kwargs_values); + With f(d, stmt); + ExprDoc lhs = DefineVar(stmt->buffer_var, *f, d); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); + }); + +ExprDoc DocsifyBufferRealize(const tir::BufferRealizeNode* stmt, Optional value, // + ObjectPath p, IRDocsifier d) { + ExprDoc buffer = d->AsDoc(stmt->buffer, p->Attr("buffer")); + { + Array bounds; + bounds.reserve(stmt->bounds.size()); + for (int i = 0, n = stmt->bounds.size(); i < n; ++i) { + Range range = stmt->bounds[i]; + ObjectPath range_p = p->Attr("bounds")->ArrayIndex(i); + bounds.push_back( + SliceDoc(d->AsDoc(range->min, range_p->Attr("min")), + d->AsDoc(range->min + range->extent, range_p->Attr("extent")), // + NullOpt)); + } + buffer = buffer[bounds]; + } + Array args{buffer}; + Array kwargs_keys; + Array kwargs_values; + if (value.defined()) { + args.push_back(value.value()); + } + if (!tir::is_one(stmt->condition)) { + kwargs_keys.push_back("condition"); + kwargs_values.push_back(d->AsDoc(stmt->condition, p->Attr("condition"))); + } + return TIR(d)->Attr("realize")->Call(args, kwargs_keys, kwargs_values); +} + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::BufferRealize stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + ExprDoc rhs = DocsifyBufferRealize(stmt.get(), NullOpt, p, d); + With f(d, stmt); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return DoConciseScoping(NullOpt, rhs, &(*f)->stmts, concise); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::AttrStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { + bool concise = AllowConciseScoping(d); + Optional rhs = NullOpt; + tir::Stmt body = stmt->body; + ObjectPath body_p = p->Attr("body"); + if (stmt->attr_key == "realize_scope") { + if (const auto* realize = stmt->body.as()) { + if (realize->buffer.same_as(stmt->node)) { + rhs = + DocsifyBufferRealize(realize, + /*value=*/d->AsDoc(stmt->value, p->Attr("value")), + /*p=*/p->Attr("body"), d); + body = realize->body; + body_p = body_p->Attr("body"); + } + } + } + if (stmt->attr_key == "thread_extent" || stmt->attr_key == "virtual_thread") { + if (const auto* iter_var = stmt->node.as()) { + if (!d->IsVarDefined(iter_var->var)) { + // `DefineVar` is not used here because a more specific name is desirable + Frame f = FindLowestVarDef(iter_var->var, d).value(); + DefineVar(iter_var->var, f, d); + f->stmts.push_back( + AssignDoc(d->AsDoc(iter_var->var, p->Attr("node")->Attr("var")), + TIR(d) // + ->Attr("env_thread") + 
->Call({LiteralDoc::Str(iter_var->thread_tag)}), // + NullOpt)); + } + rhs = TIR(d) + ->Attr("launch_thread") + ->Call({ + d->AsDoc(iter_var->var, p->Attr("node")), + d->AsDoc(stmt->value, p->Attr("value")), + }); + } + } + if (!rhs.defined()) { + rhs = TIR(d)->Attr("attr")->Call({ + d->AsDoc(stmt->node, p->Attr("node")), + LiteralDoc::Str(stmt->attr_key), + d->AsDoc(stmt->value, p->Attr("value")), + }); + } + With f(d, stmt); + AsDocBody(body, body_p, f->get(), d); + return DoConciseScoping(NullOpt, rhs.value(), &(*f)->stmts, concise); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::ProducerRealize stmt, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: ProducerRealize should never exist in TIR: " << stmt; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::ProducerStore stmt, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: ProducerStore should never exist in TIR: " << stmt; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::Store stmt, ObjectPath p, IRDocsifier d) -> Doc { + LOG(FATAL) << "ValueError: Store has been deprecated for BufferStore: " << stmt; + }); + +} // namespace printer +} // namespace script +} // namespace tvm diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h new file mode 100644 index 000000000000..6cae378d0e69 --- /dev/null +++ b/src/script/printer/tir/utils.h @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_PRINTER_TIR_UTILS_H_ +#define TVM_SCRIPT_PRINTER_TIR_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace tvm { +namespace script { +namespace printer { + +/*! \brief A printer frame for TIR fragment */ +class TIRFrameNode : public FrameNode { + public: + /*! \brief The TIR fragment the frame corresponds to */ + ObjectRef tir; + /*! \brief Whether or not the frame allows concise scoping */ + bool allow_concise_scoping{false}; + + void VisitAttrs(AttrVisitor* v) { + FrameNode::VisitAttrs(v); + v->Visit("tir", &tir); + v->Visit("allow_concise_scoping", &allow_concise_scoping); + } + + static constexpr const char* _type_key = "script.printer.TIRFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(TIRFrameNode, FrameNode); +}; + +/*! \brief Managed reference to TIRFrameNode */ +class TIRFrame : public Frame { + public: + /*! \brief Constructor */ + explicit TIRFrame(const IRDocsifier& d, const ObjectRef& tir) { + ObjectPtr n = make_object(); + n->stmts.clear(); + n->d = d.get(); + n->tir = tir; + data_ = std::move(n); + } + + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TIRFrame, Frame, TIRFrameNode); +}; + +/*! 
\brief Creates the TIR common prefix, which is by default `T` */
+inline IdDoc TIR(const IRDocsifier& d) {  //
+  return IdDoc(d->ir_prefix.Get("tir").value_or("T"));
+}
+
+/*!
+ * \brief Defines a variable in the IRDocsifier at the given frame,
+ * and returns the corresponding IdDoc
+ * \param var The variable to define
+ * \param d The IRDocsifier
+ * \param frame The frame to define the variable in
+ * \return The IdDoc corresponding to the variable
+ */
+inline IdDoc DefineVar(const tir::Var& var, const Frame& frame, const IRDocsifier& d) {
+  return d->Define(var, frame, var->name_hint.empty() ? "v" : var->name_hint);
+}
+
+/*!
+ * \brief Defines a buffer in the IRDocsifier at the given frame,
+ * and returns the corresponding IdDoc
+ * \param buffer The buffer to define
+ * \param frame The frame to define the buffer in
+ * \param d The IRDocsifier
+ * \return The IdDoc corresponding to the buffer
+ */
+inline IdDoc DefineBuffer(const tir::Buffer& buffer, const Frame& frame, const IRDocsifier& d) {
+  return d->Define(buffer, frame, buffer->name.empty() ? "buffer" : buffer->name);
+}
+
+/*!
+ * \brief Recursively process the body statements of a TIR fragment represented by a frame
+ * \param stmt The body statement to process
+ * \param p The object path
+ * \param f The frame
+ * \param d The IRDocsifier
+ */
+inline void AsDocBody(const tir::Stmt& stmt, ObjectPath p, TIRFrameNode* f, const IRDocsifier& d) {
+  if (const auto* seq_stmt = stmt.as<tir::SeqStmtNode>()) {
+    Array<tir::Stmt> body = seq_stmt->seq;
+    p = p->Attr("seq");
+    for (int i = 0, n = body.size(); i < n; ++i) {
+      f->allow_concise_scoping = (i == n - 1);
+      Doc doc = d->AsDoc<Doc>(body[i], p->ArrayIndex(i));
+      if (const auto* block = doc.as<StmtBlockDocNode>()) {
+        f->stmts.insert(f->stmts.end(), block->stmts.begin(), block->stmts.end());
+      } else {
+        f->stmts.push_back(Downcast<StmtDoc>(doc));
+      }
+    }
+  } else {
+    f->allow_concise_scoping = true;
+    Doc doc = d->AsDoc<Doc>(stmt, p);
+    if (const auto* block = doc.as<StmtBlockDocNode>()) {
+      f->stmts.insert(f->stmts.end(), block->stmts.begin(), block->stmts.end());
+    } else {
+      f->stmts.push_back(Downcast<StmtDoc>(doc));
+    }
+  }
+}
+
+/*!
+ * \brief Find the top frame in the stack that could place a var definition
+ * \param var The var to be defined
+ * \param d The IRDocsifier
+ * \return The frame that could place the var definition
+ */
+inline Optional<Frame> FindLowestVarDef(const ObjectRef& var, const IRDocsifier& d) {
+  if (!d->common_prefix.count(var.get())) {
+    return NullOpt;
+  }
+  int n_frames = d->frames.size();
+  std::unordered_map<const Object*, const TIRFrameNode*> tir_to_frame;
+  tir_to_frame.reserve(n_frames);
+  for (int i = n_frames - 1; i >= 0; --i) {
+    if (const auto* f = d->frames[i].as<TIRFrameNode>()) {
+      tir_to_frame[f->tir.get()] = f;
+    }
+  }
+  const std::vector<const Object*>& path = d->common_prefix.at(var.get());
+  for (auto it = path.rbegin(); it != path.rend(); ++it) {
+    if (tir_to_frame.count(*it)) {
+      return GetRef<TIRFrame>(tir_to_frame.at(*it));
+    }
+  }
+  return NullOpt;
+}
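+
As a hand-written rendering example (not generated output): because `AsDocBody` enables `allow_concise_scoping` only for the last statement of a sequence, a trailing `T.allocate` prints in the concise assignment form while a non-final one keeps the scoped `with` form, roughly:

    A_data = T.allocate([128], "float32", "global")          # last statement: concise
    with T.allocate([128], "float32", "global") as B_data:   # non-final: scoped
        ...

+
+/*!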
+ * \brief Declare and define a buffer + * \param buffer The buffer to be defined + * \param method The method used to declare the buffer + * \param args The extra arguments used to declare the buffer + * \param p The object path + * \param f The frame + * \param d The IRDocsifier + * \return The ExprDoc corresponding to the buffer declaration + */ +ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array& args, + const ObjectPath& p, const Frame& frame, const IRDocsifier& d); + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_TIR_UTILS_H_ diff --git a/src/script/printer/traced_object_functor.cc b/src/script/printer/traced_object_functor.cc deleted file mode 100644 index 43160c7f4be4..000000000000 --- a/src/script/printer/traced_object_functor.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include - -namespace tvm { -namespace script { -namespace printer { - -const runtime::PackedFunc* GetDispatchFunctionForToken(const DispatchTable& table, - const String& token, uint32_t type_index) { - auto it = table.find(token); - if (it == table.end()) { - return nullptr; - } - const std::vector& tab = it->second; - if (type_index >= tab.size()) { - return nullptr; - } - const PackedFunc* f = &tab[type_index]; - if (f->defined()) { - return f; - } else { - return nullptr; - } -} - -const runtime::PackedFunc& GetDispatchFunction(const DispatchTable& dispatch_table, - const String& token, uint32_t type_index) { - if (const runtime::PackedFunc* pf = - GetDispatchFunctionForToken(dispatch_table, token, type_index)) { - return *pf; - } else if (const runtime::PackedFunc* pf = - GetDispatchFunctionForToken(dispatch_table, kDefaultDispatchToken, type_index)) { - // Fallback to function with the default dispatch token - return *pf; - } else { - ICHECK(false) << "ObjectFunctor calls un-registered function on type: " - << runtime::Object::TypeIndex2Key(type_index) << " (token: " << token << ")"; - throw; - } -} - -void SetDispatchFunction(DispatchTable* dispatch_table, const String& token, uint32_t type_index, - runtime::PackedFunc f) { - std::vector* table = &(*dispatch_table)[token]; - if (table->size() <= type_index) { - table->resize(type_index + 1, nullptr); - } - runtime::PackedFunc& slot = (*table)[type_index]; - if (slot != nullptr) { - ICHECK(false) << "Dispatch for type is already registered: " - << runtime::Object::TypeIndex2Key(type_index); - } - slot = f; -} - -void RemoveDispatchFunction(DispatchTable* dispatch_table, const String& token, - uint32_t type_index) { - std::vector* table = &(*dispatch_table)[token]; - if (table->size() <= type_index) { - return; - } - (*table)[type_index] = nullptr; -} - -} // namespace printer -} // 
namespace script -} // namespace tvm diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h deleted file mode 100644 index abe7ce5e9a88..000000000000 --- a/src/script/printer/utils.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_UTILS_H_ -#define TVM_SCRIPT_PRINTER_UTILS_H_ - -#include -#include - -#include - -namespace tvm { -namespace script { -namespace printer { - -template -Array AsDocArray(const TracedArray& refs, const IRDocsifier& ir_docsifier) { - Array result; - for (auto ref : refs) { - result.push_back(ir_docsifier->AsExprDoc(ref)); - } - return result; -} - -template -Array AsDocArray(std::initializer_list&& refs, const IRDocsifier& ir_docsifier) { - Array result; - for (auto& ref : refs) { - result.push_back(ir_docsifier->AsExprDoc(ref)); - } - return result; -} - -template -Array AsExprDocArray(const TracedArray& refs, const IRDocsifier& ir_docsifier) { - return AsDocArray(refs, ir_docsifier); -} - -template -Array AsExprDocArray(std::initializer_list&& refs, - const IRDocsifier& ir_docsifier) { - return AsDocArray(std::move(refs), ir_docsifier); -} - -inline DictDoc AsDictDoc(const TracedMap& dict, - const IRDocsifier& ir_docsifier) { - Array keys; - Array values; - - for (auto p : dict) { - keys.push_back(LiteralDoc::Str(p.first)); - values.push_back(ir_docsifier->AsExprDoc(p.second)); - } - - auto doc = DictDoc(keys, values); - doc->source_paths.push_back(dict.GetPath()); - return doc; -} - -template -inline ListDoc AsListDoc(const TracedArray& arr, const IRDocsifier& ir_docsifier) { - auto ret = ListDoc(AsExprDocArray(arr, ir_docsifier)); - ret->source_paths.push_back(arr.GetPath()); - return ret; -} - -template -inline TupleDoc AsTupleDoc(const TracedArray& arr, const IRDocsifier& ir_docsifier) { - auto ret = TupleDoc(AsExprDocArray(arr, ir_docsifier)); - ret->source_paths.push_back(arr.GetPath()); - return ret; -} - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_UTILS_H_ diff --git a/src/script/printer/var_table.cc b/src/script/printer/var_table.cc deleted file mode 100644 index 62d8b2f66cc2..000000000000 --- a/src/script/printer/var_table.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -String GenerateUniqueName(const String& name_hint, std::unordered_set* defined_names) { - String name = name_hint; - for (int i = 1; !defined_names->insert(name).second; ++i) { - name = name_hint + "_" + std::to_string(i); - } - return name; -} - -IdDoc VarTableNode::Define(const ObjectRef& obj, const String& name_hint, - const ObjectPath& object_path, const Frame& frame) { - String name = GenerateUniqueName(name_hint, &this->defined_names); - DocFactory doc_factory = [name]() { return IdDoc(name); }; - - auto result = obj2info.insert({obj, VariableInfo{std::move(doc_factory), name}}); - ICHECK(result.second) << "Duplicated object: " << obj; - - IdDoc def_doc(name); - def_doc->source_paths.push_back(object_path); - - frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); }); - - return def_doc; -} - -void VarTableNode::DefineByDoc(const ObjectRef& obj, DocFactory doc_factory, const Frame& frame) { - ICHECK(obj2info.find(obj) == obj2info.end()) << "Duplicated object: " << obj; - - ICHECK(!doc_factory()->IsInstance()) - << "VarTableNode::Define cannot be used for variable that's mapped to IdDoc."; - - obj2info.insert({obj, VariableInfo{std::move(doc_factory), NullOpt}}); - - frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); }); -} - -Optional VarTableNode::GetVarDoc(const ObjectRef& obj, - const ObjectPath& object_path) const { - auto it = obj2info.find(obj); - if (it == obj2info.end()) { - return NullOpt; - } - ExprDoc doc = it->second.doc_factory(); - doc->source_paths.push_back(object_path); - return doc; -} - -bool VarTableNode::IsVarDefined(const ObjectRef& obj) const { return obj2info.count(obj); } - -void VarTableNode::RemoveVar(const ObjectRef& obj) { - auto it = obj2info.find(obj); - ICHECK(it != obj2info.end()) << "No such object: " << obj; - - if (it->second.name.defined()) { - defined_names.erase(it->second.name.value()); - } - obj2info.erase(it); -} - -VarTable::VarTable() { data_ = make_object(); } - -TVM_REGISTER_NODE_TYPE(VarTableNode); -TVM_REGISTER_GLOBAL("script.printer.VarTable").set_body_typed([]() { return VarTable(); }); -TVM_REGISTER_GLOBAL("script.printer.VarTableDefine") - .set_body_method(&VarTableNode::Define); -TVM_REGISTER_GLOBAL("script.printer.VarTableDefineByDoc") - .set_body_typed([](VarTable var_table, const ObjectRef& obj, runtime::PackedFunc factory, - Frame frame) { - var_table->DefineByDoc( - obj, [f = std::move(factory)]() { return f(); }, frame); - }); -TVM_REGISTER_GLOBAL("script.printer.VarTableGetVarDoc") - .set_body_method, const ObjectRef&, - const ObjectPath&>(&VarTableNode::GetVarDoc); -TVM_REGISTER_GLOBAL("script.printer.VarTableIsVarDefined") - .set_body_method(&VarTableNode::IsVarDefined); - -} // namespace printer -} // namespace script -} // namespace tvm diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 43d50306be45..1f4962489328 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -1089,7 +1089,7 @@ PrimExpr TypeAnnotation(DataType dtype, Span span) { return 
tir::Call(dtype, op, {}, span); } -TVM_REGISTER_OP("tir.type_annotation") +TVM_TIR_REGISTER_OP("type_annotation") .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)); } // namespace tir diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 56ecba9e9ed9..dc3208f484e3 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -36,7 +36,7 @@ namespace builtin { static const Op& op = Op::Get("tir." #OpName); \ return op; \ } \ - TVM_REGISTER_OP("tir." #OpName) + TVM_TIR_REGISTER_OP(#OpName) TIR_DEFINE_BUILTIN_FUNC(reinterpret) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)) @@ -181,10 +181,12 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_stack_make_array) // When num_inputs are not set, the function is assumed to be variable length. TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed) - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TScriptPrinterName", String("call_packed"), /*plevel=*/20); TIR_DEFINE_BUILTIN_FUNC(tvm_call_cpacked) - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TScriptPrinterName", String("call_cpacked"), /*plevel=*/20); TIR_DEFINE_BUILTIN_FUNC(tvm_call_trace_packed) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); @@ -198,10 +200,14 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_thread_context) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed_lowered) - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TScriptPrinterName", String("call_packed_lowered"), + /*plevel=*/20); TIR_DEFINE_BUILTIN_FUNC(tvm_call_cpacked_lowered) - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)) + .set_attr("TScriptPrinterName", String("call_cpacked_lowered"), + /*plevel=*/20); TIR_DEFINE_BUILTIN_FUNC(tvm_call_trace_packed_lowered) .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index 044d8fd08da5..078e32ca57c7 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -39,13 +39,13 @@ namespace tvm { using namespace tir; // macro to register an unary op -#define TIR_REGISTER_PURE_UNARY_OP(OpName) \ - TVM_REGISTER_OP(OpName).set_num_inputs(1).set_attr( \ +#define TVM_TIR_REGISTER_PURE_UNARY_OP(OpName) \ + TVM_TIR_REGISTER_OP(OpName).set_num_inputs(1).set_attr( \ "TCallEffectKind", Integer(CallEffectKind::kPure)) // macro to register an binary op -#define TIR_REGISTER_PURE_BINARY_OP(OpName) \ - TVM_REGISTER_OP(OpName).set_num_inputs(2).set_attr( \ +#define TVM_TIR_REGISTER_PURE_BINARY_OP(OpName) \ + TVM_TIR_REGISTER_OP(OpName).set_num_inputs(2).set_attr( \ "TCallEffectKind", Integer(CallEffectKind::kPure)) runtime::DataType GetRuntimeDataType(const Type& type) { @@ -657,7 +657,7 @@ PrimExpr pow(PrimExpr x, PrimExpr y, Span span) { return tir::Call(x.dtype(), op, {x, y}, span); } -TIR_REGISTER_PURE_BINARY_OP("tir.pow").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_BINARY_OP("pow").set_attr("TVectorizable", true); // abs PrimExpr abs(PrimExpr x, Span span) { @@ -685,7 +685,7 @@ PrimExpr abs(PrimExpr x, Span span) { } } -TIR_REGISTER_PURE_UNARY_OP("tir.fabs").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("fabs").set_attr("TVectorizable", true); // isnan PrimExpr isnan(PrimExpr x, Span span) { 
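The `TScriptPrinterName` attributes registered in builtin.cc above let the printer render packed-call intrinsics under their script-facing names (e.g. `T.call_packed`). A sketch of the lookup, assuming TVM's standard `Op::GetAttrMap` API rather than any helper defined in this patch:

    #include <tvm/ir/op.h>

    // Resolve the script-facing name of an op, falling back to the op's
    // registered name when no TScriptPrinterName attribute exists.
    tvm::String ScriptName(const tvm::Op& op) {
      static auto name_map = tvm::Op::GetAttrMap<tvm::String>("TScriptPrinterName");
      // e.g. "call_packed" for tir.tvm_call_packed; otherwise the raw op name.
      return name_map.count(op) ? name_map[op] : op->name;
    }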
@@ -783,7 +783,7 @@ PrimExpr fmod(PrimExpr x, PrimExpr y, Span span) { return tir::Call(x.dtype(), op, {x, y}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.fmod"); +TVM_TIR_REGISTER_PURE_UNARY_OP("fmod"); // floor PrimExpr floor(PrimExpr x, Span span) { @@ -797,7 +797,7 @@ PrimExpr floor(PrimExpr x, Span span) { return tir::Call(x.dtype(), op, {x}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.floor").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("floor").set_attr("TVectorizable", true); // ceil PrimExpr ceil(PrimExpr x, Span span) { @@ -811,7 +811,7 @@ PrimExpr ceil(PrimExpr x, Span span) { return tir::Call(x.dtype(), op, {x}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.ceil").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("ceil").set_attr("TVectorizable", true); // round PrimExpr round(PrimExpr x, Span span) { @@ -825,7 +825,7 @@ PrimExpr round(PrimExpr x, Span span) { return tir::Call(x.dtype(), op, {x}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.round").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("round").set_attr("TVectorizable", true); // nearbyint PrimExpr nearbyint(PrimExpr x, Span span) { @@ -839,7 +839,7 @@ PrimExpr nearbyint(PrimExpr x, Span span) { return tir::Call(x.dtype(), op, {x}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.nearbyint"); +TVM_TIR_REGISTER_PURE_UNARY_OP("nearbyint"); // trunc PrimExpr trunc(PrimExpr x, Span span) { @@ -856,67 +856,77 @@ PrimExpr trunc(PrimExpr x, Span span) { return tir::Call(x.dtype(), op, {x}, span); } -TIR_REGISTER_PURE_UNARY_OP("tir.trunc").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("trunc").set_attr("TVectorizable", true); // unary op registration. -TIR_REGISTER_PURE_UNARY_OP("tir.exp").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("exp").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.exp2").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("exp2").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.exp10").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("exp10").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.erf"); +TVM_TIR_REGISTER_PURE_UNARY_OP("erf"); -TIR_REGISTER_PURE_UNARY_OP("tir.tanh").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("tanh").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.sigmoid").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("sigmoid").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.sqrt").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("sqrt").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.rsqrt"); +TVM_TIR_REGISTER_PURE_UNARY_OP("rsqrt"); -TIR_REGISTER_PURE_UNARY_OP("tir.log").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("log").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.log2").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("log2").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.log1p"); +TVM_TIR_REGISTER_PURE_UNARY_OP("log1p"); -TIR_REGISTER_PURE_UNARY_OP("tir.log10").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("log10").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.tan").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("tan").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.cos").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("cos").set_attr("TVectorizable", true); 
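These renames all funnel through `TVM_TIR_REGISTER_OP`, whose definition lives outside this hunk: call sites now pass the bare op name and the macro supplies the `tir.` prefix. A plausible expansion, stated as an assumption rather than the patch's actual definition (`TScriptPrinterName` assumed to alias `String`):

    // Assumed shape of the macro: register under the "tir." namespace and
    // default the script-printer name to the bare op name.
    #define TVM_TIR_REGISTER_OP(OpName)            \
      TVM_REGISTER_OP("tir." OpName).set_attr<TScriptPrinterName>( \
          "TScriptPrinterName", String(OpName))

Under that reading, the explicit plevel-20 `TScriptPrinterName` settings in builtin.cc above override the macro's default where the printed name differs from the op name.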
-TIR_REGISTER_PURE_UNARY_OP("tir.cosh").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("cosh").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.sin").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("sin").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.sinh").set_attr("TVectorizable", true); +TVM_TIR_REGISTER_PURE_UNARY_OP("sinh").set_attr("TVectorizable", true); -TIR_REGISTER_PURE_UNARY_OP("tir.asin"); +TVM_TIR_REGISTER_PURE_UNARY_OP("asin"); -TIR_REGISTER_PURE_UNARY_OP("tir.acos"); +TVM_TIR_REGISTER_PURE_UNARY_OP("acos"); -TIR_REGISTER_PURE_UNARY_OP("tir.atan"); +TVM_TIR_REGISTER_PURE_UNARY_OP("atan"); -TIR_REGISTER_PURE_UNARY_OP("tir.acosh"); +TVM_TIR_REGISTER_PURE_UNARY_OP("acosh"); -TIR_REGISTER_PURE_UNARY_OP("tir.asinh"); +TVM_TIR_REGISTER_PURE_UNARY_OP("asinh"); -TIR_REGISTER_PURE_UNARY_OP("tir.atanh"); +TVM_TIR_REGISTER_PURE_UNARY_OP("atanh"); -TIR_REGISTER_PURE_UNARY_OP("tir.clz"); +TVM_TIR_REGISTER_PURE_UNARY_OP("clz"); // binary intrinsics -TIR_REGISTER_PURE_BINARY_OP("tir.atan2"); +TVM_TIR_REGISTER_PURE_BINARY_OP("atan2"); -TIR_REGISTER_PURE_BINARY_OP("tir.nextafter"); +TVM_TIR_REGISTER_PURE_BINARY_OP("nextafter"); -TIR_REGISTER_PURE_BINARY_OP("tir.hypot"); +TVM_TIR_REGISTER_PURE_BINARY_OP("hypot"); -TIR_REGISTER_PURE_BINARY_OP("tir.copysign"); +TVM_TIR_REGISTER_PURE_BINARY_OP("copysign"); -TIR_REGISTER_PURE_BINARY_OP("tir.ldexp"); +TVM_TIR_REGISTER_PURE_BINARY_OP("ldexp"); + +TVM_TIR_REGISTER_OP("TVMBackendAllocWorkspace") + .set_num_inputs(5) + .set_attr("TGlobalSymbol", "TVMBackendAllocWorkspace") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); + +TVM_TIR_REGISTER_OP("TVMBackendFreeWorkspace") + .set_num_inputs(3) + .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") + .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); // expose basic functions to node namespace TVM_REGISTER_GLOBAL("node._const").set_body([](TVMArgs args, TVMRetValue* ret) { diff --git a/src/tir/op/runtime.cc b/src/tir/op/runtime.cc deleted file mode 100644 index adabae9e75f7..000000000000 --- a/src/tir/op/runtime.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tir/op/runtime.cc - * \brief TIR ops for runtime functions. 
- */ -#include -#include - -namespace tvm { -namespace tir { - -TVM_REGISTER_OP("tir.TVMBackendAllocWorkspace") - .set_num_inputs(5) - .set_attr("TGlobalSymbol", "TVMBackendAllocWorkspace") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - -TVM_REGISTER_OP("tir.TVMBackendFreeWorkspace") - .set_num_inputs(3) - .set_attr("TGlobalSymbol", "TVMBackendFreeWorkspace") - .set_attr("TCallEffectKind", Integer(CallEffectKind::kOpaque)); - -} // namespace tir -} // namespace tvm diff --git a/tests/cpp/traced_object_test.cc b/tests/cpp/traced_object_test.cc deleted file mode 100644 index 7890a67eef95..000000000000 --- a/tests/cpp/traced_object_test.cc +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include -#include - -using namespace tvm; - -namespace { - -class DummyObjectNode : public Object { - public: - void VisitAttrs(AttrVisitor* v) {} - - static constexpr const char* _type_key = "TracedObjectTestDummyObject"; - TVM_DECLARE_FINAL_OBJECT_INFO(DummyObjectNode, Object); -}; - -class DummyObject : public ObjectRef { - public: - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(DummyObject, ObjectRef, DummyObjectNode); -}; - -TVM_REGISTER_NODE_TYPE(DummyObjectNode); - -class ObjectWithAttrsNode : public Object { - public: - int64_t int64_attr = 5; - Map map_attr; - Array array_attr; - DummyObject obj_attr; - - ObjectWithAttrsNode() : obj_attr(make_object()) {} - - void VisitAttrs(AttrVisitor* v) { - v->Visit("int64_attr", &int64_attr); - v->Visit("map_attr", &map_attr); - v->Visit("array_attr", &array_attr); - v->Visit("obj_attr", &obj_attr); - } - - static constexpr const char* _type_key = "TracedObjectTestObjectWithAttrs"; - TVM_DECLARE_FINAL_OBJECT_INFO(ObjectWithAttrsNode, Object); -}; - -class ObjectWithAttrs : public ObjectRef { - public: - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ObjectWithAttrs, ObjectRef, ObjectWithAttrsNode); -}; - -TVM_REGISTER_NODE_TYPE(ObjectWithAttrsNode); - -} // anonymous namespace - -TEST(TracedObjectTest, MakeTraced_RootObject) { - ObjectWithAttrs root(make_object()); - auto root_traced = MakeTraced(root); - - static_assert(std::is_same>::value); - ICHECK(root_traced.GetPath()->PathsEqual(ObjectPath::Root())); - ICHECK_EQ(root_traced.Get().get(), root.get()); -} - -TEST(TracedObjectTest, MakeTraced_WithPath) { - ObjectWithAttrs obj(make_object()); - auto traced = MakeTraced(obj, ObjectPath::Root()->Attr("foo")); - - static_assert(std::is_same>::value); - ICHECK(traced.GetPath()->PathsEqual(ObjectPath::Root()->Attr("foo"))); - ICHECK_EQ(traced.Get().get(), obj.get()); -} - -TEST(TracedObjectTest, TracedObject_ImplicitConversionFromDerived) { - DummyObject obj(make_object()); - auto traced = MakeTraced(obj); - 
static_assert(std::is_same>::value); - - // Check that TracedObject is implicitly converted to TracedObject - auto base_traced = [](const TracedObject& base) { return base; }(traced); - - static_assert(std::is_same>::value); -} - -TEST(TracedObjectTest, TracedObject_GetAttr_ObjectRef) { - ObjectWithAttrs root(make_object()); - auto root_traced = MakeTraced(root); - auto obj_attr = root_traced.GetAttr(&ObjectWithAttrsNode::obj_attr); - static_assert(std::is_same>::value); - ICHECK(obj_attr.GetPath()->PathsEqual(ObjectPath::Root()->Attr("obj_attr"))); - ICHECK_EQ(obj_attr.Get().get(), root->obj_attr.get()); -} - -TEST(TracedObjectTest, TracedObject_GetAttr_Map) { - ObjectWithAttrs root(make_object()); - root->map_attr.Set("foo", "bar"); - - auto root_traced = MakeTraced(root); - auto map_attr = root_traced.GetAttr(&ObjectWithAttrsNode::map_attr); - static_assert(std::is_same>::value); - ICHECK(map_attr.GetPath()->PathsEqual(ObjectPath::Root()->Attr("map_attr"))); - ICHECK_EQ(map_attr.Get().get(), root->map_attr.get()); - - auto map_val = map_attr.at("foo"); - ICHECK_EQ(map_val.Get(), "bar"); - ICHECK( - map_val.GetPath()->PathsEqual(ObjectPath::Root()->Attr("map_attr")->MapValue(String("foo")))); -} - -TEST(TracedObjectTest, TracedObject_GetAttr_Array) { - ObjectWithAttrs root(make_object()); - root->array_attr.push_back("foo"); - root->array_attr.push_back("bar"); - - auto root_traced = MakeTraced(root); - auto array_attr = root_traced.GetAttr(&ObjectWithAttrsNode::array_attr); - static_assert(std::is_same>::value); - ICHECK(array_attr.GetPath()->PathsEqual(ObjectPath::Root()->Attr("array_attr"))); - ICHECK_EQ(array_attr.Get().get(), root->array_attr.get()); - - auto array_val = array_attr[1]; - ICHECK_EQ(array_val.Get(), "bar"); - ICHECK(array_val.GetPath()->PathsEqual(ObjectPath::Root()->Attr("array_attr")->ArrayIndex(1))); -} - -TEST(TracedObjectTest, TracedObject_GetAttr_Int64) { - ObjectWithAttrs root(make_object()); - auto root_traced = MakeTraced(root); - - auto int64_attr = root_traced.GetAttr(&ObjectWithAttrsNode::int64_attr); - static_assert(std::is_same>::value); - ICHECK_EQ(int64_attr.Get(), 5); - ICHECK(int64_attr.GetPath()->PathsEqual(ObjectPath::Root()->Attr("int64_attr"))); -} - -TEST(TracedObjectTest, TracedObject_IsInstance) { - ObjectRef dummy(make_object()); - auto traced = MakeTraced(dummy); - ICHECK(traced.IsInstance()); - ICHECK(!traced.IsInstance()); -} - -TEST(TracedObjectTest, TracedObject_Downcast) { - ObjectRef root(make_object()); - auto traced = MakeTraced(root); - - auto as_dummy = traced.Downcast(); - static_assert(std::is_same>::value); - ICHECK_EQ(as_dummy.Get(), root); - - // Try downcasting to a wrong type - bool caught = false; - try { - traced.Downcast(); - } catch (std::exception& e) { - caught = strstr(e.what(), - "Downcast from TracedObjectTestDummyObject to TracedObjectTestObjectWithAttrs " - "failed") != nullptr; - } - ICHECK(caught); -} - -TEST(TracedObjectTest, TracedObject_TryDowncast) { - ObjectRef root(make_object()); - auto traced = MakeTraced(root); - - auto as_dummy = traced.TryDowncast(); - static_assert(std::is_same>::value); - ICHECK(as_dummy.defined()); - ICHECK_EQ(as_dummy.value().Get(), root); - - // Try downcasting to a wrong type - ICHECK(!traced.TryDowncast().defined()); -} - -TEST(TracedObjectTest, TracedMap_At) { - Map m({{"k1", "foo"}, {"k2", "bar"}}); - auto traced = MakeTraced(m); - - auto traced_foo = traced.at("k1"); - static_assert(std::is_same>::value); - ICHECK_EQ(traced_foo.Get(), "foo"); - 
ICHECK(traced_foo.GetPath()->PathsEqual(ObjectPath::Root()->MapValue(String("k1")))); -} - -TEST(TracedObjectTest, TracedMap_Iterator) { - Map m({{"k1", "foo"}, {"k2", "bar"}}); - auto traced = MakeTraced(m); - - size_t k1_count = 0; - size_t k2_count = 0; - - for (const auto& kv : traced) { - if (kv.first == "k1") { - ++k1_count; - ICHECK_EQ(kv.second.Get(), "foo"); - ICHECK(kv.second.GetPath()->PathsEqual(ObjectPath::Root()->MapValue(String("k1")))); - } else if (kv.first == "k2") { - ++k2_count; - ICHECK_EQ(kv.second.Get(), "bar"); - ICHECK(kv.second.GetPath()->PathsEqual(ObjectPath::Root()->MapValue(String("k2")))); - } else { - ICHECK(false); - } - } - - ICHECK_EQ(k1_count, 1); - ICHECK_EQ(k2_count, 1); -} - -TEST(TracedObjectTest, TracedArray_Index) { - Array a = {"foo", "bar"}; - auto traced = MakeTraced(a); - - auto traced_bar = traced[1]; - static_assert(std::is_same>::value); - ICHECK_EQ(traced_bar.Get(), "bar"); - ICHECK(traced_bar.GetPath()->PathsEqual(ObjectPath::Root()->ArrayIndex(1))); -} - -TEST(TracedObjectTest, TracedArray_Iterator) { - Array a = {"foo", "bar"}; - auto traced = MakeTraced(a); - - size_t index = 0; - for (const auto& x : traced) { - if (index == 0) { - ICHECK_EQ(x.Get(), "foo"); - ICHECK(x.GetPath()->PathsEqual(ObjectPath::Root()->ArrayIndex(0))); - } else if (index == 1) { - ICHECK_EQ(x.Get(), "bar"); - ICHECK(x.GetPath()->PathsEqual(ObjectPath::Root()->ArrayIndex(1))); - } else { - ICHECK(false); - } - ++index; - } - - ICHECK_EQ(index, 2); -} - -TEST(TracedObjectTest, TracedBasicValue_ApplyFunc) { - auto traced = MakeTraced(123, ObjectPath::Root()->Attr("foo")); - static_assert(std::is_same>::value); - - auto transformed = traced.ApplyFunc([](int x) { return x + 4.0; }); - static_assert(std::is_same>::value); - - ICHECK(transformed.GetPath()->PathsEqual(ObjectPath::Root()->Attr("foo"))); -} diff --git a/tests/cpp/tvmscript_printer_irdocsifier_test.cc b/tests/cpp/tvmscript_printer_irdocsifier_test.cc deleted file mode 100644 index 8c68399df222..000000000000 --- a/tests/cpp/tvmscript_printer_irdocsifier_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -using namespace tvm; -using namespace tvm::script::printer; - -class TestObjectNode : public Object { - public: - void VisitAttrs(AttrVisitor* v) {} - - static constexpr const char* _type_key = "test.script.printer.irdocsifier.TestObject"; - TVM_DECLARE_FINAL_OBJECT_INFO(TestObjectNode, Object); -}; - -class TestObject : public ObjectRef { - public: - TestObject() : ObjectRef(runtime::make_object()) {} - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TestObject, ObjectRef, TestObjectNode); -}; - -TVM_REGISTER_NODE_TYPE(TestObjectNode); - -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch([](TracedObject obj, IRDocsifier p) { - return IdDoc("x"); - }); - -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("tir", [](TracedObject obj, IRDocsifier p) { - return IdDoc("tir"); - }); - -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("relax", [](TracedObject obj, IRDocsifier p) { - return IdDoc("relax"); - }); - -TEST(PrinterIRDocsifierTest, AsDoc) { - IRDocsifier p(Map{}); - ObjectPath path = ObjectPath::Root(); - TestObject obj; - - IdDoc doc = p->AsDoc(MakeTraced(obj, path)); - - ICHECK_EQ(doc->name, "x"); -} - -TEST(PrinterIRDocsifierTest, AsExprDoc) { - IRDocsifier p(Map{}); - ObjectPath path = ObjectPath::Root(); - TestObject obj; - - ExprDoc doc = p->AsExprDoc(MakeTraced(obj, path)); - - ICHECK_EQ(Downcast(doc)->name, "x"); -} - -TEST(PrinterIRDocsifierTest, WithDispatchToken) { - IRDocsifier p(Map{}); - TracedObject obj = MakeTraced(TestObject(), ObjectPath::Root()); - - ICHECK_EQ(p->AsDoc(obj)->name, "x"); - - { - auto ctx = p->WithDispatchToken("tir"); - ICHECK_EQ(p->AsDoc(obj)->name, "tir"); - - { - auto ctx = p->WithDispatchToken("relax"); - ICHECK_EQ(p->AsDoc(obj)->name, "relax"); - } - - ICHECK_EQ(p->AsDoc(obj)->name, "tir"); - } - - ICHECK_EQ(p->AsDoc(obj)->name, "x"); -} - -TEST(PrinterIRDocsifierTest, WithFrame) { - IRDocsifier p(Map{}); - TestObject obj; - - { - VarDefFrame frame; - auto ctx = p->WithFrame(frame); - ICHECK_EQ(p->frames.size(), 1); - - p->vars->Define(obj, "x", ObjectPath::Root(), frame); - ICHECK(p->vars->IsVarDefined(obj)); - } - ICHECK_EQ(p->frames.size(), 0); - ICHECK(!p->vars->IsVarDefined(obj)); -} diff --git a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc b/tests/cpp/tvmscript_printer_traced_object_functor_test.cc deleted file mode 100644 index d662ce132405..000000000000 --- a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -using namespace tvm; -using namespace tvm::script::printer; - -namespace { - -class FooObjectNode : public Object { - public: - void VisitAttrs(AttrVisitor* v) {} - - static constexpr const char* _type_key = "test.TracedObjectFunctor.FooObject"; - TVM_DECLARE_FINAL_OBJECT_INFO(FooObjectNode, Object); -}; - -class FooObject : public ObjectRef { - public: - FooObject() { this->data_ = make_object(); } - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(FooObject, ObjectRef, FooObjectNode); -}; - -TVM_REGISTER_NODE_TYPE(FooObjectNode); - -class BarObjectNode : public Object { - public: - void VisitAttrs(AttrVisitor* v) {} - - static constexpr const char* _type_key = "test.TracedObjectFunctor.BarObject"; - TVM_DECLARE_FINAL_OBJECT_INFO(BarObjectNode, Object); -}; - -class BarObject : public ObjectRef { - public: - BarObject() { this->data_ = make_object(); } - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BarObject, ObjectRef, BarObjectNode); -}; - -TVM_REGISTER_NODE_TYPE(BarObjectNode); - -String ComputeFoo(TracedObject foo) { return "Foo"; } - -} // anonymous namespace - -TEST(TracedObjectFunctorTest, NormalRegistration) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch([](TracedObject o) -> String { return "Bar"; }); - - ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); - ICHECK_EQ(functor("", MakeTraced(BarObject(), path)), "Bar"); -} - -TEST(TracedObjectFunctorTest, RegistrationWithFunction) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o) -> String { return "FooLambda"; }); - functor.set_dispatch("tir", ComputeFoo); - - ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "FooLambda"); - ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo"); -} - -TEST(TracedObjectFunctorTest, RegistrationWithDispatchToken) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch("tir", - [](TracedObject o) -> String { return "Foo tir"; }); - functor.set_dispatch("relax", - [](TracedObject o) -> String { return "Foo relax"; }); - - ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); - ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir"); - ICHECK_EQ(functor("relax", MakeTraced(FooObject(), path)), "Foo relax"); - ICHECK_EQ(functor("xyz", MakeTraced(FooObject(), path)), "Foo"); -} - -TEST(TracedObjectFunctorTest, RegistrationWithPackedFunc) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - auto f_default = [](runtime::TVMArgs, runtime::TVMRetValue* ret) { *ret = String("default"); }; - auto f_tir = [](runtime::TVMArgs, runtime::TVMRetValue* ret) { *ret = String("tir"); }; - - functor.set_dispatch("", FooObjectNode::RuntimeTypeIndex(), runtime::PackedFunc(f_default)); - functor.set_dispatch("tir", FooObjectNode::RuntimeTypeIndex(), runtime::PackedFunc(f_tir)); - - ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "default"); - ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "tir"); -} - -TEST(TracedObjectFunctorTest, ExtraArg) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o, int x) { return x; }); - functor.set_dispatch([](TracedObject o, int x) { return x + 1; }); - - 
ICHECK_EQ(functor("", MakeTraced(FooObject(), path), 2), 2); - ICHECK_EQ(functor("", MakeTraced(BarObject(), path), 2), 3); - ICHECK_EQ(functor("tir", MakeTraced(BarObject(), path), 2), 3); -} - -TEST(TracedObjectFunctorTest, RemoveDispatchFunction) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch("tir", - [](TracedObject o) -> String { return "Foo tir"; }); - - ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); - ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir"); - - functor.remove_dispatch("tir", FooObjectNode::RuntimeTypeIndex()); - ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo"); -} - -TEST(TracedObjectFunctorTest, CallWithUnregisteredType) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - bool failed = false; - try { - ICHECK_EQ(functor("", MakeTraced(FooObject(), path), 2), 2); - } catch (...) { - failed = true; - } - ASSERT_EQ(failed, true); -} - -TEST(TracedObjectFunctorTest, DuplicateRegistration_WithoutToken) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch([](TracedObject o, int x) { return x; }); - - bool failed = false; - try { - functor.set_dispatch([](TracedObject o, int x) { return x; }); - } catch (...) { - failed = true; - } - ASSERT_EQ(failed, true); -} - -TEST(TracedObjectFunctorTest, DuplicateRegistration_WithToken) { - TracedObjectFunctor functor; - ObjectPath path = ObjectPath::Root(); - - functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); - - bool failed = false; - try { - functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); - } catch (...) { - failed = true; - } - ASSERT_EQ(failed, true); -} diff --git a/tests/cpp/tvmscript_printer_var_table_test.cc b/tests/cpp/tvmscript_printer_var_table_test.cc deleted file mode 100644 index b447c81ac0b8..000000000000 --- a/tests/cpp/tvmscript_printer_var_table_test.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -using namespace tvm; -using namespace tvm::script::printer; - -TEST(PrinterVarTableTest, Define) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - - IdDoc doc = vars->Define(x, "x", object_path, frame); - - ICHECK_EQ(doc->name, "x"); - - IdDoc second_doc = Downcast(vars->GetVarDoc(x, object_path).value()); - - ICHECK_EQ(second_doc->name, "x"); -} - -TEST(PrinterVarTableTest, DefineByDoc) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - - auto doc_factory = []() { return LiteralDoc::Str("x"); }; - - vars->DefineByDoc(x, doc_factory, frame); - - ExprDoc doc = vars->GetVarDoc(x, object_path).value(); - - ICHECK_EQ(Downcast(Downcast(doc)->value), "x"); -} - -TEST(PrinterVarTableTest, GetVarDocWithUnknownVariable) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - tir::Var y("y"); - ObjectPath object_path = ObjectPath::Root(); - - Doc doc = vars->Define(x, "x", object_path, frame); - ICHECK(!vars->GetVarDoc(y, object_path).defined()); -} - -TEST(PrinterVarTableTest, GetVarDocWithObjectPath) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - ObjectPath second_object_path = ObjectPath::Root()->Attr("x"); - - IdDoc doc = vars->Define(x, "x", object_path, frame); - ICHECK_EQ(doc->source_paths[0], object_path); - ICHECK_EQ(doc->source_paths.size(), 1); - - Doc second_doc = vars->GetVarDoc(x, second_object_path).value(); - ICHECK_EQ(second_doc->source_paths[0], second_object_path); - ICHECK_EQ(second_doc->source_paths.size(), 1); -} - -TEST(PrinterVarTableTest, IsVarDefined) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - tir::Var y("y"); - ObjectPath object_path = ObjectPath::Root(); - - vars->Define(x, "x", object_path, frame); - ICHECK(vars->IsVarDefined(x)); - ICHECK(!vars->IsVarDefined(y)); -} - -TEST(PrinterVarTableTest, VarRemovedAfterFrameOutOfScope) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - - vars->Define(x, "x", object_path, frame); - ICHECK(vars->IsVarDefined(x)); - - frame->ExitWithScope(); - ICHECK(!vars->IsVarDefined(x)); -} - -TEST(PrinterVarTableTest, DefineDuplicateName) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - tir::Var y("y"); - ObjectPath object_path = ObjectPath::Root(); - - IdDoc x_doc = vars->Define(x, "x", object_path, frame); - IdDoc y_doc = vars->Define(y, "x", object_path, frame); - - ICHECK_NE(x_doc->name, y_doc->name); -} - -TEST(PrinterVarTableTest, DefineDuplicateVariable) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - - vars->Define(x, "x", object_path, frame); - - bool failed = false; - try { - vars->Define(x, "x", object_path, frame); - } catch (...) { - failed = true; - } - ASSERT_EQ(failed, true); -} - -TEST(PrinterVarTableTest, DefineByDocWithIdDoc) { - VarTable vars; - MetadataFrame frame; - tir::Var x("x"); - ObjectPath object_path = ObjectPath::Root(); - - bool failed = false; - try { - // User has to use `Define` if variable needs to be mapped to IdDoc - vars->DefineByDoc( - x, []() { return IdDoc("x"); }, frame); - } catch (...) 
{ - failed = true; - } - ASSERT_EQ(failed, true); -} diff --git a/tests/python/unittest/test_tvmscript_printer_entry_point.py b/tests/python/unittest/test_tvmscript_printer_entry_point.py deleted file mode 100644 index 208386dbdd4a..000000000000 --- a/tests/python/unittest/test_tvmscript_printer_entry_point.py +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import pytest - -from tvm.error import TVMError -from tvm.script.printer import script -from tvm.tir import FloatImm - - -def test_as_script_unknown_ir(): - ir_node = FloatImm("float32", 1.0) - - with pytest.raises(TVMError) as e: - script(ir_node, "test_xyz", {}) - - assert "test_xyz" in str(e.value) diff --git a/tests/python/unittest/test_tvmscript_printer_frame.py b/tests/python/unittest/test_tvmscript_printer_frame.py deleted file mode 100644 index bd98d6445644..000000000000 --- a/tests/python/unittest/test_tvmscript_printer_frame.py +++ /dev/null @@ -1,60 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -from tvm.script.printer.frame import MetadataFrame - - -def test_frame_add_callback(): - frame = MetadataFrame() - - flag = 0 - - def callback1(): - nonlocal flag - flag += 1 - - def callback2(): - nonlocal flag - flag += 5 - - frame.add_exit_callback(callback1) - with frame: - frame.add_exit_callback(callback2) - assert flag == 0 - - assert flag == 6 - - -def test_frame_clear_callbacks_after_exit(): - frame = MetadataFrame() - - flag = 0 - - def callback(): - nonlocal flag - flag += 1 - - frame.add_exit_callback(callback) - - with frame: - pass - - assert flag == 1 - - with frame: - pass - - assert flag == 1 diff --git a/tests/python/unittest/test_tvmscript_printer_irdocsifier.py b/tests/python/unittest/test_tvmscript_printer_irdocsifier.py deleted file mode 100644 index d9d552ce4b9f..000000000000 --- a/tests/python/unittest/test_tvmscript_printer_irdocsifier.py +++ /dev/null @@ -1,123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import pytest - -from tvm.runtime import ObjectPath -from tvm.script.printer.doc import IdDoc -from tvm.script.printer.frame import MetadataFrame, VarDefFrame -from tvm.script.printer.ir_docsifier import IRDocsifier, RootNodeContainer -from tvm.tir import Var - - -@pytest.fixture -def ir_docsifier(): - """ - Creates an IRDocsifier instance with a special dispatch token. - """ - _ir_docsifier = IRDocsifier({}) - with _ir_docsifier.dispatch_token(f"{__file__}"): - yield _ir_docsifier - - -def _get_id_doc_printer(id_name): - def printer(obj, object_path, ir_docsifier): # pylint: disable=unused-argument - return IdDoc(id_name) - - return printer - - -def _root_dispatch_function(obj, ir_docsifier): - doc = ir_docsifier.as_doc(obj, ObjectPath.root()) - doc.source_paths = [ObjectPath.root().attr("irdocsifier_test")] - return doc - - -# Because the dispatch table is global, tests should only set dispatch function under -# unique dispatch token. -IRDocsifier.set_dispatch(Var, _get_id_doc_printer("x"), f"{__file__}") -IRDocsifier.set_root_dispatch(f"{__file__}", _root_dispatch_function) - - -def test_set_dispatch(ir_docsifier): - IRDocsifier.set_dispatch(Var, _get_id_doc_printer("x2"), f"{__file__}-2") - with ir_docsifier.dispatch_token(f"{__file__}-2"): - doc = ir_docsifier.as_doc(Var("x", dtype="int8"), ObjectPath.root()) - assert doc.name == "x2" - - doc = ir_docsifier.as_doc(Var("x", dtype="int8"), ObjectPath.root()) - assert doc.name == "x" - - -def test_set_root_dispatch(ir_docsifier): - doc = ir_docsifier.as_doc(RootNodeContainer(Var("x", dtype="int8")), ObjectPath.root()) - assert ObjectPath.root().attr("irdocsifier_test") in doc.source_paths - - -def test_as_doc(ir_docsifier): - object_path = ObjectPath.root() - doc = ir_docsifier.as_doc(Var("x", "int8"), ObjectPath.root()) - assert doc.name == "x" - assert list(doc.source_paths) == [object_path] - - -def test_with_dispatch_token(ir_docsifier): - initial_token_count = len(ir_docsifier.dispatch_tokens) - - with ir_docsifier.dispatch_token("tir"): - assert len(ir_docsifier.dispatch_tokens) == initial_token_count + 1 - - assert len(ir_docsifier.dispatch_tokens) == initial_token_count - - -def test_with_frame(ir_docsifier): - initial_frame_count = len(ir_docsifier.frames) - - frame = VarDefFrame() - is_callback_called = False - - def callback(): - nonlocal is_callback_called - is_callback_called = True - - frame.add_exit_callback(callback) - - with ir_docsifier.frame(frame): - assert len(ir_docsifier.frames) == initial_frame_count + 1 - assert not is_callback_called - - assert len(ir_docsifier.frames) == initial_frame_count - assert is_callback_called - - -def test_get_frame(ir_docsifier): - with ir_docsifier.frame(VarDefFrame()) as frame_a: - assert ir_docsifier.get_frame(MetadataFrame) is None - assert ir_docsifier.get_frame(VarDefFrame) == frame_a - - with 
ir_docsifier.frame(VarDefFrame()) as frame_b: - assert ir_docsifier.get_frame(MetadataFrame) is None - assert ir_docsifier.get_frame(VarDefFrame) == frame_b - - with ir_docsifier.frame(MetadataFrame()) as frame_c: - assert ir_docsifier.get_frame(MetadataFrame) == frame_c - assert ir_docsifier.get_frame(VarDefFrame) == frame_b - - assert ir_docsifier.get_frame(MetadataFrame) is None - assert ir_docsifier.get_frame(VarDefFrame) == frame_b - - assert ir_docsifier.get_frame(MetadataFrame) is None - assert ir_docsifier.get_frame(VarDefFrame) == frame_a diff --git a/tests/python/unittest/test_tvmscript_printer_var_table.py b/tests/python/unittest/test_tvmscript_printer_var_table.py deleted file mode 100644 index eab63a08ddad..000000000000 --- a/tests/python/unittest/test_tvmscript_printer_var_table.py +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This file tests the FFI binding of script.printer.VarTable. -These only make sure parameter can be passed to the C++ functions -correctly. The test for the functionality of VarTable is in C++. 
-""" - -from tvm.runtime import ObjectPath -from tvm.script.printer.doc import LiteralDoc -from tvm.script.printer.frame import VarDefFrame -from tvm.script.printer.var_table import VarTable -from tvm.tir import Var - - -def test_define(): - var_table = VarTable() - var_name = "a" - var_obj = Var(var_name, dtype="int32") - object_path = ObjectPath.root().attr("a") - frame = VarDefFrame() - - id_doc = var_table.define(var_obj, var_name, object_path, frame) - - assert id_doc.name == "a" - assert list(id_doc.source_paths) == [object_path] - - id_doc = var_table.get_var_doc(var_obj, object_path) - - assert id_doc.name == "a" - assert list(id_doc.source_paths) == [object_path] - - -def test_define_by_doc(): - var_table = VarTable() - var_name = "a" - var_obj = Var(var_name, dtype="int32") - object_path = ObjectPath.root().attr("a") - frame = VarDefFrame() - - var_table.define_by_doc(var_obj, lambda: LiteralDoc(var_name), frame) - - var_doc = var_table.get_var_doc(var_obj, object_path) - - assert isinstance(var_doc, LiteralDoc) - assert var_doc.value == var_name - assert list(var_doc.source_paths) == [object_path] - - -def test_is_var_defined(): - var_table = VarTable() - a = Var("a", dtype="int32") - object_path = ObjectPath.root().attr("a") - frame = VarDefFrame() - - var_table.define(a, "a", object_path, frame) - - assert var_table.is_var_defined(a) - assert a in var_table - - -def test_var_out_of_scope(): - var_table = VarTable() - var_name = "a" - var_obj = Var(var_name, dtype="int32") - object_path = ObjectPath.root().attr("a") - frame = VarDefFrame() - - var_table.define(var_obj, var_name, object_path, frame) - - with frame: - assert var_obj in var_table - - assert var_obj not in var_table - assert var_table.get_var_doc(var_obj, object_path) is None From c78bc8aaa6c63f3f2d260f6aad11a8d06f8c3ed1 Mon Sep 17 00:00:00 2001 From: multiverstack <39256082+multiverstack-intellif@users.noreply.github.com> Date: Mon, 9 Jan 2023 12:00:55 +0800 Subject: [PATCH 131/286] [TIR][Arith] Add common sub expr analyzer (#13702) * [TIR][Arith] Add common sub expr analyzer * Update python/tvm/arith/pattern.py Co-authored-by: Siyuan Feng * Update src/arith/detect_common_subexpr.cc Co-authored-by: Siyuan Feng * Update python/tvm/arith/pattern.py Co-authored-by: Siyuan Feng * Update python/tvm/arith/pattern.py Co-authored-by: Siyuan Feng * Update src/arith/detect_common_subexpr.cc Co-authored-by: Siyuan Feng * Update detect_common_subexpr.cc * Update pattern.py * Update pattern.py * Update pattern.py * Update pattern.py Co-authored-by: Min Chen Co-authored-by: Siyuan Feng --- python/tvm/arith/__init__.py | 2 +- python/tvm/arith/pattern.py | 23 ++++++ src/arith/detect_common_subexpr.cc | 74 +++++++++++++++++++ .../transforms/common_subexpr_elim_tools.cc | 6 +- .../transforms/common_subexpr_elim_tools.h | 3 +- .../python/unittest/test_arith_detect_cse.py | 33 +++++++++ 6 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 src/arith/detect_common_subexpr.cc create mode 100644 tests/python/unittest/test_arith_detect_cse.py diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py index 03c0769850c9..423aafe5d69f 100644 --- a/python/tvm/arith/__init__.py +++ b/python/tvm/arith/__init__.py @@ -25,7 +25,7 @@ ) from .analyzer import ModularSet, ConstIntBound, Analyzer from .bound import deduce_bound -from .pattern import detect_linear_equation, detect_clip_bound +from .pattern import detect_linear_equation, detect_clip_bound, detect_common_subexpr from .int_solver import 
solve_linear_equations, solve_linear_inequalities
 from .iter_affine_map import IterMapExpr, IterMark, IterSplitExpr, IterSumExpr
 from .iter_affine_map import (
diff --git a/python/tvm/arith/pattern.py b/python/tvm/arith/pattern.py
index 53f8eb62b6e1..3c822dc52399 100644
--- a/python/tvm/arith/pattern.py
+++ b/python/tvm/arith/pattern.py
@@ -15,6 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 """Detect common patterns."""
+
+from typing import Dict
+
+from tvm.tir import PrimExpr
 from . import _ffi_api
 
 
@@ -58,3 +62,22 @@ def detect_clip_bound(expr, var_list):
         An empty list if the match failed.
     """
     return _ffi_api.DetectClipBound(expr, var_list)
+
+
+def detect_common_subexpr(expr: PrimExpr, threshold: int) -> Dict[PrimExpr, int]:
+    """Detect common subexpressions that show up at least `threshold` times
+
+    Parameters
+    ----------
+    expr : PrimExpr
+        The expression to be analyzed.
+
+    threshold : int
+        The minimum number of occurrences for an expression to be reported
+        as a common subexpression
+
+    Returns
+    -------
+    cse_dict : Dict[PrimExpr, int]
+        The detected common subexpressions, mapped to their occurrence counts
+    """
+    return _ffi_api.DetectCommonSubExpr(expr, threshold)
diff --git a/src/arith/detect_common_subexpr.cc b/src/arith/detect_common_subexpr.cc
new file mode 100644
index 000000000000..b496e7fefca5
--- /dev/null
+++ b/src/arith/detect_common_subexpr.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file detect_common_subexpr.cc
+ * \brief Utility to detect common subexpressions.
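+ *
+ * For example (illustrative): in `(x + y) * z + (x + y)` the subexpression
+ * `x + y` occurs twice, so a threshold of 2 reports it with a count of 2.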
+ */ +#include + +#include + +#include "../tir/transforms/common_subexpr_elim_tools.h" + +namespace tvm { +namespace arith { + +using namespace tir; + +Map DetectCommonSubExpr(const PrimExpr& e, int thresh) { + // Check the threshold in the range of size_t + CHECK_GE(thresh, std::numeric_limits::min()); + CHECK_LE(thresh, std::numeric_limits::max()); + size_t repeat_thr = static_cast(thresh); + auto IsEligibleComputation = [](const PrimExpr& expr) { + return (SideEffect(expr) <= CallEffectKind::kPure && CalculateExprComplexity(expr) > 1 && + (expr.as() == nullptr) && (expr.as() == nullptr)); + }; + + // Analyze the sub expressions + ComputationTable table_syntactic_comp_done_by_expr = ComputationsDoneBy::GetComputationsDoneBy( + e, IsEligibleComputation, [](const PrimExpr& expr) { return true; }); + + std::vector> semantic_comp_done_by_expr = + SyntacticToSemanticComputations(table_syntactic_comp_done_by_expr, true); + + // Find eligible sub expr if occurrence is under thresh + for (size_t i = 0; i < semantic_comp_done_by_expr.size(); i++) { + std::pair& computation_and_nb = semantic_comp_done_by_expr[i]; + if (computation_and_nb.second < repeat_thr) { + std::vector direct_subexprs = + DirectSubexpr::GetDirectSubexpressions(computation_and_nb.first, IsEligibleComputation, + [](const PrimExpr& expr) { return true; }); + InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_expr, direct_subexprs, true, + computation_and_nb.second); + } + } + + // Return the common sub expr that occur more than thresh times + Map results; + for (auto& it : semantic_comp_done_by_expr) { + if (it.second >= repeat_thr) results.Set(it.first, it.second); + } + return results; +} + +TVM_REGISTER_GLOBAL("arith.DetectCommonSubExpr").set_body_typed(DetectCommonSubExpr); +} // namespace arith +} // namespace tvm diff --git a/src/tir/transforms/common_subexpr_elim_tools.cc b/src/tir/transforms/common_subexpr_elim_tools.cc index 130004c51cd8..c118d1db7d8e 100644 --- a/src/tir/transforms/common_subexpr_elim_tools.cc +++ b/src/tir/transforms/common_subexpr_elim_tools.cc @@ -902,7 +902,7 @@ void InsertElemToSortedSemanticComputations(std::vector>* sorted_vec, const std::vector& vec_to_add, - bool identify_equiv_terms) { + bool identify_equiv_terms, size_t increase_count) { if (sorted_vec == nullptr) { return; } @@ -918,10 +918,10 @@ void InsertVectorToSortedSemanticComputations(std::vectorend()) { // then we just increase its associated count - it_found->second++; + it_found->second += increase_count; } else { // Otherwise we add the pair (`elem_to_add`,1) at the right place - InsertElemToSortedSemanticComputations(sorted_vec, {elem_to_add, 1}); + InsertElemToSortedSemanticComputations(sorted_vec, {elem_to_add, increase_count}); } } } diff --git a/src/tir/transforms/common_subexpr_elim_tools.h b/src/tir/transforms/common_subexpr_elim_tools.h index 0871fd009149..841f1d65a6f6 100644 --- a/src/tir/transforms/common_subexpr_elim_tools.h +++ b/src/tir/transforms/common_subexpr_elim_tools.h @@ -210,9 +210,10 @@ template std::vector VectorMap(const std::vector void InsertElemToSortedSemanticComputations(std::vector>* sorted_vec, const std::pair& pair); + void InsertVectorToSortedSemanticComputations(std::vector>* sorted_vec, const std::vector& vec_to_add, - bool identify_equiv_terms); + bool identify_equiv_terms, size_t increase_count = 1); } // namespace tir } // namespace tvm diff --git a/tests/python/unittest/test_arith_detect_cse.py b/tests/python/unittest/test_arith_detect_cse.py new file mode 100644 index 
000000000000..eba0920cb2da
--- /dev/null
+++ b/tests/python/unittest/test_arith_detect_cse.py
@@ -0,0 +1,33 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+import tvm.testing
+from tvm.script import tir as T
+
+
+def test_detect_cs():
+    x = T.Var("x", dtype="int32")
+    y = T.Var("y", dtype="int32")
+    z = T.Var("z", dtype="int32")
+    c = T.floor(x + y + 0.5) + x + z * (T.floor(x + y + 0.5))
+    m = tvm.arith.detect_common_subexpr(c, 2)
+    assert c.a.a in m
+    assert m[c.a.a] == 2
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 9c2f24999e068c4dedef2e2c392bdddc00b6a867 Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom
Date: Mon, 9 Jan 2023 16:30:44 +0000
Subject: [PATCH 132/286] [microTVM] Replace arm_nnsupportfunctions.h with
 arm_acle.h (#13363)

* [microTVM] Replace arm_nnsupportfunctions.h with arm_acle.h

This attempts to replace the CMSIS-NN header with a more portable
alternative and avoid dependence on CMSIS

* Remove CMSIS __STATIC_FORCEINLINE macro

* Replace more intrinsics with ACLE variants

* Use builtins for intrinsics missing in older GCC

* Re-use common_includes to propagate shared functions

The packing definitions aren't implemented as ACLE intrinsics, nor is
there a simple way to convince a C compiler to generate them.

* Properly align memory access

Introduce `memcpy` to explain to the compiler that we're changing the
alignment of `int16_t` to `int32_t`. What this appears to actually do
is encourage the compiler to use three loads rather than one double
load plus a regular load. The padded array is aligned as an `int16_t`;
it isn't guaranteed to behave like an `int32_t`-aligned array.
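As a minimal sketch of the two patterns (an editorial illustration with
made-up names and sizes, not code taken from this patch):

    #include <stdint.h>
    #include <string.h>

    /* Before: the cast claims int32_t alignment that the int16_t buffer
     * does not guarantee - undefined behaviour. */
    static int32_t sum_cast(const int16_t *buf) {
        const int32_t *words = (const int32_t *)buf;
        return words[0] + words[1];
    }

    /* After: memcpy makes the reinterpretation explicit, so the compiler
     * emits loads that respect the buffer's real alignment. */
    static int32_t sum_memcpy(const int16_t *buf) {
        int32_t words[2];
        memcpy(words, buf, sizeof(words));
        return words[0] + words[1];
    }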
One of the side effects of the type punning from `int16_t*` to `int32_t*` is that we're effectively lying to the compiler that this is correctly aligned and it can use instructions which load multiple `int32_t`s at the same time - this does not work :crying_cat_face: Co-authored-by: Ashutosh Parkhi --- .../mprofile/dsp/micro_kernel/avg_pool.py | 6 +- .../mprofile/dsp/micro_kernel/common.py | 34 +++++++++- .../arm_cpu/mprofile/dsp/micro_kernel/gemm.py | 66 ++++++++++++------- .../mprofile/dsp/micro_kernel/max_pool.py | 10 +-- .../micro_kernel/multi_channel_convolve.py | 19 +++--- 5 files changed, 93 insertions(+), 42 deletions(-) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py index 786ac2607b7f..e8e45152aae7 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py @@ -101,7 +101,7 @@ def sum_impl(N, uniq_id): #ifdef __cplusplus extern "C" #endif // __cplusplus -__STATIC_FORCEINLINE int32_t sum16_reset_{uniq_id}( +__attribute__((always_inline)) static inline int32_t sum16_reset_{uniq_id}( int16_t *res) {{ *res = (int16_t)0; return 0; @@ -110,7 +110,7 @@ def sum_impl(N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t sum16_{N}_{uniq_id}( +__attribute__((always_inline)) static inline int32_t sum16_{N}_{uniq_id}( int16_t *arr, int16_t *res16, long arr_offset, @@ -129,7 +129,7 @@ def sum_impl(N, uniq_id): }} for ( int i = 0; i < n / 2; ++ i ) {{ - res = __SMLAD(*p32, 0x00010001, res); + res = __smlad(*p32, 0x00010001, res); ++ p32; }} diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py index 0398844315a7..e89bf7c1b4fc 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py @@ -24,10 +24,42 @@ #include #include -#include +#include #include + +#ifndef ARM_CPU_INTRINSICS_EXIST +#define ARM_CPU_INTRINSICS_EXIST +__attribute__((always_inline)) uint32_t __ror(uint32_t op1, uint32_t op2) +{ + op2 %= 32U; + if (op2 == 0U) + { + return op1; + } + return (op1 >> op2) | (op1 << (32U - op2)); +} + +#define __pkhbt(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + __asm("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) + +#define __pkhtb(ARG1,ARG2,ARG3) \ +__extension__ \ +({ \ + uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \ + if (ARG3 == 0) \ + __asm("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \ + else \ + __asm("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \ + __RES; \ + }) +#endif """ MICRO_WORD_LENGTH_BITS = 32 diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py index f1c0e3ea8d6d..0f448095027b 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py @@ -132,12 +132,30 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): cc_code = ( common.common_includes + f""" +#ifndef ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS +#define ARM_CPU_MPROFILE_READ_AND_PAD_EXISTS +__attribute__((always_inline)) static inline const int8_t *read_and_pad(const int8_t *source, int32_t *out1, int32_t *out2) +{{ + int32_t inA; + memcpy(&inA, source, 4); + source 
+= 4; + + int32_t inAbuf1 = __sxtb16(__ror((uint32_t)inA, 8)); + int32_t inAbuf2 = __sxtb16(inA); + *out2 = (int32_t)(__pkhtb(inAbuf1, inAbuf2, 16)); + *out1 = (int32_t)(__pkhbt(inAbuf2, inAbuf1, 16)); + + return source; +}} +#endif +""" + + f""" #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_body_rest_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_body_rest_{uniq_id}( int K, int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ @@ -180,7 +198,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ for (int i = 0; i < {M}; i++) {{ @@ -201,7 +219,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_body_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; @@ -226,7 +244,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; int32_t sum = 0; for (int l = 0; l < 2 * ({K} / 4); l++) {{ - sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + sum = __smlad(*aa_ptr, *bb_ptr, sum); ++ aa_ptr; ++ bb_ptr; }} // NOTE: this is the line where `*_body` differs from `*_update`. here @@ -246,7 +264,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_update_rest_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{N}_update_rest_{uniq_id}( int K, int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ @@ -289,7 +307,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ for (int i = 0; i < {M}; i++) {{ @@ -307,7 +325,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_update_{uniq_id}( int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; @@ -332,7 +350,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}]; int32_t sum = 0; for (int l = 0; l < 2 * ({K} / 4); l++) {{ - sum = __SMLAD(*aa_ptr, *bb_ptr, sum); + sum = __smlad(*aa_ptr, *bb_ptr, sum); ++ aa_ptr; ++ bb_ptr; }} cc[i*C_stride + j] += sum; @@ -349,7 +367,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_body_rest_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_body_rest_{uniq_id}( int K, int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ @@ -367,7 +385,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}( +__attribute__((always_inline)) static inline int32_t 
gemm16_{M}x{K}x{N}_body_loop_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ for (int i = 0; i < {M}; i++) {{ @@ -388,7 +406,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int32_t retcode = 0; @@ -405,13 +423,14 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ - int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; - int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t aa_vector[{K} / 2]; + int32_t bb_vector[{K} / 2]; + memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector)); + memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector)); int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD(*aa_ptr, *bb_ptr, sum); - ++ aa_ptr; ++ bb_ptr; + sum = __smlad(aa_vector[l], bb_vector[l], sum); }} // NOTE: this is the line where `*_body` differs from `*_update`. here // we're *setting* the result, instead of accumulating, because we know @@ -430,7 +449,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_update_rest_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm16_{M}x{N}_update_rest_{uniq_id}( int K, int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ @@ -448,7 +467,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ for (int i = 0; i < {M}; i++) {{ @@ -466,7 +485,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( +__attribute__((always_inline)) static inline int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int32_t retcode = 0; @@ -478,13 +497,14 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ - int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; - int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; + int32_t aa_vector[{K} / 2]; + int32_t bb_vector[{K} / 2]; + memcpy(&aa_vector, &aa[i * A_stride], sizeof(aa_vector)); + memcpy(&bb_vector, &bb[j * B_stride], sizeof(bb_vector)); int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ - sum = __SMLAD(*aa_ptr, *bb_ptr, sum); - ++ aa_ptr; ++ bb_ptr; + sum = __smlad(aa_vector[l], bb_vector[l], sum); }} cc[i*C_stride + j] += sum; }} @@ -500,7 +520,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{ +__attribute__((always_inline)) static inline int32_t gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C_stride) {{ for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ cc[i*C_stride + j] = 0; diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py index 4d410427c0cc..66d712a4a0a2 100644 --- 
a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py @@ -94,7 +94,7 @@ def max_impl(uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t max8_reset_{uniq_id}( +__attribute__((always_inline)) static inline int32_t max8_reset_{uniq_id}( int8_t *res, int N) {{ memset(res, (int8_t)-128, N * sizeof(*res)); @@ -104,7 +104,7 @@ def max_impl(uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t max8_loop_{uniq_id}( +__attribute__((always_inline)) static inline int32_t max8_loop_{uniq_id}( int8_t *arg, int8_t *res, int N) {{ @@ -117,7 +117,7 @@ def max_impl(uniq_id): #ifdef __cplusplus extern "C" #endif -__STATIC_FORCEINLINE int32_t max8_{uniq_id}( +__attribute__((always_inline)) static inline int32_t max8_{uniq_id}( int8_t *arg, int8_t *res, int N) {{ @@ -146,8 +146,8 @@ def max_impl(uniq_id): for ( int i = 0; i < N / 4; ++ i ) {{ int32_t arg32 = *parg32 ++; int32_t res32 = *pres32; - __SSUB8(arg32, res32); - res32 = __SEL(arg32, res32); + __ssub8(arg32, res32); + res32 = __sel(arg32, res32); *pres32 ++ = res32; }} diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py index 992d90578046..25588964eeaf 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py @@ -23,7 +23,7 @@ import textwrap from tvm import te, tir -from .common import num_simd_lanes_per_word +from .common import num_simd_lanes_per_word, common_includes def _get_func_name(in_dtype, tensor_w, channels, kernel_h, kernel_w, suffix): @@ -107,10 +107,8 @@ def multi_channel_convolve_impl(in_dtype, *args) -> str: def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix): return textwrap.dedent( ( - f""" - #include - #include - + common_includes + + f""" // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction #define TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP( \ @@ -120,13 +118,13 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke \ uint32_t kernel_c3210 = *arranged_kernel++; \ \ - uint32_t tensor_c20 = __SXTB16(tensor_c3210); \ - uint32_t kernel_c20 = __SXTB16(kernel_c3210); \ + uint32_t tensor_c20 = __sxtb16(tensor_c3210); \ + uint32_t kernel_c20 = __sxtb16(kernel_c3210); \ sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \ sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \ \ - uint32_t tensor_c31 = __SXTB16(__ROR(tensor_c3210, 8)); \ - uint32_t kernel_c31 = __SXTB16(__ROR(kernel_c3210, 8)); \ + uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \ + uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \ sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \ sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \ }} @@ -172,7 +170,8 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, kernel_w, suffix): return textwrap.dedent( ( - f""" + common_includes + + f""" #include /* We do four channels at once to get this speed boost. 
*/ From ba1e41c1f0f8847d5e6a5429a277e0fae932afe7 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Mon, 9 Jan 2023 17:48:03 +0000 Subject: [PATCH 133/286] [AOT] Added a test for detecting output size post MLF export (#13655) Follow up: https://github.com/apache/tvm/pull/12789 -Added a separate test to detect output size from MLF codegen --- python/tvm/micro/model_library_format.py | 10 ++++- tests/python/relay/aot/test_crt_aot.py | 57 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 263371cda171..7ad2c531f292 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -485,6 +485,12 @@ def _export_graph_model_library_format( "functions" ]["main"][0]["outputs"][key] + input_name_to_size_map = { + name: property_map["size"] for name, property_map in inputs_sizes.items() + } + output_name_to_size_map = { + name: property_map["size"] for name, property_map in output_sizes.items() + } generate_c_interface_header( mod.libmod_name, inputs, @@ -494,8 +500,8 @@ def _export_graph_model_library_format( devices, workspace_size, include_path, - inputs_sizes, - output_sizes, + input_name_to_size_map, + output_name_to_size_map, ) is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index c3426f147e0d..b3db410156b3 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -225,6 +225,63 @@ def test_packed_global_variables(): assert f"{func}_packed" not in tvmgen_names +def test_io_size_definition(): + """Check network IO size definitions in the codegen output.""" + dtype = "float32" + ishape = (1, 32, 14, 14) + wshape = (32, 32, 3, 3) + interface_api = "c" + use_unpacked_api = True + + data0 = relay.var("data", shape=ishape, dtype=dtype) + weight0 = relay.var("weight", shape=wshape, dtype=dtype) + out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=1) + main_f = relay.Function([data0, weight0], out) + mod = tvm.IRModule() + mod["main"] = main_f + mod = transform.InferType()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w_data = np.random.uniform(0, 1, wshape).astype(dtype) + + inputs = OrderedDict([("data", i_data), ("weight", w_data)]) + + output_list = generate_ref_data(mod, inputs) + compiled_models_list = compile_models( + models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + interface_api=interface_api, + use_unpacked_api=use_unpacked_api, + workspace_byte_alignment=8, + enable_op_fusion=True, + pass_config=AOT_DEFAULT_RUNNER.pass_config, + use_runtime_executor=True, + target=tvm.target.Target("c"), + ) + dtype_itemsize = np.dtype(dtype).itemsize + ref_input_size = i_data.size * dtype_itemsize + ref_weight_size = w_data.size * dtype_itemsize + ref_output_size = output_list["output"].size * dtype_itemsize + compiled_model = compiled_models_list[0] + + tmp_path = utils.tempdir() + base_path = tmp_path.temp_dir + + model = compiled_model.model + tar_file = os.path.join(base_path, f"{model.name}.tar") + export_model_library_format(compiled_model.executor_factory, tar_file) + t = tarfile.open(tar_file) + t.extractall(base_path) + + header_path = f"{base_path}/codegen/host/include/tvmgen_{model.name}.h" + with open(header_path, "r") as header: + contents = 
header.readlines() + contents = "".join(map(str, contents)) + assert contents.count("_SIZE") == 4 + assert f"TVMGEN_DEFAULT_DATA_SIZE {ref_input_size}" in contents + assert f"TVMGEN_DEFAULT_WEIGHT_SIZE {ref_weight_size}" in contents + assert f"TVMGEN_DEFAULT_OUTPUT_SIZE {ref_output_size}" in contents + + @parametrize_aot_options def test_concatenate(interface_api, use_unpacked_api, test_runner): """Tests compilation of concatenate""" From 9abe71cef106d69937d6d5502d75a3033fe87f3c Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 9 Jan 2023 09:56:27 -0800 Subject: [PATCH 134/286] [Fix,Roofline] Fix roofline handling of multiple peak flops (#13716) In the switch to multiple possible peakflops measurement, the logic to add all of them was skipped. Instead only the last was added. --- python/tvm/utils/roofline/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py index 67cf1133ffa5..67d80eb05284 100644 --- a/python/tvm/utils/roofline/__init__.py +++ b/python/tvm/utils/roofline/__init__.py @@ -145,6 +145,7 @@ def roofline_from_existing( if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys() } + new_configuration = dict(report.configuration.items()) new_calls = [] for call in report.calls: if "Hash" in call.keys() and call["Hash"] in all_features: @@ -159,6 +160,10 @@ def roofline_from_existing( loaded_bytes, peak_bandwidth, bandwidth_name = registry.estimate_peak_bandwidth( prim, features, target, dev, remote ) + new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops) + new_configuration[ + f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)" + ] = profiling.Ratio(peak_bandwidth) ridge_point = peak_flops / peak_bandwidth runtime = call["Duration (us)"].microseconds * 1e-6 @@ -180,11 +185,6 @@ def roofline_from_existing( new_calls.append(call) else: new_calls.append(call) - new_configuration = dict(report.configuration.items()) - new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops) - new_configuration[ - f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)" - ] = profiling.Ratio(peak_bandwidth) return profiling.Report(new_calls, report.device_metrics, new_configuration) From bf7d667cf96ab3ad95cbd2b4878bb1e1adab1ae5 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Mon, 9 Jan 2023 19:42:19 +0000 Subject: [PATCH 135/286] Add support for named outputs in MLF archive (#13704) Following from #12789, this adds support for determining the output tensor name from the input model within the MLF metadata json. 
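For example (an editorial sketch mirroring the tests added below; the
attribute name is real, the tensor names are illustrative), preferred
output names are attached to the Relay main function as attributes:

    attrs = tvm.ir.make_node("DictAttrs", output_tensor_names=["my_output"])
    main_func = tvm.relay.Function([input_a], output_1, attrs=attrs)

After export_model_library_format, the "outputs" entry for "main" in
metadata.json is then keyed by "my_output" instead of the default
"output0".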
Co-authored-by: Ashutosh Parkhi --- python/tvm/micro/model_library_format.py | 119 ++++++++---------- .../test_micro_model_library_format.py | 84 +++++++++++++ 2 files changed, 135 insertions(+), 68 deletions(-) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 7ad2c531f292..2e4b00da289c 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -26,7 +26,6 @@ import typing import tvm -from tvm.ir.type import TupleType from tvm.micro import get_standalone_crt_dir from .._ffi import get_global_func from ..contrib import utils @@ -217,6 +216,29 @@ def _create_type_metadata(input_type): } +def _flatten_tuple_outputs(ret_type, predefined_names, offset=0): + if isinstance(ret_type, tvm.ir.tensor_type.TensorType): + name = predefined_names[offset] if predefined_names else f"output{offset}" + return {name: ret_type} + + added_fields = len(ret_type.fields) + outputs = {} + for output_index in range(added_fields): + next_output = offset + len(outputs) + outputs.update( + _flatten_tuple_outputs(ret_type.fields[output_index], predefined_names, next_output) + ) + + return outputs + + +def _get_outputs_from_ret_type(ret_type, predefined_names): + if isinstance(ret_type, tvm.ir.tensor_type.TensorType): + name = predefined_names[0] if predefined_names else "output" + return {name: ret_type} + return _flatten_tuple_outputs(ret_type, predefined_names) + + def _build_function_memory_map(function_metadata): """Build a simple map that shows how much workspace is required to execute each primitive function. The main_func describes how much memory is required @@ -297,29 +319,25 @@ def _create_empty_entry(target_device_type): target_main_entries[int(target.get_target_device_type())] = _create_empty_entry( int(target.get_target_device_type()) ) - target_main_entries[int(target.get_target_device_type())]["io_size_bytes"] = int( - main_func_metadata.io_sizes[target] - ) + target_main_on_device = target_main_entries[int(target.get_target_device_type())] + target_main_on_device["io_size_bytes"] = int(main_func_metadata.io_sizes[target]) - # Now, we also add the information about the size of each input and output of the main - # function (in bytes) - input_dict = {} - for input_param in main_func_metadata.relay_primfuncs[target].params: - input_dict[input_param.name_hint] = _create_type_metadata(input_param.checked_type) - target_main_entries[int(target.get_target_device_type())]["inputs"] = input_dict - - output_dict = {} - # For output, we dont have the name of the output, so we enumerate them - if isinstance(main_func_metadata.relay_primfuncs[target].ret_type, tvm.ir.type.TupleType): - output_list = _convert_tuple_to_outputs( - main_func_metadata.relay_primfuncs[target].ret_type - ) - for i, output_type in enumerate(output_list): - output_dict[f"output{i}"] = _create_type_metadata(output_type) - else: - output_type = main_func_metadata.relay_primfuncs[target].ret_type - output_dict["output"] = _create_type_metadata(output_type) - target_main_entries[int(target.get_target_device_type())]["outputs"] = output_dict + main_relay_func = main_func_metadata.relay_primfuncs[target] + target_main_on_device["inputs"] = { + input_param.name_hint: _create_type_metadata(input_param.checked_type) + for input_param in main_relay_func.params + } + predefined_names = ( + main_relay_func.attrs["output_tensor_names"] + if "output_tensor_names" in main_relay_func.attrs + else None + ) + target_main_on_device["outputs"] = { + name: 
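+        # Byte sizes keyed by tensor name; these feed the generated C
+        # interface header via generate_c_interface_header below.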
_create_type_metadata(output_type) + for name, output_type in _get_outputs_from_ret_type( + main_relay_func.ret_type, predefined_names + ).items() + } ret = { "operator_functions": func_entries, @@ -328,30 +346,6 @@ def _create_empty_entry(target_device_type): return ret -def _get_main_relay_func(mod: executor_factory.ExecutorFactoryModule): - main_func = mod.function_metadata[MAIN_FUNC_NAME_STR] - target = list(main_func.relay_primfuncs.keys())[0] - return main_func.relay_primfuncs[target] - - -def _convert_tuple_to_outputs(ret_type, offset=0): - outputs = [] - added_fields = len(ret_type.fields) - for output_index in range(added_fields): - next_output = offset + len(outputs) - if isinstance(ret_type.fields[output_index], TupleType): - outputs.extend(_convert_tuple_to_outputs(ret_type.fields[output_index], next_output)) - else: - outputs.append(ret_type.fields[output_index]) - return outputs - - -def _get_inputs_and_outputs_from_module(mod): - inputs = [str(input_var.name) for input_var in mod.executor_codegen_metadata.inputs] - outputs = list(mod.executor_codegen_metadata.outputs) - return inputs, outputs - - def _get_pools_from_module(mod): return list(dict(mod.executor_codegen_metadata.pool_inputs).values()) @@ -462,28 +456,17 @@ def _export_graph_model_library_format( if not include_path.exists(): include_path.mkdir() - inputs, outputs = _get_inputs_and_outputs_from_module(mod) devices = mod.get_devices() pools = _get_pools_from_module(mod) io_pool_allocations = _get_io_pool_allocation_from_module(mod) - workspace_size = int( - metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][ - "workspace_size_bytes" - ] - ) - inputs_sizes = metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][ - "inputs" - ] - # Here, we merge the output sizes with the actual output names - output_sizes = {} - for i, key in enumerate( - metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][ - "outputs" - ].keys() - ): - output_sizes[outputs[i]] = metadata["modules"][mod.libmod_name]["memory"][ - "functions" - ]["main"][0]["outputs"][key] + main_func = metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0] + workspace_size = int(main_func["workspace_size_bytes"]) + inputs = main_func["inputs"] + outputs = main_func["outputs"] + inputs_sizes = {name: property_map["size"] for name, property_map in inputs.items()} + output_sizes = {name: property_map["size"] for name, property_map in outputs.items()} + input_names = list(inputs.keys()) + output_names = list(outputs.keys()) input_name_to_size_map = { name: property_map["size"] for name, property_map in inputs_sizes.items() @@ -493,8 +476,8 @@ def _export_graph_model_library_format( } generate_c_interface_header( mod.libmod_name, - inputs, - outputs, + input_names, + output_names, pools, io_pool_allocations, devices, diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index e8ffed82062e..e664c2ebb858 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -632,5 +632,89 @@ def test_multiple_relay_modules_aot_graph(): assert metadata["version"] == _GENERATED_VERSION +@tvm.testing.requires_micro +def test_output_name_single(): + """Generate a conv2d Relay module for testing.""" + input_a = tvm.relay.var("input_a", shape=(3, 4, 5), dtype="int64") + output_1 = input_a + tvm.relay.const(1, "int64") + attrs = tvm.ir.make_node("DictAttrs", 
output_tensor_names=["test_output_a"]) + main_func = tvm.relay.Function([input_a], output_1, attrs=attrs) + mod = tvm.IRModule.from_expr(main_func) + mod = tvm.relay.transform.InferType()(mod) + + executor = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format(factory, mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + with open(os.path.join(extract_dir, "metadata.json")) as f: + metadata = json.load(f) + + assert metadata["modules"]["mod1"]["memory"]["functions"]["main"][0]["outputs"] == { + "test_output_a": {"size": 480, "dtype": "int64"} + } + + +@tvm.testing.requires_micro +def test_output_names_many(): + """Generate a conv2d Relay module for testing.""" + input_a = tvm.relay.var("input_a", shape=(3, 4, 5), dtype="int64") + input_b = tvm.relay.var("input_b", shape=(3, 4), dtype="int32") + input_c = tvm.relay.var("input_c", shape=(3,), dtype="float32") + + output_1 = input_a + tvm.relay.const(1, "int64") + output_2 = input_b + tvm.relay.const(2) + output_3 = input_b + tvm.relay.const(3) + output_4 = input_c + tvm.relay.const(4.0) + + full_output = tvm.relay.Tuple( + [output_1, tvm.relay.Tuple([tvm.relay.Tuple([output_2, output_3]), output_4])] + ) + attrs = tvm.ir.make_node( + "DictAttrs", + output_tensor_names=["test_output_a", "test_output_b", "test_output_c", "test_output_d"], + ) + main_func = tvm.relay.Function([input_a, input_b, input_c], full_output, attrs=attrs) + mod = tvm.IRModule.from_expr(main_func) + mod = tvm.relay.transform.InferType()(mod) + + executor = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format(factory, mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + with open(os.path.join(extract_dir, "metadata.json")) as f: + metadata = json.load(f) + + assert metadata["modules"]["mod1"]["memory"]["functions"]["main"][0]["outputs"] == { + "test_output_a": {"size": 480, "dtype": "int64"}, + "test_output_b": {"size": 48, "dtype": "int32"}, + "test_output_c": {"size": 48, "dtype": "int32"}, + "test_output_d": {"size": 12, "dtype": "float32"}, + } + + if __name__ == "__main__": tvm.testing.main() From f8d2311849304a4bac92f7156d60f46bb34f970d Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Mon, 9 Jan 2023 12:02:44 -0800 Subject: [PATCH 136/286] [HotFix][docs] Use correct Colab button URL (#13725) * [docs] Use correct Colab button URL * Black format conf.py --- docs/conf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 357df8cef12c..18c634c05d05 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -153,7 +153,9 @@ def split_code_and_text_blocks(source_file, return_node, real_func): 
IPYTHON_GITHUB_BASE = "apache/tvm-site/blob/asf-site/docs/_downloads/"

 # The SVG image of the "Open in Colab" button.
-BUTTON = "https://raw.githubusercontent.com/apache/web-data/main/images/utilities/colab_button.svg"
+BUTTON = (
+    "https://raw.githubusercontent.com/tlc-pack/web-data/main/images/utilities/colab_button.svg"
+)


 @monkey_patch("sphinx_gallery.gen_rst", "save_rst_example")

From b075bdab1da9043efbfa720480844b4f2035fcd5 Mon Sep 17 00:00:00 2001
From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com>
Date: Tue, 10 Jan 2023 01:24:59 +0400
Subject: [PATCH 137/286] [microNPU] Add support for TFLite PAD (#13732)

A separate nn.pad relay operator is legalized to an Ethos-U
depthwise_conv2d operator. For ethosu_depthwise_conv2d the hardware only
supports padding up to 31, 31, 32, 32, so the pad size for legalization
on the NPU must stay within these limits.
---
 .../relay/backend/contrib/ethosu/legalize.py  | 57 +++++++++++
 .../tvm/relay/backend/contrib/ethosu/util.py  | 10 +++
 python/tvm/relay/op/contrib/ethosu.py         | 85 +++++++++++++++++
 .../contrib/test_ethosu/test_codegen.py       | 23 +++++
 4 files changed, 175 insertions(+)

diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py
index e261f129bf50..fdd465529123 100644
--- a/python/tvm/relay/backend/contrib/ethosu/legalize.py
+++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py
@@ -1355,6 +1355,62 @@ def callback(self, pre, post, node_map):
         return ethosu_fc


+class PadRewriter(DFPatternCallback):
+    """Convert ethos-u.pad2d composite function to ethosu_depthwise_conv2d
+    operator"""
+
+    def __init__(self):
+        super().__init__(require_type=True)
+        self.pattern = (
+            wildcard().has_attr({"Composite": ethosu_patterns.PadParams.composite_name})
+        )(wildcard())
+
+    def callback(
+        self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map
+    ) -> tvm.relay.Expr:
+        params = ethosu_patterns.PadParams(post.op.body)
+        params.ifm.tensor = post.args[0]
+        channels_map = {
+            "NHWC": 3,
+        }
+        w_h, w_w = (1, 1)
+        # OHWI format for the ethosu_depthwise_conv2d kernel weights
+        weight_shape = (params.ifm.shape[-1], w_h, w_w, 1)
+        weights = relay.const(np.full(weight_shape, 1), params.ifm.dtype)
+        scale_bias = vela_api.pack_biases(
+            biases=np.zeros(params.ifm.shape[-1]),
+            ifm_scale=params.ifm.q_params.scale_f32,
+            ifm_dtype=np.dtype(params.ifm.dtype),
+            weight_scales=np.array(1.0, dtype=np.float32),
+            ofm_scale=params.ofm.q_params.scale_f32,
+            is_activation_tanh_or_sigmoid=False,
+        )
+
+        return ethosu_ops.ethosu_depthwise_conv2d(
+            ifm=post.args[0],
+            weight=weights,
+            scale_bias=relay.const(scale_bias, "uint8"),
+            lut=relay.const([], "int8"),
+            ifm_scale=float(params.ifm.q_params.scale_f32),
+            ifm_zero_point=int(params.ifm.q_params.zero_point.item()),
+            weight_zero_point=0,
+            ofm_scale=float(params.ofm.q_params.scale_f32),
+            ofm_zero_point=int(params.ofm.q_params.zero_point.item()),
+            kernel_shape=(w_h, w_w),
+            ofm_channels=params.ofm.shape[channels_map[str(params.ofm.layout)]],
+            strides=(1, 1),
+            padding=params.padding,
+            dilation=(1, 1),
+            activation="NONE",
+            clip_min=0,
+            clip_max=0,
+            upscale="NONE",
+            ifm_layout=str(params.ifm.layout),
+            ofm_layout=str(params.ofm.layout),
+            ofm_dtype=str(params.ofm.dtype),
+        )
+
+
 @util.create_npu_function_pass(opt_level=1)
 class LegalizeEthosU:
     """This is the pass to call graph-rewrites to perform graph transformation
@@ -1375,6 +1431,7 @@ def transform_npu_function(self, _, func: relay.Function) -> relay.Function:
FullyConnectedRewriter(),
             MaxPoolingRewriter(),
             AvgPoolingRewriter(),
+            PadRewriter(),
             AddRewriter(),
             SubRewriter(),
             MulRewriter(),
diff --git a/python/tvm/relay/backend/contrib/ethosu/util.py b/python/tvm/relay/backend/contrib/ethosu/util.py
index de4c50e51c63..70ec1c12eb3d 100644
--- a/python/tvm/relay/backend/contrib/ethosu/util.py
+++ b/python/tvm/relay/backend/contrib/ethosu/util.py
@@ -143,6 +143,16 @@ class QDenseArgs(Enum):
     WEIGHTS_SCALE = 5


+class QPad2DArgs(Enum):
+    """
+    This is a helper enum to obtain the correct index
+    of nn.pad arguments.
+    """
+
+    IFM = 0
+    IFM_ZERO_POINT = 1
+
+
 def is_npu_func(func: relay.Function) -> bool:
     """Check if the given function is an NPU function."""
     return func.attrs and "Compiler" in func.attrs and func.attrs["Compiler"] == "ethos-u"
diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py
index c0f8e5e9708e..a86357db39fc 100644
--- a/python/tvm/relay/op/contrib/ethosu.py
+++ b/python/tvm/relay/op/contrib/ethosu.py
@@ -1772,6 +1772,86 @@ def hard_swish_pattern():
     return quantize


+class PadParams:
+    """
+    This class will parse a call to an ethos-u.pad2d composite function
+    and extract the parameter information.
+    """
+
+    composite_name = "ethos-u.pad2d"
+    # The ethos-u.pad2d composite function will be transformed to the
+    # ethosu_depthwise_conv2d operator.
+    # For ethosu_depthwise_conv2d the hardware only supports padding
+    # up to the values below, so we define these padding limits
+    padding_bounds = [31, 31, 32, 32]
+
+    def __init__(self, func_body: Call):
+        from tvm.relay.backend.contrib.ethosu.util import QPad2DArgs
+
+        # there is no 'layout' attribute in nn.pad
+        layout = "NHWC"
+        self.ifm = TensorParams(
+            tensor=func_body.args[QPad2DArgs.IFM.value],
+            layout=layout,
+            scale=tvm.relay.Constant(tvm.nd.array(np.array(1.0, dtype="float32"))),
+            zero_point=func_body.args[QPad2DArgs.IFM_ZERO_POINT.value],
+        )
+
+        self.padding = self.extract_padding(func_body)
+        self.ofm = TensorParams(
+            tensor=func_body,
+            layout=layout,
+            scale=tvm.relay.Constant(tvm.nd.array(np.array(1.0, dtype="float32"))),
+            zero_point=func_body.args[QPad2DArgs.IFM_ZERO_POINT.value],
+        )
+
+    @staticmethod
+    def extract_padding(
+        padding: relay.Call,
+    ) -> Optional[Tuple[int, int, int, int]]:
+        """
+        Here we check whether a separate padding operation can be rewritten
+        as NPU depthwise convolution. If the padding specified by the
+        separate nn.pad operation is not supported by NPU depthwise convolution,
+        None will be returned. This will cause the nn.pad not to be offloaded to the NPU.
+ """ + pad_width = padding.attrs["pad_width"] + if len(pad_width) != 4: + return None + if list(pad_width[0]) != [0, 0] or list(pad_width[3]) != [0, 0]: + return None + return [ + pad_width[1][0], + pad_width[2][0], + pad_width[1][1], + pad_width[2][1], + ] + + def is_valid(self): + """ + This function checks whether pad has compatible attributes + with the NPU depthwise convolution + """ + tensor_params = [self.ifm, self.ofm] + if not check_valid_dtypes(tensor_params, supported_dtypes=[np.uint8, np.int8]): + return False + if self.ifm.dtype != self.ofm.dtype: + return False + if not check_batch_size(self.ifm): + return False + if not self.padding or not check_padding(self.padding, self.padding_bounds): + return False + if not check_dimensions(self.ifm) or not check_dimensions(self.ofm): + return False + return True + + +def pad_pattern(): + """Create pattern for pad""" + pattern = is_op("nn.pad")(wildcard(), is_constant()) + return pattern + + @register_pattern_table("ethos-u") def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: return [ @@ -1805,6 +1885,11 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal qnn_avgpool2d_pattern(), lambda pat: AvgPool2DParams(pat).is_valid(), ), + ( + PadParams.composite_name, + pad_pattern(), + lambda pat: PadParams(pat).is_valid(), + ), ( AddParams.composite_name, qnn_add_pattern(), diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index e06e36638d7f..13b54b988963 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -258,6 +258,29 @@ def depthwise_conv2d(x): infra.compare_tvm_with_tflite(depthwise_conv2d, [ifm_shape], "ethos-u55-256") +@pytest.mark.parametrize("ifm_shape", [(1, 55, 55, 3), (1, 23, 32, 7)]) +@pytest.mark.parametrize("padding", [(0, 1, 0, 0), (1, 1, 1, 1), (1, 1, 5, 5)]) +@pytest.mark.parametrize("const_value", [0, 5, 125, -5]) +def test_tflite_separate_pad( + ifm_shape, + padding, + const_value, +): + + np.random.seed(0) + + @tf.function + def pad2d(x): + return tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + const_value, + ) + + infra.compare_tvm_with_tflite(pad2d, [ifm_shape], "ethos-u55-256") + + @pytest.mark.parametrize( "accel_type", ACCEL_TYPES, From 923a2fb06b37d30112784b0663d70ae9863e2d8f Mon Sep 17 00:00:00 2001 From: LancerComet Date: Tue, 10 Jan 2023 05:39:20 +0800 Subject: [PATCH 138/286] [Web] Try to upgrade WebGPU API usage to the latest (#13731) + Upgrade WebGPU API to latest. 
--- web/package-lock.json | 8746 ++++++++++++++++++++++++++++++++++++++++- web/package.json | 2 +- web/src/webgpu.ts | 32 +- 3 files changed, 8760 insertions(+), 20 deletions(-) diff --git a/web/package-lock.json b/web/package-lock.json index 7032e318877c..77474c8e9be1 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -1,8 +1,8746 @@ { "name": "tvmjs", "version": "0.11.0-dev0", - "lockfileVersion": 1, + "lockfileVersion": 2, "requires": true, + "packages": { + "": { + "name": "tvmjs", + "version": "0.11.0-dev0", + "license": "Apache-2.0", + "devDependencies": { + "@rollup/plugin-commonjs": "^11.1.0", + "@rollup/plugin-node-resolve": "^7.1.3", + "@types/babel__traverse": "<=7.18.2", + "@types/node": "^12.12.37", + "@typescript-eslint/eslint-plugin": "^2.29.0", + "@typescript-eslint/parser": "^2.29.0", + "@webgpu/types": "^0.1.24", + "eslint": "^6.8.0", + "jest": "^26.0.1", + "rollup": "^2.7.6", + "rollup-plugin-typescript2": "^0.27.0", + "typedoc": "^0.17.6", + "typescript": "^3.8.3", + "ws": "^7.2.5" + } + }, + "node_modules/@ampproject/remapping": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.2.0.tgz", + "integrity": "sha512-qRmjj8nj9qmLTQXXmaR1cck3UXSRMPrbsLJAasZpF+t3riI71BXed5ebIOYwQntykeZuhjsdweEc9BxH5Jc26w==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.1.0", + "@jridgewell/trace-mapping": "^0.3.9" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.18.6.tgz", + "integrity": "sha512-TDCmlK5eOvH+eH7cdAFlNXeVJqWIQ7gW9tY1GJIpUtFb6CmjVyq2VM3u71bOyR8CRihcCgMUYoDNyLXao3+70Q==", + "dev": true, + "dependencies": { + "@babel/highlight": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.20.5.tgz", + "integrity": "sha512-KZXo2t10+/jxmkhNXc7pZTqRvSOIvVv/+lJwHS+B2rErwOyjuVRh60yVpb7liQ1U5t7lLJ1bz+t8tSypUZdm0g==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.20.5.tgz", + "integrity": "sha512-UdOWmk4pNWTm/4DlPUl/Pt4Gz4rcEMb7CY0Y3eJl5Yz1vI8ZJGmHWaVE55LoxRjdpx0z259GE9U5STA9atUinQ==", + "dev": true, + "dependencies": { + "@ampproject/remapping": "^2.1.0", + "@babel/code-frame": "^7.18.6", + "@babel/generator": "^7.20.5", + "@babel/helper-compilation-targets": "^7.20.0", + "@babel/helper-module-transforms": "^7.20.2", + "@babel/helpers": "^7.20.5", + "@babel/parser": "^7.20.5", + "@babel/template": "^7.18.10", + "@babel/traverse": "^7.20.5", + "@babel/types": "^7.20.5", + "convert-source-map": "^1.7.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.1", + "semver": "^6.3.0" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/core/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/@babel/generator": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.20.5.tgz", + "integrity": 
"sha512-jl7JY2Ykn9S0yj4DQP82sYvPU+T3g0HFcWTqDLqiuA9tGRNIj9VfbtXGAYTTkyNEnQk1jkMGOdYka8aG/lulCA==", + "dev": true, + "dependencies": { + "@babel/types": "^7.20.5", + "@jridgewell/gen-mapping": "^0.3.2", + "jsesc": "^2.5.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/generator/node_modules/@jridgewell/gen-mapping": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.2.tgz", + "integrity": "sha512-mh65xKQAzI6iBcFzwv28KVWSmCkdRBWoOh+bYQGW3+6OZvbbN3TqMGo5hqYxQniRcH9F2VZIoJCm4pa3BPDK/A==", + "dev": true, + "dependencies": { + "@jridgewell/set-array": "^1.0.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.9" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/helper-compilation-targets": { + "version": "7.20.0", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.20.0.tgz", + "integrity": "sha512-0jp//vDGp9e8hZzBc6N/KwA5ZK3Wsm/pfm4CrY7vzegkVxc65SgSn6wYOnwHe9Js9HRQ1YTCKLGPzDtaS3RoLQ==", + "dev": true, + "dependencies": { + "@babel/compat-data": "^7.20.0", + "@babel/helper-validator-option": "^7.18.6", + "browserslist": "^4.21.3", + "semver": "^6.3.0" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-compilation-targets/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/@babel/helper-environment-visitor": { + "version": "7.18.9", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.18.9.tgz", + "integrity": "sha512-3r/aACDJ3fhQ/EVgFy0hpj8oHyHpQc+LPtJoY9SzTThAsStm4Ptegq92vqKoE3vD706ZVFWITnMnxucw+S9Ipg==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-function-name": { + "version": "7.19.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.19.0.tgz", + "integrity": "sha512-WAwHBINyrpqywkUH0nTnNgI5ina5TFn85HKS0pbPDfxFfhyR/aNQEn4hGi1P1JyT//I0t4OgXUlofzWILRvS5w==", + "dev": true, + "dependencies": { + "@babel/template": "^7.18.10", + "@babel/types": "^7.19.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-hoist-variables": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.18.6.tgz", + "integrity": "sha512-UlJQPkFqFULIcyW5sbzgbkxn2FKRgwWiRexcuaR8RNJRy8+LLveqPjwZV/bwrLZCN0eUHD/x8D0heK1ozuoo6Q==", + "dev": true, + "dependencies": { + "@babel/types": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.18.6.tgz", + "integrity": "sha512-0NFvs3VkuSYbFi1x2Vd6tKrywq+z/cLeYC/RJNFrIX/30Bf5aiGYbtvGXolEktzJH8o5E5KJ3tT+nkxuuZFVlA==", + "dev": true, + "dependencies": { + "@babel/types": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.20.2", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.20.2.tgz", + "integrity": 
"sha512-zvBKyJXRbmK07XhMuujYoJ48B5yvvmM6+wcpv6Ivj4Yg6qO7NOZOSnvZN9CRl1zz1Z4cKf8YejmCMh8clOoOeA==", + "dev": true, + "dependencies": { + "@babel/helper-environment-visitor": "^7.18.9", + "@babel/helper-module-imports": "^7.18.6", + "@babel/helper-simple-access": "^7.20.2", + "@babel/helper-split-export-declaration": "^7.18.6", + "@babel/helper-validator-identifier": "^7.19.1", + "@babel/template": "^7.18.10", + "@babel/traverse": "^7.20.1", + "@babel/types": "^7.20.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-plugin-utils": { + "version": "7.20.2", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.20.2.tgz", + "integrity": "sha512-8RvlJG2mj4huQ4pZ+rU9lqKi9ZKiRmuvGuM2HlWmkmgOhbs6zEAw6IEiJ5cQqGbDzGZOhwuOQNtZMi/ENLjZoQ==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-simple-access": { + "version": "7.20.2", + "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.20.2.tgz", + "integrity": "sha512-+0woI/WPq59IrqDYbVGfshjT5Dmk/nnbdpcF8SnMhhXObpTq2KNBdLFRFrkVdbDOyUmHBCxzm5FHV1rACIkIbA==", + "dev": true, + "dependencies": { + "@babel/types": "^7.20.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-split-export-declaration": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.18.6.tgz", + "integrity": "sha512-bde1etTx6ZyTmobl9LLMMQsaizFVZrquTEHOqKeQESMKo4PlObf+8+JA25ZsIpZhT/WEd39+vOdLXAFG/nELpA==", + "dev": true, + "dependencies": { + "@babel/types": "^7.18.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.19.4", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.19.4.tgz", + "integrity": "sha512-nHtDoQcuqFmwYNYPz3Rah5ph2p8PFeFCsZk9A/48dPc/rGocJ5J3hAAZ7pb76VWX3fZKu+uEr/FhH5jLx7umrw==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.19.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.19.1.tgz", + "integrity": "sha512-awrNfaMtnHUr653GgGEs++LlAvW6w+DcPrOliSMXWCKo597CwL5Acf/wWdNkf/tfEQE3mjkeD1YOVZOUV/od1w==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-option": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.18.6.tgz", + "integrity": "sha512-XO7gESt5ouv/LRJdrVjkShckw6STTaB7l9BrpBaAHDeF5YZT+01PCwmR0SJHnkW6i8OwW/EVWRShfi4j2x+KQw==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.20.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.20.6.tgz", + "integrity": "sha512-Pf/OjgfgFRW5bApskEz5pvidpim7tEDPlFtKcNRXWmfHGn9IEI2W2flqRQXTFb7gIPTyK++N6rVHuwKut4XK6w==", + "dev": true, + "dependencies": { + "@babel/template": "^7.18.10", + "@babel/traverse": "^7.20.5", + "@babel/types": "^7.20.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/highlight": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.18.6.tgz", + "integrity": "sha512-u7stbOuYjaPezCuLj29hNW1v64M2Md2qupEKP1fHc7WdOA3DgLh37suiSrZYY7haUB7iBeQZ9P1uiRF359do3g==", + "dev": true, + "dependencies": { + "@babel/helper-validator-identifier": 
"^7.18.6", + "chalk": "^2.0.0", + "js-tokens": "^4.0.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.20.5.tgz", + "integrity": "sha512-r27t/cy/m9uKLXQNWWebeCUHgnAZq0CpG1OwKRxzJMP1vpSU4bSIK2hq+/cp0bQxetkXx38n09rNu8jVkcK/zA==", + "dev": true, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-syntax-async-generators": { + "version": "7.8.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-async-generators/-/plugin-syntax-async-generators-7.8.4.tgz", + "integrity": "sha512-tycmZxkGfZaxhMRbXlPXuVFpdWlXpir2W4AMhSJgRKzk/eDlIXOhb2LHWoLpDF7TEHylV5zNhykX6KAgHJmTNw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-bigint": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-bigint/-/plugin-syntax-bigint-7.8.3.tgz", + "integrity": "sha512-wnTnFlG+YxQm3vDxpGE57Pj0srRU4sHE/mDkt1qv2YJJSeUAec2ma4WLUnUPeKjyrfntVwe/N6dCXpU+zL3Npg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-class-properties": { + "version": "7.12.13", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-class-properties/-/plugin-syntax-class-properties-7.12.13.tgz", + "integrity": "sha512-fm4idjKla0YahUNgFNLCB0qySdsoPiZP3iQE3rky0mBUtMZ23yDJ9SJdg6dXTSDnulOVqiF3Hgr9nbXvXTQZYA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.12.13" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-import-meta": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-meta/-/plugin-syntax-import-meta-7.10.4.tgz", + "integrity": "sha512-Yqfm+XDx0+Prh3VSeEQCPU81yC+JWZ2pDPFSS4ZdpfZhp4MkFMaDC1UqseovEKwSUpnIL7+vK+Clp7bfh0iD7g==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-json-strings": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-json-strings/-/plugin-syntax-json-strings-7.8.3.tgz", + "integrity": "sha512-lY6kdGpWHvjoe2vk4WrAapEuBR69EMxZl+RoGRhrFGNYVK8mOPAW8VfbT/ZgrFbXlDNiiaxQnAtgVCZ6jv30EA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-logical-assignment-operators": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-logical-assignment-operators/-/plugin-syntax-logical-assignment-operators-7.10.4.tgz", + "integrity": "sha512-d8waShlpFDinQ5MtvGU9xDAOzKH47+FFoney2baFIoMr952hKOLp1HR7VszoZvOsV/4+RRszNY7D17ba0te0ig==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-nullish-coalescing-operator": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-nullish-coalescing-operator/-/plugin-syntax-nullish-coalescing-operator-7.8.3.tgz", + "integrity": 
"sha512-aSff4zPII1u2QD7y+F8oDsz19ew4IGEJg9SVW+bqwpwtfFleiQDMdzA/R+UlWDzfnHFCxxleFT0PMIrR36XLNQ==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-numeric-separator": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-numeric-separator/-/plugin-syntax-numeric-separator-7.10.4.tgz", + "integrity": "sha512-9H6YdfkcK/uOnY/K7/aA2xpzaAgkQn37yzWUMRK7OaPOqOpGS1+n0H5hxT9AUw9EsSjPW8SVyMJwYRtWs3X3ug==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.10.4" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-object-rest-spread": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-object-rest-spread/-/plugin-syntax-object-rest-spread-7.8.3.tgz", + "integrity": "sha512-XoqMijGZb9y3y2XskN+P1wUGiVwWZ5JmoDRwx5+3GmEplNyVM2s2Dg8ILFQm8rWM48orGy5YpI5Bl8U1y7ydlA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-optional-catch-binding": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-catch-binding/-/plugin-syntax-optional-catch-binding-7.8.3.tgz", + "integrity": "sha512-6VPD0Pc1lpTqw0aKoeRTMiB+kWhAoT24PA+ksWSBrFtl5SIRVpZlwN3NNPQjehA2E/91FV3RjLWoVTglWcSV3Q==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-optional-chaining": { + "version": "7.8.3", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-optional-chaining/-/plugin-syntax-optional-chaining-7.8.3.tgz", + "integrity": "sha512-KoK9ErH1MBlCPxV0VANkXW2/dw4vlbGDrFgz8bmUsBGYkFRcbRwMh6cIJubdPrkxRwuGdtCk0v/wPTKbQgBjkg==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.8.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-syntax-top-level-await": { + "version": "7.14.5", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-top-level-await/-/plugin-syntax-top-level-await-7.14.5.tgz", + "integrity": "sha512-hx++upLv5U1rgYfwe1xBQUhRmU41NEvpUvrp8jkrSCdvGSnM5/qdRMtylJ6PG5OFkBaHkbTAKTnd3/YyESRHFw==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.14.5" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/template": { + "version": "7.18.10", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.18.10.tgz", + "integrity": "sha512-TI+rCtooWHr3QJ27kJxfjutghu44DLnasDMwpDqCXVTal9RLp3RSYNh4NdBrRP2cQAoG9A8juOQl6P6oZG4JxA==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.18.6", + "@babel/parser": "^7.18.10", + "@babel/types": "^7.18.10" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.20.5.tgz", + "integrity": "sha512-WM5ZNN3JITQIq9tFZaw1ojLU3WgWdtkxnhM1AegMS+PvHjkM5IXjmYEGY7yukz5XS4sJyEf2VzWjI8uAavhxBQ==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.18.6", + "@babel/generator": "^7.20.5", + "@babel/helper-environment-visitor": "^7.18.9", + "@babel/helper-function-name": "^7.19.0", + "@babel/helper-hoist-variables": "^7.18.6", + 
"@babel/helper-split-export-declaration": "^7.18.6", + "@babel/parser": "^7.20.5", + "@babel/types": "^7.20.5", + "debug": "^4.1.0", + "globals": "^11.1.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse/node_modules/globals": { + "version": "11.12.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-11.12.0.tgz", + "integrity": "sha512-WOBp/EEGUiIsJSp7wcv/y6MO+lV9UoncWqxuFfm8eBwzWNgyfBd6Gz+IeKQ9jCmyhoH99g15M3T+QaVHFjizVA==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/@babel/types": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.20.5.tgz", + "integrity": "sha512-c9fst/h2/dcF7H+MJKZ2T0KjEQ8hY/BNnDk/H3XY8C4Aw/eWQXWn/lWntHF9ooUBnGmEvbfGrTgLWc+um0YDUg==", + "dev": true, + "dependencies": { + "@babel/helper-string-parser": "^7.19.4", + "@babel/helper-validator-identifier": "^7.19.1", + "to-fast-properties": "^2.0.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@bcoe/v8-coverage": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz", + "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", + "dev": true + }, + "node_modules/@cnakazawa/watch": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@cnakazawa/watch/-/watch-1.0.4.tgz", + "integrity": "sha512-v9kIhKwjeZThiWrLmj0y17CWoyddASLj9O2yvbZkbvw/N3rWOYy9zkV66ursAoVr0mV15bL8g0c4QZUE6cdDoQ==", + "dev": true, + "dependencies": { + "exec-sh": "^0.3.2", + "minimist": "^1.2.0" + }, + "bin": { + "watch": "cli.js" + }, + "engines": { + "node": ">=0.1.95" + } + }, + "node_modules/@istanbuljs/load-nyc-config": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", + "integrity": "sha512-VjeHSlIzpv/NyD3N0YuHfXOPDIixcA1q2ZV98wsMqcYlPmv2n3Yb2lYP9XMElnaFVXg5A7YLTeLu6V84uQDjmQ==", + "dev": true, + "dependencies": { + "camelcase": "^5.3.1", + "find-up": "^4.1.0", + "get-package-type": "^0.1.0", + "js-yaml": "^3.13.1", + "resolve-from": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/load-nyc-config/node_modules/resolve-from": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", + "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@istanbuljs/schema": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@istanbuljs/schema/-/schema-0.1.3.tgz", + "integrity": "sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/console": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/console/-/console-26.6.2.tgz", + "integrity": "sha512-IY1R2i2aLsLr7Id3S6p2BA82GNWryt4oSvEXLAKc+L2zdi89dSkE8xC1C+0kpATG4JhBJREnQOH7/zmccM2B0g==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "@types/node": "*", + "chalk": "^4.0.0", + "jest-message-util": "^26.6.2", + "jest-util": "^26.6.2", + "slash": "^3.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/console/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": 
"sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@jest/console/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/@jest/console/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/@jest/console/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/@jest/console/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/console/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/core": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/@jest/core/-/core-26.6.3.tgz", + "integrity": "sha512-xvV1kKbhfUqFVuZ8Cyo+JPpipAHHAV3kcDBftiduK8EICXmTFddryy3P7NfZt8Pv37rA9nEJBKCCkglCPt/Xjw==", + "dev": true, + "dependencies": { + "@jest/console": "^26.6.2", + "@jest/reporters": "^26.6.2", + "@jest/test-result": "^26.6.2", + "@jest/transform": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "ansi-escapes": "^4.2.1", + "chalk": "^4.0.0", + "exit": "^0.1.2", + "graceful-fs": "^4.2.4", + "jest-changed-files": "^26.6.2", + "jest-config": "^26.6.3", + "jest-haste-map": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-regex-util": "^26.0.0", + "jest-resolve": "^26.6.2", + "jest-resolve-dependencies": "^26.6.3", + "jest-runner": "^26.6.3", + "jest-runtime": "^26.6.3", + "jest-snapshot": "^26.6.2", + "jest-util": "^26.6.2", + "jest-validate": "^26.6.2", + "jest-watcher": "^26.6.2", + "micromatch": "^4.0.2", + "p-each-series": "^2.1.0", + "rimraf": "^3.0.0", + "slash": "^3.0.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/core/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, 
+ "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@jest/core/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/@jest/core/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/@jest/core/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/@jest/core/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/core/node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "dev": true, + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/@jest/core/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/core/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/environment": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/environment/-/environment-26.6.2.tgz", + "integrity": "sha512-nFy+fHl28zUrRsCeMB61VDThV1pVTtlEokBRgqPrcT1JNq4yRNIyTHfyht6PqtUvY9IsuLGTrbG8kPXjSZIZwA==", + "dev": true, + "dependencies": { + "@jest/fake-timers": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "jest-mock": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/fake-timers": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/fake-timers/-/fake-timers-26.6.2.tgz", + "integrity": "sha512-14Uleatt7jdzefLPYM3KLcnUl1ZNikaKq34enpb5XG9i81JpppDb5muZvonvKyrl7ftEHkKS5L5/eB/kxJ+bvA==", + "dev": true, + 
"dependencies": { + "@jest/types": "^26.6.2", + "@sinonjs/fake-timers": "^6.0.1", + "@types/node": "*", + "jest-message-util": "^26.6.2", + "jest-mock": "^26.6.2", + "jest-util": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/globals": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/globals/-/globals-26.6.2.tgz", + "integrity": "sha512-85Ltnm7HlB/KesBUuALwQ68YTU72w9H2xW9FjZ1eL1U3lhtefjjl5c2MiUbpXt/i6LaPRvoOFJ22yCBSfQ0JIA==", + "dev": true, + "dependencies": { + "@jest/environment": "^26.6.2", + "@jest/types": "^26.6.2", + "expect": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/reporters": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/reporters/-/reporters-26.6.2.tgz", + "integrity": "sha512-h2bW53APG4HvkOnVMo8q3QXa6pcaNt1HkwVsOPMBV6LD/q9oSpxNSYZQYkAnjdMjrJ86UuYeLo+aEZClV6opnw==", + "dev": true, + "dependencies": { + "@bcoe/v8-coverage": "^0.2.3", + "@jest/console": "^26.6.2", + "@jest/test-result": "^26.6.2", + "@jest/transform": "^26.6.2", + "@jest/types": "^26.6.2", + "chalk": "^4.0.0", + "collect-v8-coverage": "^1.0.0", + "exit": "^0.1.2", + "glob": "^7.1.2", + "graceful-fs": "^4.2.4", + "istanbul-lib-coverage": "^3.0.0", + "istanbul-lib-instrument": "^4.0.3", + "istanbul-lib-report": "^3.0.0", + "istanbul-lib-source-maps": "^4.0.0", + "istanbul-reports": "^3.0.2", + "jest-haste-map": "^26.6.2", + "jest-resolve": "^26.6.2", + "jest-util": "^26.6.2", + "jest-worker": "^26.6.2", + "slash": "^3.0.0", + "source-map": "^0.6.0", + "string-length": "^4.0.1", + "terminal-link": "^2.0.0", + "v8-to-istanbul": "^7.0.0" + }, + "engines": { + "node": ">= 10.14.2" + }, + "optionalDependencies": { + "node-notifier": "^8.0.0" + } + }, + "node_modules/@jest/reporters/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@jest/reporters/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/@jest/reporters/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/@jest/reporters/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/@jest/reporters/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + 
"integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/reporters/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/source-map": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/source-map/-/source-map-26.6.2.tgz", + "integrity": "sha512-YwYcCwAnNmOVsZ8mr3GfnzdXDAl4LaenZP5z+G0c8bzC9/dugL8zRmxZzdoTl4IaS3CryS1uWnROLPFmb6lVvA==", + "dev": true, + "dependencies": { + "callsites": "^3.0.0", + "graceful-fs": "^4.2.4", + "source-map": "^0.6.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/test-result": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/test-result/-/test-result-26.6.2.tgz", + "integrity": "sha512-5O7H5c/7YlojphYNrK02LlDIV2GNPYisKwHm2QTKjNZeEzezCbwYs9swJySv2UfPMyZ0VdsmMv7jIlD/IKYQpQ==", + "dev": true, + "dependencies": { + "@jest/console": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/istanbul-lib-coverage": "^2.0.0", + "collect-v8-coverage": "^1.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/test-sequencer": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/@jest/test-sequencer/-/test-sequencer-26.6.3.tgz", + "integrity": "sha512-YHlVIjP5nfEyjlrSr8t/YdNfU/1XEt7c5b4OxcXCjyRhjzLYu/rO69/WHPuYcbCWkz8kAeZVZp2N2+IOLLEPGw==", + "dev": true, + "dependencies": { + "@jest/test-result": "^26.6.2", + "graceful-fs": "^4.2.4", + "jest-haste-map": "^26.6.2", + "jest-runner": "^26.6.3", + "jest-runtime": "^26.6.3" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/transform": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/transform/-/transform-26.6.2.tgz", + "integrity": "sha512-E9JjhUgNzvuQ+vVAL21vlyfy12gP0GhazGgJC4h6qUt1jSdUXGWJ1wfu/X7Sd8etSgxV4ovT1pb9v5D6QW4XgA==", + "dev": true, + "dependencies": { + "@babel/core": "^7.1.0", + "@jest/types": "^26.6.2", + "babel-plugin-istanbul": "^6.0.0", + "chalk": "^4.0.0", + "convert-source-map": "^1.4.0", + "fast-json-stable-stringify": "^2.0.0", + "graceful-fs": "^4.2.4", + "jest-haste-map": "^26.6.2", + "jest-regex-util": "^26.0.0", + "jest-util": "^26.6.2", + "micromatch": "^4.0.2", + "pirates": "^4.0.1", + "slash": "^3.0.0", + "source-map": "^0.6.1", + "write-file-atomic": "^3.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/transform/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@jest/transform/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": 
"^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/@jest/transform/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/@jest/transform/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/@jest/transform/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/transform/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/types": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/@jest/types/-/types-26.6.2.tgz", + "integrity": "sha512-fC6QCp7Sc5sX6g8Tvbmj4XUTbyrik0akgRy03yjXbQaBWWNWGE7SGtJk98m0N8nzegD/7SggrUlivxo5ax4KWQ==", + "dev": true, + "dependencies": { + "@types/istanbul-lib-coverage": "^2.0.0", + "@types/istanbul-reports": "^3.0.0", + "@types/node": "*", + "@types/yargs": "^15.0.0", + "chalk": "^4.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/@jest/types/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@jest/types/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/@jest/types/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/@jest/types/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": 
"sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/@jest/types/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jest/types/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.1.1.tgz", + "integrity": "sha512-sQXCasFk+U8lWYEe66WxRDOE9PjVz4vSM51fTu3Hw+ClTpUSQb718772vH3pyS5pShp6lvQM7SxgIDXXXmOX7w==", + "dev": true, + "dependencies": { + "@jridgewell/set-array": "^1.0.0", + "@jridgewell/sourcemap-codec": "^1.4.10" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.0.tgz", + "integrity": "sha512-F2msla3tad+Mfht5cJq7LSXcdudKTWCVYUgw6pLFOOHSTtZlj6SWNYAp+AhuqLmWdBO2X5hPrLcu8cVP8fy28w==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/set-array": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.1.2.tgz", + "integrity": "sha512-xnkseuNADM0gt2bs+BvhO0p78Mk762YnZdsuzFV018NoG1Sj1SCQvpSqa7XUaTam5vAGasABV9qXASMKnFMwMw==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.4.14", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.14.tgz", + "integrity": "sha512-XPSJHWmi394fuUuzDnGz1wiKqWfo1yXecHQMRf2l6hztTO+nPru658AyDngaBe7isIxEkRsPR3FZh+s7iVa4Uw==", + "dev": true + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.17", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.17.tgz", + "integrity": "sha512-MCNzAp77qzKca9+W/+I0+sEpaUnZoeasnghNeVc41VZCEKaCH73Vq3BZZ/SzWIgrqE4H4ceI+p+b6C0mHf9T4g==", + "dev": true, + "dependencies": { + "@jridgewell/resolve-uri": "3.1.0", + "@jridgewell/sourcemap-codec": "1.4.14" + } + }, + "node_modules/@rollup/plugin-commonjs": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/@rollup/plugin-commonjs/-/plugin-commonjs-11.1.0.tgz", + "integrity": "sha512-Ycr12N3ZPN96Fw2STurD21jMqzKwL9QuFhms3SD7KKRK7oaXUsBU9Zt0jL/rOPHiPYisI21/rXGO3jr9BnLHUA==", + "dev": true, + "dependencies": { + "@rollup/pluginutils": "^3.0.8", + "commondir": "^1.0.1", + "estree-walker": "^1.0.1", + "glob": "^7.1.2", + "is-reference": "^1.1.2", + "magic-string": "^0.25.2", + "resolve": "^1.11.0" + }, + "engines": { + "node": ">= 8.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0" + } + }, + "node_modules/@rollup/plugin-node-resolve": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/@rollup/plugin-node-resolve/-/plugin-node-resolve-7.1.3.tgz", + "integrity": "sha512-RxtSL3XmdTAE2byxekYLnx+98kEUOrPHF/KRVjLH+DEIHy6kjIw7YINQzn+NXiH/NTrQLAwYs0GWB+csWygA9Q==", + "dev": true, + "dependencies": 
{ + "@rollup/pluginutils": "^3.0.8", + "@types/resolve": "0.0.8", + "builtin-modules": "^3.1.0", + "is-module": "^1.0.0", + "resolve": "^1.14.2" + }, + "engines": { + "node": ">= 8.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0" + } + }, + "node_modules/@rollup/pluginutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-3.1.0.tgz", + "integrity": "sha512-GksZ6pr6TpIjHm8h9lSQ8pi8BE9VeubNT0OMJ3B5uZJ8pz73NPiqOtCog/x2/QzM1ENChPKxMDhiQuRHsqc+lg==", + "dev": true, + "dependencies": { + "@types/estree": "0.0.39", + "estree-walker": "^1.0.1", + "picomatch": "^2.2.2" + }, + "engines": { + "node": ">= 8.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0" + } + }, + "node_modules/@sinonjs/commons": { + "version": "1.8.6", + "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-1.8.6.tgz", + "integrity": "sha512-Ky+XkAkqPZSm3NLBeUng77EBQl3cmeJhITaGHdYH8kjVB+aun3S4XBRti2zt17mtt0mIUDiNxYeoJm6drVvBJQ==", + "dev": true, + "dependencies": { + "type-detect": "4.0.8" + } + }, + "node_modules/@sinonjs/fake-timers": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/@sinonjs/fake-timers/-/fake-timers-6.0.1.tgz", + "integrity": "sha512-MZPUxrmFubI36XS1DI3qmI0YdN1gks62JtFZvxR67ljjSNCeK6U08Zx4msEWOXuofgqUt6zPHSi1H9fbjR/NRA==", + "dev": true, + "dependencies": { + "@sinonjs/commons": "^1.7.0" + } + }, + "node_modules/@tootallnate/once": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-1.1.2.tgz", + "integrity": "sha512-RbzJvlNzmRq5c3O09UipeuXno4tA1FE6ikOjxZK0tuxVv3412l64l5t1W5pj4+rJq9vpkm/kwiR07aZXnsKPxw==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, + "node_modules/@types/babel__core": { + "version": "7.1.20", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.1.20.tgz", + "integrity": "sha512-PVb6Bg2QuscZ30FvOU7z4guG6c926D9YRvOxEaelzndpMsvP+YM74Q/dAFASpg2l6+XLalxSGxcq/lrgYWZtyQ==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.6.4", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.6.4.tgz", + "integrity": "sha512-tFkciB9j2K755yrTALxD44McOrk+gfpIpvC3sxHjRawj6PfnQxrse4Clq5y/Rq+G3mrBurMax/lG8Qn2t9mSsg==", + "dev": true, + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.1.tgz", + "integrity": "sha512-azBFKemX6kMg5Io+/rdGT0dkGreboUVR0Cdm3fz9QJWpaQGJRQXl7C+6hOTCZcMll7KFyEQpgbYI2lHdsS4U7g==", + "dev": true, + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__traverse": { + "version": "7.18.2", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.18.2.tgz", + "integrity": "sha512-FcFaxOr2V5KZCviw1TnutEMVUVsGt4D2hP1TAfXZAMKuHYW3xQhe3jTxNPWutgCJ3/X1c5yX8ZoGVEItxKbwBg==", + "dev": true, + "dependencies": { + "@babel/types": "^7.3.0" + } + }, + "node_modules/@types/eslint-visitor-keys": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@types/eslint-visitor-keys/-/eslint-visitor-keys-1.0.0.tgz", + "integrity": "sha512-OCutwjDZ4aFS6PB1UZ988C4YgwlBHJd6wCeQqaLdmadZ/7e+w79+hbMUFC1QXDNCmdyoRfAFdm0RypzwR+Qpag==", + "dev": true + }, 
+ "node_modules/@types/estree": { + "version": "0.0.39", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-0.0.39.tgz", + "integrity": "sha512-EYNwp3bU+98cpU4lAWYYL7Zz+2gryWH1qbdDTidVd6hkiR6weksdbMadyXKXNPEkQFhXM+hVO9ZygomHXp+AIw==", + "dev": true + }, + "node_modules/@types/graceful-fs": { + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.5.tgz", + "integrity": "sha512-anKkLmZZ+xm4p8JWBf4hElkM4XR+EZeA2M9BAkkTldmcyDY4mbdIJnRghDJH3Ov5ooY7/UAoENtmdMSkaAd7Cw==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/istanbul-lib-coverage": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.4.tgz", + "integrity": "sha512-z/QT1XN4K4KYuslS23k62yDIDLwLFkzxOuMplDtObz0+y7VqJCaO2o+SPwHCvLFZh7xazvvoor2tA/hPz9ee7g==", + "dev": true + }, + "node_modules/@types/istanbul-lib-report": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@types/istanbul-lib-report/-/istanbul-lib-report-3.0.0.tgz", + "integrity": "sha512-plGgXAPfVKFoYfa9NpYDAkseG+g6Jr294RqeqcqDixSbU34MZVJRi/P+7Y8GDpzkEwLaGZZOpKIEmeVZNtKsrg==", + "dev": true, + "dependencies": { + "@types/istanbul-lib-coverage": "*" + } + }, + "node_modules/@types/istanbul-reports": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/@types/istanbul-reports/-/istanbul-reports-3.0.1.tgz", + "integrity": "sha512-c3mAZEuK0lvBp8tmuL74XRKn1+y2dcwOUpH7x4WrF6gk1GIgiluDRgMYQtw2OFcBvAJWlt6ASU3tSqxp0Uu0Aw==", + "dev": true, + "dependencies": { + "@types/istanbul-lib-report": "*" + } + }, + "node_modules/@types/json-schema": { + "version": "7.0.11", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.11.tgz", + "integrity": "sha512-wOuvG1SN4Us4rez+tylwwwCV1psiNVOkJeM3AUWUNWg/jDQY2+HE/444y5gc+jBmRqASOm2Oeh5c1axHobwRKQ==", + "dev": true + }, + "node_modules/@types/node": { + "version": "12.20.55", + "resolved": "https://registry.npmjs.org/@types/node/-/node-12.20.55.tgz", + "integrity": "sha512-J8xLz7q2OFulZ2cyGTLE1TbbZcjpno7FaN6zdJNrgAdrJ+DZzh/uFR6YrTb4C+nXakvud8Q4+rbhoIWlYQbUFQ==", + "dev": true + }, + "node_modules/@types/normalize-package-data": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/@types/normalize-package-data/-/normalize-package-data-2.4.1.tgz", + "integrity": "sha512-Gj7cI7z+98M282Tqmp2K5EIsoouUEzbBJhQQzDE3jSIRk6r9gsz0oUokqIUR4u1R3dMHo0pDHM7sNOHyhulypw==", + "dev": true + }, + "node_modules/@types/prettier": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@types/prettier/-/prettier-2.7.1.tgz", + "integrity": "sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow==", + "dev": true + }, + "node_modules/@types/resolve": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/@types/resolve/-/resolve-0.0.8.tgz", + "integrity": "sha512-auApPaJf3NPfe18hSoJkp8EbZzer2ISk7o8mCC3M9he/a04+gbMF97NkpD2S8riMGvm4BMRI59/SZQSaLTKpsQ==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/stack-utils": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.1.tgz", + "integrity": "sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==", + "dev": true + }, + "node_modules/@types/yargs": { + "version": "15.0.14", + "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-15.0.14.tgz", + "integrity": 
"sha512-yEJzHoxf6SyQGhBhIYGXQDSCkJjB6HohDShto7m8vaKg9Yp0Yn8+71J9eakh2bnPg6BfsH9PRMhiRTZnd4eXGQ==", + "dev": true, + "dependencies": { + "@types/yargs-parser": "*" + } + }, + "node_modules/@types/yargs-parser": { + "version": "21.0.0", + "resolved": "https://registry.npmjs.org/@types/yargs-parser/-/yargs-parser-21.0.0.tgz", + "integrity": "sha512-iO9ZQHkZxHn4mSakYV0vFHAVDyEOIJQrV2uZ06HxEPcx+mt8swXoZHIbaaJ2crJYFfErySgktuTZ3BeLz+XmFA==", + "dev": true + }, + "node_modules/@typescript-eslint/eslint-plugin": { + "version": "2.34.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-2.34.0.tgz", + "integrity": "sha512-4zY3Z88rEE99+CNvTbXSyovv2z9PNOVffTWD2W8QF5s2prBQtwN2zadqERcrHpcR7O/+KMI3fcTAmUUhK/iQcQ==", + "dev": true, + "dependencies": { + "@typescript-eslint/experimental-utils": "2.34.0", + "functional-red-black-tree": "^1.0.1", + "regexpp": "^3.0.0", + "tsutils": "^3.17.1" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "@typescript-eslint/parser": "^2.0.0", + "eslint": "^5.0.0 || ^6.0.0" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@typescript-eslint/experimental-utils": { + "version": "2.34.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-2.34.0.tgz", + "integrity": "sha512-eS6FTkq+wuMJ+sgtuNTtcqavWXqsflWcfBnlYhg/nS4aZ1leewkXGbvBhaapn1q6qf4M71bsR1tez5JTRMuqwA==", + "dev": true, + "dependencies": { + "@types/json-schema": "^7.0.3", + "@typescript-eslint/typescript-estree": "2.34.0", + "eslint-scope": "^5.0.0", + "eslint-utils": "^2.0.0" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "*" + } + }, + "node_modules/@typescript-eslint/parser": { + "version": "2.34.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-2.34.0.tgz", + "integrity": "sha512-03ilO0ucSD0EPTw2X4PntSIRFtDPWjrVq7C3/Z3VQHRC7+13YB55rcJI3Jt+YgeHbjUdJPcPa7b23rXCBokuyA==", + "dev": true, + "dependencies": { + "@types/eslint-visitor-keys": "^1.0.0", + "@typescript-eslint/experimental-utils": "2.34.0", + "@typescript-eslint/typescript-estree": "2.34.0", + "eslint-visitor-keys": "^1.1.0" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^5.0.0 || ^6.0.0" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@typescript-eslint/typescript-estree": { + "version": "2.34.0", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-2.34.0.tgz", + "integrity": "sha512-OMAr+nJWKdlVM9LOqCqh3pQQPwxHAN7Du8DR6dmwCrAmxtiXQnhHJ6tBNtf+cggqfo51SG/FCwnKhXCIM7hnVg==", + "dev": true, + "dependencies": { + "debug": "^4.1.1", + "eslint-visitor-keys": "^1.1.0", + "glob": "^7.1.6", + "is-glob": "^4.0.1", + "lodash": "^4.17.15", + "semver": "^7.3.2", + "tsutils": "^3.17.1" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + 
"node_modules/@webgpu/types": { + "version": "0.1.24", + "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.24.tgz", + "integrity": "sha512-Mkz+SVJwHApTg6nCzqIuHDt3HsGRcCvHJNkWT2PgZTTC2Gy+LXvN4+7x6YvduAcx3F/pEDWW5OfAHs6VSo6J4Q==", + "dev": true + }, + "node_modules/abab": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz", + "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==", + "dev": true + }, + "node_modules/acorn": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", + "dev": true, + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/acorn-globals": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/acorn-globals/-/acorn-globals-6.0.0.tgz", + "integrity": "sha512-ZQl7LOWaF5ePqqcX4hLuv/bLXYQNfNWw2c0/yX/TsPRKamzHcTGQnlCjHT3TsmkOUVEPS3crCxiPfdzE/Trlhg==", + "dev": true, + "dependencies": { + "acorn": "^7.1.1", + "acorn-walk": "^7.1.1" + } + }, + "node_modules/acorn-jsx": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", + "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", + "dev": true, + "peerDependencies": { + "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" + } + }, + "node_modules/acorn-walk": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-7.2.0.tgz", + "integrity": "sha512-OPdCF6GsMIP+Az+aWfAAOEt2/+iVDKE7oy6lJ098aoe59oAmK76qV6Gw60SbZ8jHuG2wH058GF4pLFbYamYrVA==", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/agent-base": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", + "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==", + "dev": true, + "dependencies": { + "debug": "4" + }, + "engines": { + "node": ">= 6.0.0" + } + }, + "node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/ansi-escapes": { + "version": "4.3.2", + "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", + "integrity": "sha512-gKXj5ALrKWQLsYG9jlTRmR/xKluxHV+Z9QEwNIgCfM1/uwPMCuzVVnh5mwTd+OuBZcwSIMbqssNWRm1lE51QaQ==", + "dev": true, + "dependencies": { + "type-fest": "^0.21.3" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ansi-escapes/node_modules/type-fest": { + "version": "0.21.3", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.21.3.tgz", + "integrity": "sha512-t0rzBq87m3fVcduHDUFhKmyyX+9eo6WQjZvf51Ea/M0Q7+T374Jp1aUiyUl0GKxp8M/OETVHSDvmkyPgvX+X2w==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": 
"https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "dependencies": { + "color-convert": "^1.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "dev": true, + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "dev": true, + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/arr-diff": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", + "integrity": "sha512-YVIQ82gZPGBebQV/a8dar4AitzCQs0jjXwMPZllpXMaGjXPYVUawSxQrRsjhjupyVxEvbHgUmIhKVlND+j02kA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/arr-flatten": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/arr-flatten/-/arr-flatten-1.1.0.tgz", + "integrity": "sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/array-unique": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", + "integrity": "sha512-SleRWjh9JUud2wH1hPs9rZBZ33H6T9HOiL0uwGnGx9FpE6wKGyfWugmbkEOIs6qWrZhg0LWeLziLrEwQJhs5mQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/assign-symbols": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assign-symbols/-/assign-symbols-1.0.0.tgz", + "integrity": "sha512-Q+JC7Whu8HhmTdBph/Tq59IoRtoy6KAm5zzPv00WdujX82lbAL8K7WVjne7vdCsAmbF4AYaDOPyO3k0kl8qIrw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/astral-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/astral-regex/-/astral-regex-1.0.0.tgz", + "integrity": "sha512-+Ryf6g3BKoRc7jfp7ad8tM4TtMiaWvbF/1/sQcZPkkS7ag3D5nMBCe2UfOTONtAkaG0tO0ij3C5Lwmf1EiyjHg==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "dev": true + }, + "node_modules/atob": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/atob/-/atob-2.1.2.tgz", + "integrity": 
"sha512-Wm6ukoaOGJi/73p/cl2GvLjTI5JM1k/O14isD73YML8StrH/7/lRFgmg8nICZgD3bZZvjwCGxtMOD3wWNAu8cg==", + "dev": true, + "bin": { + "atob": "bin/atob.js" + }, + "engines": { + "node": ">= 4.5.0" + } + }, + "node_modules/babel-jest": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/babel-jest/-/babel-jest-26.6.3.tgz", + "integrity": "sha512-pl4Q+GAVOHwvjrck6jKjvmGhnO3jHX/xuB9d27f+EJZ/6k+6nMuPjorrYp7s++bKKdANwzElBWnLWaObvTnaZA==", + "dev": true, + "dependencies": { + "@jest/transform": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/babel__core": "^7.1.7", + "babel-plugin-istanbul": "^6.0.0", + "babel-preset-jest": "^26.6.2", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.4", + "slash": "^3.0.0" + }, + "engines": { + "node": ">= 10.14.2" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/babel-jest/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/babel-jest/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/babel-jest/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/babel-jest/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/babel-jest/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-jest/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-plugin-istanbul": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/babel-plugin-istanbul/-/babel-plugin-istanbul-6.1.1.tgz", + "integrity": "sha512-Y1IQok9821cC9onCx5otgFfRm7Lm+I+wwxOx738M/WLPZ9Q42m4IG5W0FNX8WLL2gYMZo3JkuXIH2DOpWM+qwA==", + "dev": true, + "dependencies": { + "@babel/helper-plugin-utils": "^7.0.0", + "@istanbuljs/load-nyc-config": "^1.0.0", + "@istanbuljs/schema": "^0.1.2", + 
"istanbul-lib-instrument": "^5.0.4", + "test-exclude": "^6.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-plugin-istanbul/node_modules/istanbul-lib-instrument": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-5.2.1.tgz", + "integrity": "sha512-pzqtp31nLv/XFOzXGuvhCb8qhjmTVo5vjVk19XE4CRlSWz0KoeJ3bw9XsA7nOp9YBf4qHjwBxkDzKcME/J29Yg==", + "dev": true, + "dependencies": { + "@babel/core": "^7.12.3", + "@babel/parser": "^7.14.7", + "@istanbuljs/schema": "^0.1.2", + "istanbul-lib-coverage": "^3.2.0", + "semver": "^6.3.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/babel-plugin-istanbul/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/babel-plugin-jest-hoist": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/babel-plugin-jest-hoist/-/babel-plugin-jest-hoist-26.6.2.tgz", + "integrity": "sha512-PO9t0697lNTmcEHH69mdtYiOIkkOlj9fySqfO3K1eCcdISevLAE0xY59VLLUj0SoiPiTX/JU2CYFpILydUa5Lw==", + "dev": true, + "dependencies": { + "@babel/template": "^7.3.3", + "@babel/types": "^7.3.3", + "@types/babel__core": "^7.0.0", + "@types/babel__traverse": "^7.0.6" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/babel-preset-current-node-syntax": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/babel-preset-current-node-syntax/-/babel-preset-current-node-syntax-1.0.1.tgz", + "integrity": "sha512-M7LQ0bxarkxQoN+vz5aJPsLBn77n8QgTFmo8WK0/44auK2xlCXrYcUxHFxgU7qW5Yzw/CjmLRK2uJzaCd7LvqQ==", + "dev": true, + "dependencies": { + "@babel/plugin-syntax-async-generators": "^7.8.4", + "@babel/plugin-syntax-bigint": "^7.8.3", + "@babel/plugin-syntax-class-properties": "^7.8.3", + "@babel/plugin-syntax-import-meta": "^7.8.3", + "@babel/plugin-syntax-json-strings": "^7.8.3", + "@babel/plugin-syntax-logical-assignment-operators": "^7.8.3", + "@babel/plugin-syntax-nullish-coalescing-operator": "^7.8.3", + "@babel/plugin-syntax-numeric-separator": "^7.8.3", + "@babel/plugin-syntax-object-rest-spread": "^7.8.3", + "@babel/plugin-syntax-optional-catch-binding": "^7.8.3", + "@babel/plugin-syntax-optional-chaining": "^7.8.3", + "@babel/plugin-syntax-top-level-await": "^7.8.3" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/babel-preset-jest": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/babel-preset-jest/-/babel-preset-jest-26.6.2.tgz", + "integrity": "sha512-YvdtlVm9t3k777c5NPQIv6cxFFFapys25HiUmuSgHwIZhfifweR5c5Sf5nwE3MAbfu327CYSvps8Yx6ANLyleQ==", + "dev": true, + "dependencies": { + "babel-plugin-jest-hoist": "^26.6.2", + "babel-preset-current-node-syntax": "^1.0.0" + }, + "engines": { + "node": ">= 10.14.2" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true + }, + "node_modules/base": { + "version": "0.11.2", + "resolved": "https://registry.npmjs.org/base/-/base-0.11.2.tgz", + "integrity": "sha512-5T6P4xPgpp0YDFvSWwEZ4NoE3aM4QBQXDzmVbraCkFj8zHM+mba8SyqB5DbZWyR7mYHo6Y7BdQo3MoA4m0TeQg==", + 
"dev": true, + "dependencies": { + "cache-base": "^1.0.1", + "class-utils": "^0.3.5", + "component-emitter": "^1.2.1", + "define-property": "^1.0.0", + "isobject": "^3.0.1", + "mixin-deep": "^1.2.0", + "pascalcase": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/base/node_modules/define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==", + "dev": true, + "dependencies": { + "is-descriptor": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/base/node_modules/is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/base/node_modules/is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/base/node_modules/is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "dependencies": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/braces": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "dev": true, + "dependencies": { + "fill-range": "^7.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/browser-process-hrtime": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/browser-process-hrtime/-/browser-process-hrtime-1.0.0.tgz", + "integrity": "sha512-9o5UecI3GhkpM6DrXr69PblIuWxPKk9Y0jHBRhdocZ2y7YECBFCsHm79Pr3OyR2AvjhDkabFJaDJMYRazHgsow==", + "dev": true + }, + "node_modules/browserslist": { + "version": "4.21.4", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.21.4.tgz", + "integrity": "sha512-CBHJJdDmgjl3daYjN5Cp5kbTf1mUhZoS+beLklHIvkOWscs83YAhLlF3Wsh/lciQYAcbBJgTOD44VtG31ZM4Hw==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + } + ], + "dependencies": { + "caniuse-lite": "^1.0.30001400", + "electron-to-chromium": "^1.4.251", + "node-releases": "^2.0.6", + "update-browserslist-db": "^1.0.9" + }, + 
"bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/bser": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/bser/-/bser-2.1.1.tgz", + "integrity": "sha512-gQxTNE/GAfIIrmHLUE3oJyp5FO6HRBfhjnw4/wMmA63ZGDJnWBmgY/lyQBpnDUkGmAhbSe39tx2d/iTOAfglwQ==", + "dev": true, + "dependencies": { + "node-int64": "^0.4.0" + } + }, + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true + }, + "node_modules/builtin-modules": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz", + "integrity": "sha512-zhaCDicdLuWN5UbN5IMnFqNMhNfo919sH85y2/ea+5Yg9TsTkeZxpL+JLbp6cgYFS4sRLp3YV4S6yDuqVWHYOw==", + "dev": true, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cache-base": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/cache-base/-/cache-base-1.0.1.tgz", + "integrity": "sha512-AKcdTnFSWATd5/GCPRxr2ChwIJ85CeyrEyjRHlKxQ56d4XJMGym0uAiKn0xbLOGOl3+yRpOTi484dVCEc5AUzQ==", + "dev": true, + "dependencies": { + "collection-visit": "^1.0.0", + "component-emitter": "^1.2.1", + "get-value": "^2.0.6", + "has-value": "^1.0.0", + "isobject": "^3.0.1", + "set-value": "^2.0.0", + "to-object-path": "^0.3.0", + "union-value": "^1.0.0", + "unset-value": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001434", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001434.tgz", + "integrity": "sha512-aOBHrLmTQw//WFa2rcF1If9fa3ypkC1wzqqiKHgfdrXTWcU8C4gKVZT77eQAPWN1APys3+uQ0Df07rKauXGEYA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + } + ] + }, + "node_modules/capture-exit": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/capture-exit/-/capture-exit-2.0.0.tgz", + "integrity": "sha512-PiT/hQmTonHhl/HFGN+Lx3JJUznrVYJ3+AQsnthneZbvW7x+f08Tk7yLJTLEOUvBTbduLeeBkxEaYXUOUrRq6g==", + "dev": true, + "dependencies": { + "rsvp": "^4.8.4" + }, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/chalk": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", + "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", + "dev": true, + "dependencies": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/char-regex": { + "version": 
"1.0.2", + "resolved": "https://registry.npmjs.org/char-regex/-/char-regex-1.0.2.tgz", + "integrity": "sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/chardet": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.7.0.tgz", + "integrity": "sha512-mT8iDcrh03qDGRRmoA2hmBJnxpllMR+0/0qlzjqZES6NdiWDcZkCNAk4rPFZ9Q85r27unkiNNg8ZOiwZXBHwcA==", + "dev": true + }, + "node_modules/ci-info": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-2.0.0.tgz", + "integrity": "sha512-5tK7EtrZ0N+OLFMthtqOj4fI2Jeb88C4CAZPu25LDVUgXJ0A3Js4PMGqrn0JU1W0Mh1/Z8wZzYPxqUrXeBboCQ==", + "dev": true + }, + "node_modules/cjs-module-lexer": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-0.6.0.tgz", + "integrity": "sha512-uc2Vix1frTfnuzxxu1Hp4ktSvM3QaI4oXl4ZUqL1wjTu/BGki9TrCWoqLTg/drR1KwAEarXuRFCG2Svr1GxPFw==", + "dev": true + }, + "node_modules/class-utils": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/class-utils/-/class-utils-0.3.6.tgz", + "integrity": "sha512-qOhPa/Fj7s6TY8H8esGu5QNpMMQxz79h+urzrNYN6mn+9BnxlDGf5QZ+XeCDsxSjPqsSR56XOZOJmpeurnLMeg==", + "dev": true, + "dependencies": { + "arr-union": "^3.1.0", + "define-property": "^0.2.5", + "isobject": "^3.0.0", + "static-extend": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/class-utils/node_modules/define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==", + "dev": true, + "dependencies": { + "is-descriptor": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/cli-cursor": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-3.1.0.tgz", + "integrity": "sha512-I/zHAwsKf9FqGoXM4WWRACob9+SNukZTd94DWF57E4toouRulbCxcUh6RKUEOQlYTHJnzkPMySvPNaaSLNfLZw==", + "dev": true, + "dependencies": { + "restore-cursor": "^3.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/cli-width": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-3.0.0.tgz", + "integrity": "sha512-FxqpkPPwu1HjuN93Omfm4h8uIanXofW0RxVEW3k5RKx+mJJYSthzNhp32Kzxxy3YAEZ/Dc/EWN1vZRY0+kOhbw==", + "dev": true, + "engines": { + "node": ">= 10" + } + }, + "node_modules/cliui": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-6.0.0.tgz", + "integrity": "sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ==", + "dev": true, + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.0", + "wrap-ansi": "^6.2.0" + } + }, + "node_modules/cliui/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/co": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", + "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==", + "dev": true, + "engines": { + "iojs": ">= 1.0.0", + "node": ">= 
0.12.0" + } + }, + "node_modules/collect-v8-coverage": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/collect-v8-coverage/-/collect-v8-coverage-1.0.1.tgz", + "integrity": "sha512-iBPtljfCNcTKNAto0KEtDfZ3qzjJvqE3aTGZsbhjSBlorqpXJlaWWtPO35D+ZImoC3KWejX64o+yPGxhWSTzfg==", + "dev": true + }, + "node_modules/collection-visit": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/collection-visit/-/collection-visit-1.0.0.tgz", + "integrity": "sha512-lNkKvzEeMBBjUGHZ+q6z9pSJla0KWAQPvtzhEV9+iGyQYG+pBpl7xKDhxoNSOZH2hhv0v5k0y2yAM4o4SjoSkw==", + "dev": true, + "dependencies": { + "map-visit": "^1.0.0", + "object-visit": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/color-convert": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", + "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", + "dev": true, + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==", + "dev": true + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dev": true, + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/commondir": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/commondir/-/commondir-1.0.1.tgz", + "integrity": "sha512-W9pAhw0ja1Edb5GVdIF1mjZw/ASI0AlShXM83UUGe2DVr5TdAPEA1OA8m/g8zWp9x6On7gqufY+FatDbC3MDQg==", + "dev": true + }, + "node_modules/component-emitter": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.3.0.tgz", + "integrity": "sha512-Rd3se6QB+sO1TwqZjscQrurpEPIfO0/yYnSin6Q/rD3mOutHvUrCAhJub3r90uNb+SESBuE0QYoB90YdfatsRg==", + "dev": true + }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true + }, + "node_modules/convert-source-map": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.9.0.tgz", + "integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A==", + "dev": true + }, + "node_modules/copy-descriptor": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/copy-descriptor/-/copy-descriptor-0.1.1.tgz", + "integrity": "sha512-XgZ0pFcakEUlbwQEVNg3+QAis1FyTL3Qel9FYy8pSkQqoG3PNoT0bOCQtOXcOkur21r2Eq2kI+IE+gsmAEVlYw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/cross-spawn": { + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz", + "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==", + "dev": true, + "dependencies": { + "nice-try": "^1.0.4", + "path-key": "^2.0.1", + "semver": "^5.5.0", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + }, + "engines": { + "node": ">=4.8" + } + }, + 
"node_modules/cross-spawn/node_modules/semver": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", + "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "dev": true, + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/cssom": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.4.4.tgz", + "integrity": "sha512-p3pvU7r1MyyqbTk+WbNJIgJjG2VmTIaB10rI93LzVPrmDJKkzKYMtxxyAvQXR/NS6otuzveI7+7BBq3SjBS2mw==", + "dev": true + }, + "node_modules/cssstyle": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-2.3.0.tgz", + "integrity": "sha512-AZL67abkUzIuvcHqk7c09cezpGNcxUxU4Ioi/05xHk4DQeTkWmGYftIE6ctU6AEt+Gn4n1lDStOtj7FKycP71A==", + "dev": true, + "dependencies": { + "cssom": "~0.3.6" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/cssstyle/node_modules/cssom": { + "version": "0.3.8", + "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.3.8.tgz", + "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==", + "dev": true + }, + "node_modules/data-urls": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-2.0.0.tgz", + "integrity": "sha512-X5eWTSXO/BJmpdIKCRuKUgSCgAN0OwliVK3yPKbwIWU1Tdw5BRajxlzMidvh+gwko9AfQ9zIj52pzF91Q3YAvQ==", + "dev": true, + "dependencies": { + "abab": "^2.0.3", + "whatwg-mimetype": "^2.3.0", + "whatwg-url": "^8.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "dev": true, + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/decimal.js": { + "version": "10.4.2", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.2.tgz", + "integrity": "sha512-ic1yEvwT6GuvaYwBLLY6/aFFgjZdySKTE8en/fkU3QICTmRtgtSlFn0u0BXN06InZwtfCelR7j8LRiDI/02iGA==", + "dev": true + }, + "node_modules/decode-uri-component": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.0.tgz", + "integrity": "sha512-hjf+xovcEn31w/EUYdTXQh/8smFL/dzYjohQGEIgjyNavaJfBY2p5F527Bo1VPATxv0VYTUC2bOcXvqFwk78Og==", + "dev": true, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/deep-is": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", + "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", + "dev": true + }, + "node_modules/deepmerge": { + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.2.2.tgz", + "integrity": "sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/define-property": { + "version": "2.0.2", + "resolved": 
"https://registry.npmjs.org/define-property/-/define-property-2.0.2.tgz", + "integrity": "sha512-jwK2UV4cnPpbcG7+VRARKTZPUWowwXA8bzH5NP6ud0oeAxyYPuGZUAC7hMugpCdz4BeSZl2Dl9k66CHJ/46ZYQ==", + "dev": true, + "dependencies": { + "is-descriptor": "^1.0.2", + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/define-property/node_modules/is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/define-property/node_modules/is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/define-property/node_modules/is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "dependencies": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/detect-newline": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/detect-newline/-/detect-newline-3.1.0.tgz", + "integrity": "sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/diff-sequences": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-26.6.2.tgz", + "integrity": "sha512-Mv/TDa3nZ9sbc5soK+OoA74BsS3mL37yixCvUAQkiuA4Wz6YtwP/K47n2rv2ovzHZvoiQeA5FTQOschKkEwB0Q==", + "dev": true, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/doctrine": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz", + "integrity": "sha512-yS+Q5i3hBf7GBkd4KG8a7eBNNWNGLTaEwwYWUijIYM7zrlYDM0BFXHjjPWlWZ1Rg7UaddZeIDmi9jF3HmqiQ2w==", + "dev": true, + "dependencies": { + "esutils": "^2.0.2" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/domexception": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/domexception/-/domexception-2.0.1.tgz", + "integrity": "sha512-yxJ2mFy/sibVQlu5qHjOkf9J3K6zgmCxgJ94u2EdvDOV09H+32LtRswEcUsmUWN72pVLOEnTSRaIVVzVQgS0dg==", + "dev": true, + "dependencies": { + "webidl-conversions": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/domexception/node_modules/webidl-conversions": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-5.0.0.tgz", + "integrity": 
"sha512-VlZwKPCkYKxQgeSbH5EyngOmRp7Ww7I9rQLERETtf5ofd9pGeswWiOtogpEO850jziPRarreGxn5QIiTqpb2wA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/electron-to-chromium": { + "version": "1.4.284", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.284.tgz", + "integrity": "sha512-M8WEXFuKXMYMVr45fo8mq0wUrrJHheiKZf6BArTKk9ZBYCKJEOU5H8cdWgDT+qCVZf7Na4lVUaZsA+h6uA9+PA==", + "dev": true + }, + "node_modules/emittery": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/emittery/-/emittery-0.7.2.tgz", + "integrity": "sha512-A8OG5SR/ij3SsJdWDJdkkSYUjQdCUx6APQXem0SaEePBSRg4eymGYwBkKo1Y6DU+af/Jn2dBQqDBvjnr9Vi8nQ==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/emittery?sponsor=1" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true + }, + "node_modules/end-of-stream": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", + "dev": true, + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/error-ex": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", + "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", + "dev": true, + "dependencies": { + "is-arrayish": "^0.2.1" + } + }, + "node_modules/escalade": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz", + "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==", + "dev": true, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/escodegen": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.0.0.tgz", + "integrity": "sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==", + "dev": true, + "dependencies": { + "esprima": "^4.0.1", + "estraverse": "^5.2.0", + "esutils": "^2.0.2", + "optionator": "^0.8.1" + }, + "bin": { + "escodegen": "bin/escodegen.js", + "esgenerate": "bin/esgenerate.js" + }, + "engines": { + "node": ">=6.0" + }, + "optionalDependencies": { + "source-map": "~0.6.1" + } + }, + "node_modules/escodegen/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/eslint": { + "version": "6.8.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-6.8.0.tgz", + "integrity": "sha512-K+Iayyo2LtyYhDSYwz5D5QdWw0hCacNzyq1Y821Xna2xSJj7cijoLLYmLxTQgcgZ9mC61nryMy9S7GRbYpI5Ig==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "ajv": 
"^6.10.0", + "chalk": "^2.1.0", + "cross-spawn": "^6.0.5", + "debug": "^4.0.1", + "doctrine": "^3.0.0", + "eslint-scope": "^5.0.0", + "eslint-utils": "^1.4.3", + "eslint-visitor-keys": "^1.1.0", + "espree": "^6.1.2", + "esquery": "^1.0.1", + "esutils": "^2.0.2", + "file-entry-cache": "^5.0.1", + "functional-red-black-tree": "^1.0.1", + "glob-parent": "^5.0.0", + "globals": "^12.1.0", + "ignore": "^4.0.6", + "import-fresh": "^3.0.0", + "imurmurhash": "^0.1.4", + "inquirer": "^7.0.0", + "is-glob": "^4.0.0", + "js-yaml": "^3.13.1", + "json-stable-stringify-without-jsonify": "^1.0.1", + "levn": "^0.3.0", + "lodash": "^4.17.14", + "minimatch": "^3.0.4", + "mkdirp": "^0.5.1", + "natural-compare": "^1.4.0", + "optionator": "^0.8.3", + "progress": "^2.0.0", + "regexpp": "^2.0.1", + "semver": "^6.1.2", + "strip-ansi": "^5.2.0", + "strip-json-comments": "^3.0.1", + "table": "^5.2.3", + "text-table": "^0.2.0", + "v8-compile-cache": "^2.0.3" + }, + "bin": { + "eslint": "bin/eslint.js" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint-scope": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-5.1.1.tgz", + "integrity": "sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw==", + "dev": true, + "dependencies": { + "esrecurse": "^4.3.0", + "estraverse": "^4.1.1" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/eslint-utils": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-2.1.0.tgz", + "integrity": "sha512-w94dQYoauyvlDc43XnGB8lU3Zt713vNChgt4EWwhXAP2XkBvndfxF0AgIqKOOasjPIPzj9JqgwkwbCYD0/V3Zg==", + "dev": true, + "dependencies": { + "eslint-visitor-keys": "^1.1.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/mysticatea" + } + }, + "node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/eslint/node_modules/eslint-utils": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-1.4.3.tgz", + "integrity": "sha512-fbBN5W2xdY45KulGXmLHZ3c3FHfVYmKg0IrAKGOkT/464PQsx2UeIzfz1RmEci+KLm1bBaAzZAh8+/E+XAeZ8Q==", + "dev": true, + "dependencies": { + "eslint-visitor-keys": "^1.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/eslint/node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/eslint/node_modules/ignore": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-4.0.6.tgz", + "integrity": "sha512-cyFDKrqc/YdcWFniJhzI42+AzS+gNwmUzOSFcRCQYwySuBBBy/KjuxWLZ/FHEH6Moq1NizMOBWyTcv8O4OZIMg==", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/eslint/node_modules/regexpp": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-2.0.1.tgz", + "integrity": 
"sha512-lv0M6+TkDVniA3aD1Eg0DVpfU/booSu7Eev3TDO/mZKHBfVjgCGTV4t4buppESEYDtkArYFOxTJWv6S5C+iaNw==", + "dev": true, + "engines": { + "node": ">=6.5.0" + } + }, + "node_modules/eslint/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/espree": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/espree/-/espree-6.2.1.tgz", + "integrity": "sha512-ysCxRQY3WaXJz9tdbWOwuWr5Y/XrPTGX9Kiz3yoUXwW0VZ4w30HTkQLaGx/+ttFjF8i+ACbArnB4ce68a9m5hw==", + "dev": true, + "dependencies": { + "acorn": "^7.1.1", + "acorn-jsx": "^5.2.0", + "eslint-visitor-keys": "^1.1.0" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/espree/node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/esprima": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", + "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", + "dev": true, + "bin": { + "esparse": "bin/esparse.js", + "esvalidate": "bin/esvalidate.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/esquery": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.4.0.tgz", + "integrity": "sha512-cCDispWt5vHHtwMY2YrAQ4ibFkAL8RbH5YGBnZBc90MolvvfkkQcJro/aZiAQUlQ3qgrYS6D6v8Gc5G5CQsc9w==", + "dev": true, + "dependencies": { + "estraverse": "^5.1.0" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/esquery/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", + "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", + "dev": true, + "dependencies": { + "estraverse": "^5.2.0" + }, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/estraverse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz", + "integrity": "sha512-39nnKffWz8xN1BU/2c79n9nB9HDzo0niYUqx6xyqUnyoAnQyyWpOTdZEeiCch8BBu515t4wp9ZmgVfVhn9EBpw==", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/estree-walker": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-1.0.1.tgz", + "integrity": "sha512-1fMXF3YP4pZZVozF8j/ZLfvnR8NSIljt56UhbZ5PeeDmmGHpgpdwQt7ITlGvYaQukCvuBRMLEiKiYC+oeIg4cg==", + "dev": true + }, + "node_modules/esutils": { + "version": 
"2.0.3", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/exec-sh": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/exec-sh/-/exec-sh-0.3.6.tgz", + "integrity": "sha512-nQn+hI3yp+oD0huYhKwvYI32+JFeq+XkNcD1GAo3Y/MjxsfVGmrrzrnzjWiNY6f+pUCP440fThsFh5gZrRAU/w==", + "dev": true + }, + "node_modules/execa": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", + "integrity": "sha512-adbxcyWV46qiHyvSp50TKt05tB4tK3HcmF7/nxfAdhnox83seTDbwnaqKO4sXRy7roHAIFqJP/Rw/AuEbX61LA==", + "dev": true, + "dependencies": { + "cross-spawn": "^6.0.0", + "get-stream": "^4.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/exit": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/exit/-/exit-0.1.2.tgz", + "integrity": "sha512-Zk/eNKV2zbjpKzrsQ+n1G6poVbErQxJ0LBOJXaKZ1EViLzH+hrLu9cdXI4zw9dBQJslwBEpbQ2P1oS7nDxs6jQ==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/expand-brackets": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", + "integrity": "sha512-w/ozOKR9Obk3qoWeY/WDi6MFta9AoMR+zud60mdnbniMcBxRuFJyDt2LdX/14A1UABeqk+Uk+LDfUpvoGKppZA==", + "dev": true, + "dependencies": { + "debug": "^2.3.3", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "posix-character-classes": "^0.1.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/expand-brackets/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/expand-brackets/node_modules/define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==", + "dev": true, + "dependencies": { + "is-descriptor": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/expand-brackets/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/expand-brackets/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/expect": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/expect/-/expect-26.6.2.tgz", + "integrity": "sha512-9/hlOBkQl2l/PLHJx6JjoDF6xPKcJEsUlWKb23rKE7KzeDqUZKXKNMW27KIue5JMdBV9HgmoJPcc8HtO85t9IA==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "ansi-styles": "^4.0.0", + 
"jest-get-type": "^26.3.0", + "jest-matcher-utils": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-regex-util": "^26.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/expect/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/expect/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/expect/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/extend-shallow": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-3.0.2.tgz", + "integrity": "sha512-BwY5b5Ql4+qZoefgMj2NUmx+tehVTH/Kf4k1ZEtOHNFcm2wSxMRo992l6X3TIgni2eZVTZ85xMOjF31fwZAj6Q==", + "dev": true, + "dependencies": { + "assign-symbols": "^1.0.0", + "is-extendable": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extend-shallow/node_modules/is-extendable": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", + "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", + "dev": true, + "dependencies": { + "is-plain-object": "^2.0.4" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/external-editor": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/external-editor/-/external-editor-3.1.0.tgz", + "integrity": "sha512-hMQ4CX1p1izmuLYyZqLMO/qGNw10wSv9QDCPfzXfyFrOaCSSoRfqE1Kf1s5an66J5JZC62NewG+mK49jOCtQew==", + "dev": true, + "dependencies": { + "chardet": "^0.7.0", + "iconv-lite": "^0.4.24", + "tmp": "^0.0.33" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/extglob": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", + "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", + "dev": true, + "dependencies": { + "array-unique": "^0.3.2", + "define-property": "^1.0.0", + "expand-brackets": "^2.1.4", + "extend-shallow": "^2.0.1", + "fragment-cache": "^0.2.1", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extglob/node_modules/define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==", + "dev": true, + "dependencies": { + "is-descriptor": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extglob/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extglob/node_modules/is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extglob/node_modules/is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/extglob/node_modules/is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "dependencies": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", + "dev": true + }, + "node_modules/fast-json-stable-stringify": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", + "dev": true + }, + "node_modules/fast-levenshtein": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", + "dev": true + }, + "node_modules/fb-watchman": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", + "integrity": "sha512-p5161BqbuCaSnB8jIbzQHOlpgsPmK5rJVDfDKO91Axs5NC1uu3HRQm6wt9cd9/+GtQQIO53JdGXXoyDpTAsgYA==", + "dev": true, + "dependencies": { + "bser": "2.1.1" + } + }, + "node_modules/figures": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/figures/-/figures-3.2.0.tgz", + "integrity": "sha512-yaduQFRKLXYOGgEn6AZau90j3ggSOyiqXU0F9JZfeXYhNa+Jk4X+s45A2zg5jns87GAFa34BBm2kXw4XpNcbdg==", + "dev": true, + "dependencies": { + "escape-string-regexp": "^1.0.5" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/file-entry-cache": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-5.0.1.tgz", + "integrity": "sha512-bCg29ictuBaKUwwArK4ouCaqDgLZcysCFLmM/Yn/FDoqndh/9vNuQfXRDvTuXKLxfD/JtZQGKFT8MGcJBK644g==", + "dev": true, + "dependencies": { + "flat-cache": "^2.0.1" + }, + "engines": { + "node": ">=4" + } + }, + 
"node_modules/fill-range": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "dev": true, + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/find-cache-dir": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-3.3.2.tgz", + "integrity": "sha512-wXZV5emFEjrridIgED11OoUKLxiYjAcqot/NJdAkOhlJ+vGzwhOAfcG5OX1jP+S0PcjEn8bdMJv+g2jwQ3Onig==", + "dev": true, + "dependencies": { + "commondir": "^1.0.1", + "make-dir": "^3.0.2", + "pkg-dir": "^4.1.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/avajs/find-cache-dir?sponsor=1" + } + }, + "node_modules/find-up": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-4.1.0.tgz", + "integrity": "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw==", + "dev": true, + "dependencies": { + "locate-path": "^5.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/flat-cache": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-2.0.1.tgz", + "integrity": "sha512-LoQe6yDuUMDzQAEH8sgmh4Md6oZnc/7PjtwjNFSzveXqSHt6ka9fPBuso7IGf9Rz4uqnSnWiFH2B/zj24a5ReA==", + "dev": true, + "dependencies": { + "flatted": "^2.0.0", + "rimraf": "2.6.3", + "write": "1.0.3" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/flatted": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-2.0.2.tgz", + "integrity": "sha512-r5wGx7YeOwNWNlCA0wQ86zKyDLMQr+/RB8xy74M4hTphfmjlijTSSXGuH8rnvKZnfT9i+75zmd8jcKdMR4O6jA==", + "dev": true + }, + "node_modules/for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/form-data": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.1.tgz", + "integrity": "sha512-RHkBKtLWUVwd7SqRIvCZMEvAMoGUp0XU+seQiZejj0COz3RI3hWP4sCv3gZWWLjJTd7rGwcsF5eKZGii0r/hbg==", + "dev": true, + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fragment-cache": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz", + "integrity": "sha512-GMBAbW9antB8iZRHLoGw0b3HANt57diZYFO/HL1JGIC1MjKrdmhxvrJbupnVvpys0zsz7yBApXdQyfepKly2kA==", + "dev": true, + "dependencies": { + "map-cache": "^0.2.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/fs-extra": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz", + "integrity": "sha512-yhlQgA6mnOJUKOsRUFsgJdQCvkKhcz8tlZG5HBQfReYZy46OwLcY+Zia0mtdHsOo9y/hP+CxMN0TU9QxoOtG4g==", + "dev": true, + "dependencies": { + "graceful-fs": "^4.2.0", + "jsonfile": "^4.0.0", + "universalify": "^0.1.0" + }, + "engines": { + "node": ">=6 <7 || >=8" + } + }, + "node_modules/fs-extra/node_modules/universalify": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", + "integrity": 
"sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg==", + "dev": true, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", + "dev": true + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", + "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", + "dev": true + }, + "node_modules/functional-red-black-tree": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz", + "integrity": "sha512-dsKNQNdj6xA3T+QlADDA7mOSlX0qiMINjn0cgr+eGHGsbSHzTabcIogz2+p/iqP1Xs6EP/sS2SbqH+brGTbq0g==", + "dev": true + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "dev": true, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-package-type": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/get-package-type/-/get-package-type-0.1.0.tgz", + "integrity": "sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==", + "dev": true, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", + "dev": true, + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/get-value": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/get-value/-/get-value-2.0.6.tgz", + "integrity": "sha512-Ln0UQDlxH1BapMu3GPtf7CuYNwRZf2gwCuPqbyG6pB8WfmFpzqcy4xtAaAMUhnNqjMKTiCPZG2oMT3YSx8U2NA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "dev": true, + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + 
"node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/globals": { + "version": "12.4.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-12.4.0.tgz", + "integrity": "sha512-BWICuzzDvDoH54NHKCseDanAhE3CeDorgDL5MT6LMXXj2WCnd9UC2szdk4AWLfjdgNBCXLUanXYcpBBKOSWGwg==", + "dev": true, + "dependencies": { + "type-fest": "^0.8.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/graceful-fs": { + "version": "4.2.10", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.10.tgz", + "integrity": "sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA==", + "dev": true + }, + "node_modules/growly": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/growly/-/growly-1.3.0.tgz", + "integrity": "sha512-+xGQY0YyAWCnqy7Cd++hc2JqMYzlm0dG30Jd0beaA64sROr8C4nt8Yc9V5Ro3avlSUDTN0ulqP/VBKi1/lLygw==", + "dev": true, + "optional": true + }, + "node_modules/handlebars": { + "version": "4.7.7", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.7.tgz", + "integrity": "sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==", + "dev": true, + "dependencies": { + "minimist": "^1.2.5", + "neo-async": "^2.6.0", + "source-map": "^0.6.1", + "wordwrap": "^1.0.0" + }, + "bin": { + "handlebars": "bin/handlebars" + }, + "engines": { + "node": ">=0.4.7" + }, + "optionalDependencies": { + "uglify-js": "^3.1.4" + } + }, + "node_modules/has": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", + "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", + "dev": true, + "dependencies": { + "function-bind": "^1.1.1" + }, + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha512-sKJf1+ceQBr4SMkvQnBDNDtf4TXpVhVGateu0t918bl30FnbE2m4vNLX+VWe/dpjlb+HugGYzW7uQXH98HPEYw==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/has-value": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-value/-/has-value-1.0.0.tgz", + "integrity": "sha512-IBXk4GTsLYdQ7Rvt+GRBrFSVEkmuOUy4re0Xjd9kJSUQpnTrWR4/y9RpfexN9vkAPMFuQoeWKwqzPozRTlasGw==", + "dev": true, + "dependencies": { + "get-value": "^2.0.6", + "has-values": "^1.0.0", + "isobject": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/has-values": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-values/-/has-values-1.0.0.tgz", + "integrity": "sha512-ODYZC64uqzmtfGMEAX/FvZiRyWLpAC3vYnNunURUnkGVTS+mI0smVsWaPydRBsE3g+ok7h960jChO8mFcWlHaQ==", + "dev": true, + "dependencies": { + "is-number": "^3.0.0", + "kind-of": "^4.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/has-values/node_modules/is-number": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha512-4cboCqIpliH+mAvFNegjZQ4kgKc3ZUhQVr3HvWbSh5q3WH2v82ct+T2Y1hdU5Gdtorx/cLifQjqCbL7bpznLTg==", + "dev": true, + "dependencies": { + 
"kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/has-values/node_modules/is-number/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/has-values/node_modules/kind-of": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", + "integrity": "sha512-24XsCxmEbRwEDbz/qz3stgin8TTzZ1ESR56OMCN0ujYg+vRutNSiOj9bHH9u85DKgXguraugV5sFuvbD4FW/hw==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/highlight.js": { + "version": "10.7.3", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.7.3.tgz", + "integrity": "sha512-tzcUFauisWKNHaRkN4Wjl/ZA07gENAjFl3J/c480dprkGTg5EQstgaNFqBfUqCq54kZRIEcreTsAgF/m2quD7A==", + "dev": true, + "engines": { + "node": "*" + } + }, + "node_modules/hosted-git-info": { + "version": "2.8.9", + "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.9.tgz", + "integrity": "sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw==", + "dev": true + }, + "node_modules/html-encoding-sniffer": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-2.0.1.tgz", + "integrity": "sha512-D5JbOMBIR/TVZkubHT+OyT2705QvogUW4IBn6nHd756OwieSF9aDYFj4dv6HHEVGYbHaLETa3WggZYWWMyy3ZQ==", + "dev": true, + "dependencies": { + "whatwg-encoding": "^1.0.5" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/html-escaper": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", + "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", + "dev": true + }, + "node_modules/http-proxy-agent": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-4.0.1.tgz", + "integrity": "sha512-k0zdNgqWTGA6aeIRVpvfVob4fL52dTfaehylg0Y4UvSySvOq/Y+BOyPrgpUrA7HylqvU8vIZGsRuXmspskV0Tg==", + "dev": true, + "dependencies": { + "@tootallnate/once": "1", + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/https-proxy-agent": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", + "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==", + "dev": true, + "dependencies": { + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/human-signals": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-1.1.1.tgz", + "integrity": "sha512-SEQu7vl8KjNL2eoGBLF3+wAjpsNfA9XMlXAYj/3EdaNfAlxKthD1xjEQfGOUhllCGGJVNY34bRr6lPINhNjyZw==", + "dev": true, + "engines": { + "node": ">=8.12.0" + } + }, + "node_modules/iconv-lite": { + "version": "0.4.24", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", + "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "dev": true, + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/import-fresh": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.0.tgz", + "integrity": "sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==", + "dev": true, + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/import-local": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.1.0.tgz", + "integrity": "sha512-ASB07uLtnDs1o6EHjKpX34BKYDSqnFerfTOJL2HvMqF70LnxpjkzDB8J44oT9pu4AMPkQwf8jl6szgvNd2tRIg==", + "dev": true, + "dependencies": { + "pkg-dir": "^4.2.0", + "resolve-cwd": "^3.0.0" + }, + "bin": { + "import-local-fixture": "fixtures/cli.js" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/imurmurhash": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", + "dev": true, + "engines": { + "node": ">=0.8.19" + } + }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", + "dev": true, + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true + }, + "node_modules/inquirer": { + "version": "7.3.3", + "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-7.3.3.tgz", + "integrity": "sha512-JG3eIAj5V9CwcGvuOmoo6LB9kbAYT8HXffUl6memuszlwDC/qvFAJw49XJ5NROSFNPxp3iQg1GqkFhaY/CR0IA==", + "dev": true, + "dependencies": { + "ansi-escapes": "^4.2.1", + "chalk": "^4.1.0", + "cli-cursor": "^3.1.0", + "cli-width": "^3.0.0", + "external-editor": "^3.0.3", + "figures": "^3.0.0", + "lodash": "^4.17.19", + "mute-stream": "0.0.8", + "run-async": "^2.4.0", + "rxjs": "^6.6.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0", + "through": "^2.3.6" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/inquirer/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/inquirer/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/inquirer/node_modules/color-convert": { + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/inquirer/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/inquirer/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/inquirer/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/inquirer/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/interpret": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz", + "integrity": "sha512-agE4QfB2Lkp9uICn7BAqoscw4SZP9kTE2hxiFI3jBPmXJfdqiahTbUuKGsMoN2GtqL9AxhYioAcVvgsb1HvRbA==", + "dev": true, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/is-accessor-descriptor": { + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", + "integrity": "sha512-e1BM1qnDbMRG3ll2U9dSK0UMHuWOs3pY3AtcFsmvwPtKL3MML/Q86i+GilLfvqEs4GW+ExB91tQ3Ig9noDIZ+A==", + "dev": true, + "dependencies": { + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-accessor-descriptor/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", + "dev": true + }, + "node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", + "dev": true + }, + "node_modules/is-ci": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-2.0.0.tgz", + "integrity": "sha512-YfJT7rkpQB0updsdHLGWrvhBJfcfzNNawYDNIyQXJz0IViGf75O8EBPKSdvw2rF+LGCsX4FZ8tcr3b19LcZq4w==", + "dev": true, + "dependencies": { + "ci-info": "^2.0.0" + }, + "bin": { + 
"is-ci": "bin.js" + } + }, + "node_modules/is-core-module": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.11.0.tgz", + "integrity": "sha512-RRjxlvLDkD1YJwDbroBHMb+cukurkDWNyHx7D3oNB5x9rb5ogcksMC5wHCadcXoo67gVr/+3GFySh3134zi6rw==", + "dev": true, + "dependencies": { + "has": "^1.0.3" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-data-descriptor": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", + "integrity": "sha512-+w9D5ulSoBNlmw9OHn3U2v51SyoCd0he+bB3xMl62oijhrspxowjU+AIcDY0N3iEJbUEkB15IlMASQsxYigvXg==", + "dev": true, + "dependencies": { + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-data-descriptor/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-descriptor": { + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", + "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", + "dev": true, + "dependencies": { + "is-accessor-descriptor": "^0.1.6", + "is-data-descriptor": "^0.1.4", + "kind-of": "^5.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-descriptor/node_modules/kind-of": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", + "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-docker": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-2.2.1.tgz", + "integrity": "sha512-F+i2BKsFrH66iaUFc0woD8sLy8getkwTwtOBjvs56Cx4CgJDeKQeqfz8wAYiSb8JOprWhHH5p77PbmYCvvUuXQ==", + "dev": true, + "optional": true, + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-generator-fn": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-generator-fn/-/is-generator-fn-2.1.0.tgz", + "integrity": 
"sha512-cTIB4yPYL/Grw0EaSzASzg6bBy9gqCofvWN8okThAYIxKJZC+udlRAmGbM0XLeniEJSs8uEgHPGuHSe1XsOLSQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-module": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-module/-/is-module-1.0.0.tgz", + "integrity": "sha512-51ypPSPCoTEIN9dy5Oy+h4pShgJmPCygKfyRCISBI+JoWT/2oJvK8QPxmwv7b/p239jXrm9M1mlQbyKJ5A152g==", + "dev": true + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dev": true, + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "dev": true + }, + "node_modules/is-reference": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/is-reference/-/is-reference-1.2.1.tgz", + "integrity": "sha512-U82MsXXiFIrjCK4otLT+o2NA2Cd2g5MLoOVXUZjIOhLurrRxpEXzI8O0KZHr3IjLvlAH1kTPYSuqer5T9ZVBKQ==", + "dev": true, + "dependencies": { + "@types/estree": "*" + } + }, + "node_modules/is-stream": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", + "integrity": "sha512-uQPm8kcs47jx38atAcWTVxyltQYoPT68y9aWYdV6yWXSyW8mzSat0TL6CiWdZeCdF3KrAvpVtnHbTv4RN+rqdQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-typedarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", + "integrity": "sha512-cyA56iCMHAh5CdzjJIa4aohJyeO1YbwLi3Jc35MmRU6poroFjIGZzUzupGiRPOjgHg9TLu43xbpwXk523fMxKA==", + "dev": true + }, + "node_modules/is-windows": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-windows/-/is-windows-1.0.2.tgz", + "integrity": "sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-wsl": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-2.2.0.tgz", + "integrity": "sha512-fKzAra0rGJUUBwGBgNkHZuToZcn+TtXHpeCgmkMJMMYx1sQDYaCSyjJBSCa2nH1DGm7s3n1oBnohoVTBaN7Lww==", + "dev": true, + "optional": true, + "dependencies": { + "is-docker": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + 
"dev": true + }, + "node_modules/isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", + "dev": true + }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/istanbul-lib-coverage": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.0.tgz", + "integrity": "sha512-eOeJ5BHCmHYvQK7xt9GkdHuzuCGS1Y6g9Gvnx3Ym33fz/HpLRYxiS0wHNr+m/MBC8B647Xt608vCDEvhl9c6Mw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-instrument": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/istanbul-lib-instrument/-/istanbul-lib-instrument-4.0.3.tgz", + "integrity": "sha512-BXgQl9kf4WTCPCCpmFGoJkz/+uhvm7h7PFKUYxh7qarQd3ER33vHG//qaE8eN25l07YqZPpHXU9I09l/RD5aGQ==", + "dev": true, + "dependencies": { + "@babel/core": "^7.7.5", + "@istanbuljs/schema": "^0.1.2", + "istanbul-lib-coverage": "^3.0.0", + "semver": "^6.3.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-instrument/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/istanbul-lib-report": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/istanbul-lib-report/-/istanbul-lib-report-3.0.0.tgz", + "integrity": "sha512-wcdi+uAKzfiGT2abPpKZ0hSU1rGQjUQnLvtY5MpQ7QCTahD3VODhcu4wcfY1YtkGaDD5yuydOLINXsfbus9ROw==", + "dev": true, + "dependencies": { + "istanbul-lib-coverage": "^3.0.0", + "make-dir": "^3.0.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-report/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-report/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/istanbul-lib-source-maps": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/istanbul-lib-source-maps/-/istanbul-lib-source-maps-4.0.1.tgz", + "integrity": "sha512-n3s8EwkdFIJCG3BPKBYvskgXGoy88ARzvegkitk60NxRdwltLOTaH7CUiMRXvwYorl0Q712iEjcWB+fK/MrWVw==", + "dev": true, + "dependencies": { + "debug": "^4.1.1", + "istanbul-lib-coverage": "^3.0.0", + "source-map": "^0.6.1" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/istanbul-reports": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-3.1.5.tgz", + "integrity": 
"sha512-nUsEMa9pBt/NOHqbcbeJEgqIlY/K7rVWUX6Lql2orY5e9roQOthbR3vtY4zzf2orPELg80fnxxk9zUyPlgwD1w==", + "dev": true, + "dependencies": { + "html-escaper": "^2.0.0", + "istanbul-lib-report": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest/-/jest-26.6.3.tgz", + "integrity": "sha512-lGS5PXGAzR4RF7V5+XObhqz2KZIDUA1yD0DG6pBVmy10eh0ZIXQImRuzocsI/N2XZ1GrLFwTS27In2i2jlpq1Q==", + "dev": true, + "dependencies": { + "@jest/core": "^26.6.3", + "import-local": "^3.0.2", + "jest-cli": "^26.6.3" + }, + "bin": { + "jest": "bin/jest.js" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-changed-files": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-changed-files/-/jest-changed-files-26.6.2.tgz", + "integrity": "sha512-fDS7szLcY9sCtIip8Fjry9oGf3I2ht/QT21bAHm5Dmf0mD4X3ReNUf17y+bO6fR8WgbIZTlbyG1ak/53cbRzKQ==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "execa": "^4.0.0", + "throat": "^5.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-changed-files/node_modules/cross-spawn": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", + "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "dev": true, + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/jest-changed-files/node_modules/execa": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-4.1.0.tgz", + "integrity": "sha512-j5W0//W7f8UxAn8hXVnwG8tLwdiUy4FJLcSupCg6maBYZDpyBvTApK7KyuI4bKj8KOh1r2YH+6ucuYtJv1bTZA==", + "dev": true, + "dependencies": { + "cross-spawn": "^7.0.0", + "get-stream": "^5.0.0", + "human-signals": "^1.1.1", + "is-stream": "^2.0.0", + "merge-stream": "^2.0.0", + "npm-run-path": "^4.0.0", + "onetime": "^5.1.0", + "signal-exit": "^3.0.2", + "strip-final-newline": "^2.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/execa?sponsor=1" + } + }, + "node_modules/jest-changed-files/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dev": true, + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jest-changed-files/node_modules/is-stream": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jest-changed-files/node_modules/npm-run-path": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz", + "integrity": "sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==", + "dev": true, + "dependencies": { + "path-key": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-changed-files/node_modules/path-key": { + "version": "3.1.1", + "resolved": 
"https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-changed-files/node_modules/shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "dev": true, + "dependencies": { + "shebang-regex": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-changed-files/node_modules/shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-changed-files/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/jest-config": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-config/-/jest-config-26.6.3.tgz", + "integrity": "sha512-t5qdIj/bCj2j7NFVHb2nFB4aUdfucDn3JRKgrZnplb8nieAirAzRSHP8uDEd+qV6ygzg9Pz4YG7UTJf94LPSyg==", + "dev": true, + "dependencies": { + "@babel/core": "^7.1.0", + "@jest/test-sequencer": "^26.6.3", + "@jest/types": "^26.6.2", + "babel-jest": "^26.6.3", + "chalk": "^4.0.0", + "deepmerge": "^4.2.2", + "glob": "^7.1.1", + "graceful-fs": "^4.2.4", + "jest-environment-jsdom": "^26.6.2", + "jest-environment-node": "^26.6.2", + "jest-get-type": "^26.3.0", + "jest-jasmine2": "^26.6.3", + "jest-regex-util": "^26.0.0", + "jest-resolve": "^26.6.2", + "jest-util": "^26.6.2", + "jest-validate": "^26.6.2", + "micromatch": "^4.0.2", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + }, + "peerDependencies": { + "ts-node": ">=9.0.0" + }, + "peerDependenciesMeta": { + "ts-node": { + "optional": true + } + } + }, + "node_modules/jest-config/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-config/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-config/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": 
"sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-config/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-config/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-config/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-diff": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-26.6.2.tgz", + "integrity": "sha512-6m+9Z3Gv9wN0WFVasqjCL/06+EFCMTqDEUl/b87HYK2rAPTyfz4ZIuSlPhY51PIQRWx5TaxeF1qmXKe9gfN3sA==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0", + "diff-sequences": "^26.6.2", + "jest-get-type": "^26.3.0", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-diff/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-diff/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-diff/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-diff/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-diff/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + 
"node_modules/jest-diff/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-docblock": { + "version": "26.0.0", + "resolved": "https://registry.npmjs.org/jest-docblock/-/jest-docblock-26.0.0.tgz", + "integrity": "sha512-RDZ4Iz3QbtRWycd8bUEPxQsTlYazfYn/h5R65Fc6gOfwozFhoImx+affzky/FFBuqISPTqjXomoIGJVKBWoo0w==", + "dev": true, + "dependencies": { + "detect-newline": "^3.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-each": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-each/-/jest-each-26.6.2.tgz", + "integrity": "sha512-Mer/f0KaATbjl8MCJ+0GEpNdqmnVmDYqCTJYTvoo7rqmRiDllmp2AYN+06F93nXcY3ur9ShIjS+CO/uD+BbH4A==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "chalk": "^4.0.0", + "jest-get-type": "^26.3.0", + "jest-util": "^26.6.2", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-each/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-each/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-each/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-each/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-each/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-each/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-environment-jsdom": { + 
"version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-26.6.2.tgz", + "integrity": "sha512-jgPqCruTlt3Kwqg5/WVFyHIOJHsiAvhcp2qiR2QQstuG9yWox5+iHpU3ZrcBxW14T4fe5Z68jAfLRh7joCSP2Q==", + "dev": true, + "dependencies": { + "@jest/environment": "^26.6.2", + "@jest/fake-timers": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "jest-mock": "^26.6.2", + "jest-util": "^26.6.2", + "jsdom": "^16.4.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-environment-node": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-26.6.2.tgz", + "integrity": "sha512-zhtMio3Exty18dy8ee8eJ9kjnRyZC1N4C1Nt/VShN1apyXc8rWGtJ9lI7vqiWcyyXS4BVSEn9lxAM2D+07/Tag==", + "dev": true, + "dependencies": { + "@jest/environment": "^26.6.2", + "@jest/fake-timers": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "jest-mock": "^26.6.2", + "jest-util": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-get-type": { + "version": "26.3.0", + "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-26.3.0.tgz", + "integrity": "sha512-TpfaviN1R2pQWkIihlfEanwOXK0zcxrKEE4MlU6Tn7keoXdN6/3gK/xl0yEh8DOunn5pOVGKf8hB4R9gVh04ig==", + "dev": true, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-haste-map": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-haste-map/-/jest-haste-map-26.6.2.tgz", + "integrity": "sha512-easWIJXIw71B2RdR8kgqpjQrbMRWQBgiBwXYEhtGUTaX+doCjBheluShdDMeR8IMfJiTqH4+zfhtg29apJf/8w==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "@types/graceful-fs": "^4.1.2", + "@types/node": "*", + "anymatch": "^3.0.3", + "fb-watchman": "^2.0.0", + "graceful-fs": "^4.2.4", + "jest-regex-util": "^26.0.0", + "jest-serializer": "^26.6.2", + "jest-util": "^26.6.2", + "jest-worker": "^26.6.2", + "micromatch": "^4.0.2", + "sane": "^4.0.3", + "walker": "^1.0.7" + }, + "engines": { + "node": ">= 10.14.2" + }, + "optionalDependencies": { + "fsevents": "^2.1.2" + } + }, + "node_modules/jest-jasmine2": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-jasmine2/-/jest-jasmine2-26.6.3.tgz", + "integrity": "sha512-kPKUrQtc8aYwBV7CqBg5pu+tmYXlvFlSFYn18ev4gPFtrRzB15N2gW/Roew3187q2w2eHuu0MU9TJz6w0/nPEg==", + "dev": true, + "dependencies": { + "@babel/traverse": "^7.1.0", + "@jest/environment": "^26.6.2", + "@jest/source-map": "^26.6.2", + "@jest/test-result": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "chalk": "^4.0.0", + "co": "^4.6.0", + "expect": "^26.6.2", + "is-generator-fn": "^2.0.0", + "jest-each": "^26.6.2", + "jest-matcher-utils": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-runtime": "^26.6.3", + "jest-snapshot": "^26.6.2", + "jest-util": "^26.6.2", + "pretty-format": "^26.6.2", + "throat": "^5.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-jasmine2/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-jasmine2/node_modules/chalk": { + "version": "4.1.2", + "resolved": 
"https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-jasmine2/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-jasmine2/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-jasmine2/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-jasmine2/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-leak-detector": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-leak-detector/-/jest-leak-detector-26.6.2.tgz", + "integrity": "sha512-i4xlXpsVSMeKvg2cEKdfhh0H39qlJlP5Ex1yQxwF9ubahboQYMgTtz5oML35AVA3B4Eu+YsmwaiKVev9KCvLxg==", + "dev": true, + "dependencies": { + "jest-get-type": "^26.3.0", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-matcher-utils": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-26.6.2.tgz", + "integrity": "sha512-llnc8vQgYcNqDrqRDXWwMr9i7rS5XFiCwvh6DTP7Jqa2mqpcCBBlpCbn+trkG0KNhPu/h8rzyBkriOtBstvWhw==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0", + "jest-diff": "^26.6.2", + "jest-get-type": "^26.3.0", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-matcher-utils/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-matcher-utils/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-matcher-utils/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-matcher-utils/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-matcher-utils/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-matcher-utils/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-message-util": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-message-util/-/jest-message-util-26.6.2.tgz", + "integrity": "sha512-rGiLePzQ3AzwUshu2+Rn+UMFk0pHN58sOG+IaJbk5Jxuqo3NYO1U2/MIR4S1sKgsoYSXSzdtSa0TgrmtUwEbmA==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "@jest/types": "^26.6.2", + "@types/stack-utils": "^2.0.0", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.4", + "micromatch": "^4.0.2", + "pretty-format": "^26.6.2", + "slash": "^3.0.0", + "stack-utils": "^2.0.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-message-util/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-message-util/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-message-util/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-message-util/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + 
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-message-util/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-message-util/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-mock": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-mock/-/jest-mock-26.6.2.tgz", + "integrity": "sha512-YyFjePHHp1LzpzYcmgqkJ0nm0gg/lJx2aZFzFy1S6eUqNjXsOqTK10zNRff2dNfssgokjkG65OlWNcIlgd3zew==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "@types/node": "*" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-pnp-resolver": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/jest-pnp-resolver/-/jest-pnp-resolver-1.2.3.tgz", + "integrity": "sha512-+3NpwQEnRoIBtx4fyhblQDPgJI0H1IEIkX7ShLUjPGA7TtUTvI1oiKi3SR4oBR0hQhQR80l4WAe5RrXBwWMA8w==", + "dev": true, + "engines": { + "node": ">=6" + }, + "peerDependencies": { + "jest-resolve": "*" + }, + "peerDependenciesMeta": { + "jest-resolve": { + "optional": true + } + } + }, + "node_modules/jest-regex-util": { + "version": "26.0.0", + "resolved": "https://registry.npmjs.org/jest-regex-util/-/jest-regex-util-26.0.0.tgz", + "integrity": "sha512-Gv3ZIs/nA48/Zvjrl34bf+oD76JHiGDUxNOVgUjh3j890sblXryjY4rss71fPtD/njchl6PSE2hIhvyWa1eT0A==", + "dev": true, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-resolve": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-resolve/-/jest-resolve-26.6.2.tgz", + "integrity": "sha512-sOxsZOq25mT1wRsfHcbtkInS+Ek7Q8jCHUB0ZUTP0tc/c41QHriU/NunqMfCUWsL4H3MHpvQD4QR9kSYhS7UvQ==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.4", + "jest-pnp-resolver": "^1.2.2", + "jest-util": "^26.6.2", + "read-pkg-up": "^7.0.1", + "resolve": "^1.18.1", + "slash": "^3.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-resolve-dependencies": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-resolve-dependencies/-/jest-resolve-dependencies-26.6.3.tgz", + "integrity": "sha512-pVwUjJkxbhe4RY8QEWzN3vns2kqyuldKpxlxJlzEYfKSvY6/bMvxoFrYYzUO1Gx28yKWN37qyV7rIoIp2h8fTg==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "jest-regex-util": "^26.0.0", + "jest-snapshot": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-resolve/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-resolve/node_modules/chalk": { + "version": "4.1.2", + 
"resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-resolve/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-resolve/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-resolve/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-resolve/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-runner": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-runner/-/jest-runner-26.6.3.tgz", + "integrity": "sha512-atgKpRHnaA2OvByG/HpGA4g6CSPS/1LK0jK3gATJAoptC1ojltpmVlYC3TYgdmGp+GLuhzpH30Gvs36szSL2JQ==", + "dev": true, + "dependencies": { + "@jest/console": "^26.6.2", + "@jest/environment": "^26.6.2", + "@jest/test-result": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "chalk": "^4.0.0", + "emittery": "^0.7.1", + "exit": "^0.1.2", + "graceful-fs": "^4.2.4", + "jest-config": "^26.6.3", + "jest-docblock": "^26.0.0", + "jest-haste-map": "^26.6.2", + "jest-leak-detector": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-resolve": "^26.6.2", + "jest-runtime": "^26.6.3", + "jest-util": "^26.6.2", + "jest-worker": "^26.6.2", + "source-map-support": "^0.5.6", + "throat": "^5.0.0" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-runner/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-runner/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-runner/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-runner/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-runner/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-runner/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-runtime": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-runtime/-/jest-runtime-26.6.3.tgz", + "integrity": "sha512-lrzyR3N8sacTAMeonbqpnSka1dHNux2uk0qqDXVkMv2c/A3wYnvQ4EXuI013Y6+gSKSCxdaczvf4HF0mVXHRdw==", + "dev": true, + "dependencies": { + "@jest/console": "^26.6.2", + "@jest/environment": "^26.6.2", + "@jest/fake-timers": "^26.6.2", + "@jest/globals": "^26.6.2", + "@jest/source-map": "^26.6.2", + "@jest/test-result": "^26.6.2", + "@jest/transform": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/yargs": "^15.0.0", + "chalk": "^4.0.0", + "cjs-module-lexer": "^0.6.0", + "collect-v8-coverage": "^1.0.0", + "exit": "^0.1.2", + "glob": "^7.1.3", + "graceful-fs": "^4.2.4", + "jest-config": "^26.6.3", + "jest-haste-map": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-mock": "^26.6.2", + "jest-regex-util": "^26.0.0", + "jest-resolve": "^26.6.2", + "jest-snapshot": "^26.6.2", + "jest-util": "^26.6.2", + "jest-validate": "^26.6.2", + "slash": "^3.0.0", + "strip-bom": "^4.0.0", + "yargs": "^15.4.1" + }, + "bin": { + "jest-runtime": "bin/jest-runtime.js" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-runtime/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-runtime/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + 
"node_modules/jest-runtime/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-runtime/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-runtime/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-runtime/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-serializer": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-serializer/-/jest-serializer-26.6.2.tgz", + "integrity": "sha512-S5wqyz0DXnNJPd/xfIzZ5Xnp1HrJWBczg8mMfMpN78OJ5eDxXyf+Ygld9wX1DnUWbIbhM1YDY95NjR4CBXkb2g==", + "dev": true, + "dependencies": { + "@types/node": "*", + "graceful-fs": "^4.2.4" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-snapshot": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-snapshot/-/jest-snapshot-26.6.2.tgz", + "integrity": "sha512-OLhxz05EzUtsAmOMzuupt1lHYXCNib0ECyuZ/PZOx9TrZcC8vL0x+DUG3TL+GLX3yHG45e6YGjIm0XwDc3q3og==", + "dev": true, + "dependencies": { + "@babel/types": "^7.0.0", + "@jest/types": "^26.6.2", + "@types/babel__traverse": "^7.0.4", + "@types/prettier": "^2.0.0", + "chalk": "^4.0.0", + "expect": "^26.6.2", + "graceful-fs": "^4.2.4", + "jest-diff": "^26.6.2", + "jest-get-type": "^26.3.0", + "jest-haste-map": "^26.6.2", + "jest-matcher-utils": "^26.6.2", + "jest-message-util": "^26.6.2", + "jest-resolve": "^26.6.2", + "natural-compare": "^1.4.0", + "pretty-format": "^26.6.2", + "semver": "^7.3.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-snapshot/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-snapshot/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + 
"node_modules/jest-snapshot/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-snapshot/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-snapshot/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-snapshot/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-util": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-util/-/jest-util-26.6.2.tgz", + "integrity": "sha512-MDW0fKfsn0OI7MS7Euz6h8HNDXVQ0gaM9uW6RjfDmd1DAFcaxX9OqIakHIqhbnmF08Cf2DLDG+ulq8YQQ0Lp0Q==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "@types/node": "*", + "chalk": "^4.0.0", + "graceful-fs": "^4.2.4", + "is-ci": "^2.0.0", + "micromatch": "^4.0.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-util/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-util/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-util/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-util/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-util/node_modules/has-flag": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-util/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-validate": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-validate/-/jest-validate-26.6.2.tgz", + "integrity": "sha512-NEYZ9Aeyj0i5rQqbq+tpIOom0YS1u2MVu6+euBsvpgIme+FOfRmoC4R5p0JiAUpaFvFy24xgrpMknarR/93XjQ==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "camelcase": "^6.0.0", + "chalk": "^4.0.0", + "jest-get-type": "^26.3.0", + "leven": "^3.1.0", + "pretty-format": "^26.6.2" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-validate/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-validate/node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/jest-validate/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-validate/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-validate/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-validate/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-validate/node_modules/supports-color": { + "version": "7.2.0", + "resolved": 
"https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-watcher": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-watcher/-/jest-watcher-26.6.2.tgz", + "integrity": "sha512-WKJob0P/Em2csiVthsI68p6aGKTIcsfjH9Gsx1f0A3Italz43e3ho0geSAVsmj09RWOELP1AZ/DXyJgOgDKxXQ==", + "dev": true, + "dependencies": { + "@jest/test-result": "^26.6.2", + "@jest/types": "^26.6.2", + "@types/node": "*", + "ansi-escapes": "^4.2.1", + "chalk": "^4.0.0", + "jest-util": "^26.6.2", + "string-length": "^4.0.1" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest-watcher/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest-watcher/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest-watcher/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest-watcher/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest-watcher/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-watcher/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-worker": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-26.6.2.tgz", + "integrity": "sha512-KWYVV1c4i+jbMpaBC+U++4Va0cp8OisU185o73T1vo99hqi7w8tSJfUXYswwqqrjzwxa6KpRK54WhPvwf5w6PQ==", + "dev": true, + "dependencies": { + "@types/node": "*", + "merge-stream": "^2.0.0", + "supports-color": "^7.0.0" + }, + "engines": { + "node": ">= 10.13.0" + } + }, + 
"node_modules/jest-worker/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest-worker/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/jest/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/jest/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/jest/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/jest/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jest/node_modules/jest-cli": { + "version": "26.6.3", + "resolved": "https://registry.npmjs.org/jest-cli/-/jest-cli-26.6.3.tgz", + "integrity": "sha512-GF9noBSa9t08pSyl3CY4frMrqp+aQXFGFkf5hEPbh/pIUFYWMK6ZLTfbmadxJVcJrdRoChlWQsA2VkJcDFK8hg==", + "dev": true, + "dependencies": { + "@jest/core": "^26.6.3", + "@jest/test-result": "^26.6.2", + "@jest/types": "^26.6.2", + "chalk": "^4.0.0", + "exit": "^0.1.2", + "graceful-fs": "^4.2.4", + "import-local": "^3.0.2", + "is-ci": "^2.0.0", + "jest-config": "^26.6.3", + "jest-util": "^26.6.2", + "jest-validate": "^26.6.2", + "prompts": "^2.0.1", + "yargs": "^15.4.1" + }, + "bin": { + "jest": "bin/jest.js" + }, + "engines": { + "node": ">= 10.14.2" + } + }, + "node_modules/jest/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": 
"sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "dev": true + }, + "node_modules/js-yaml": { + "version": "3.14.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.1.tgz", + "integrity": "sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==", + "dev": true, + "dependencies": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/jsdom": { + "version": "16.7.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz", + "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==", + "dev": true, + "dependencies": { + "abab": "^2.0.5", + "acorn": "^8.2.4", + "acorn-globals": "^6.0.0", + "cssom": "^0.4.4", + "cssstyle": "^2.3.0", + "data-urls": "^2.0.0", + "decimal.js": "^10.2.1", + "domexception": "^2.0.1", + "escodegen": "^2.0.0", + "form-data": "^3.0.0", + "html-encoding-sniffer": "^2.0.1", + "http-proxy-agent": "^4.0.1", + "https-proxy-agent": "^5.0.0", + "is-potential-custom-element-name": "^1.0.1", + "nwsapi": "^2.2.0", + "parse5": "6.0.1", + "saxes": "^5.0.1", + "symbol-tree": "^3.2.4", + "tough-cookie": "^4.0.0", + "w3c-hr-time": "^1.0.2", + "w3c-xmlserializer": "^2.0.0", + "webidl-conversions": "^6.1.0", + "whatwg-encoding": "^1.0.5", + "whatwg-mimetype": "^2.3.0", + "whatwg-url": "^8.5.0", + "ws": "^7.4.6", + "xml-name-validator": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "canvas": "^2.5.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/acorn": { + "version": "8.8.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.8.1.tgz", + "integrity": "sha512-7zFpHzhnqYKrkYdUjF1HI1bzd0VygEGX8lFk4k5zVMqHEoES+P+7TKI+EvLO9WVMJ8eekdO0aDEK044xTXwPPA==", + "dev": true, + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/jsesc": { + "version": "2.5.2", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz", + "integrity": "sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==", + "dev": true, + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/json-parse-even-better-errors": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", + "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", + "dev": true + }, + "node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true + }, + "node_modules/json-stable-stringify-without-jsonify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": 
"sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", + "dev": true + }, + "node_modules/json5": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.1.tgz", + "integrity": "sha512-1hqLFMSrGHRHxav9q9gNjJ5EXznIxGVO09xQRrwplcS8qs28pZ8s8hupZAmqDwZUmVZ2Qb2jnyPOWcDH8m8dlA==", + "dev": true, + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/jsonfile": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz", + "integrity": "sha512-m6F1R3z8jjlf2imQHS2Qez5sjKWQzbuuhuJ/FKYFRZvPE3PuHcSMVZzfsLhGVOkfd20obL5SWEBew5ShlquNxg==", + "dev": true, + "optionalDependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/kind-of": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", + "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/kleur": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/kleur/-/kleur-3.0.3.tgz", + "integrity": "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/leven": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", + "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/levn": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz", + "integrity": "sha512-0OO4y2iOHix2W6ujICbKIaEQXvFQHue65vUG3pb5EUomzPI90z9hsA1VsO/dbIIpC53J8gxM9Q4Oho0jrCM/yA==", + "dev": true, + "dependencies": { + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "dev": true + }, + "node_modules/locate-path": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-5.0.0.tgz", + "integrity": "sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g==", + "dev": true, + "dependencies": { + "p-locate": "^4.1.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/lodash": { + "version": "4.17.21", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", + "dev": true + }, + "node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/lunr": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", + "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", + "dev": true + }, + "node_modules/magic-string": { + "version": "0.25.9", + "resolved": 
"https://registry.npmjs.org/magic-string/-/magic-string-0.25.9.tgz", + "integrity": "sha512-RmF0AsMzgt25qzqqLc1+MbHmhdx0ojF2Fvs4XnOqz2ZOBXzzkEwc/dJQZCYHAn7v1jbVOjAZfK8msRn4BxO4VQ==", + "dev": true, + "dependencies": { + "sourcemap-codec": "^1.4.8" + } + }, + "node_modules/make-dir": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz", + "integrity": "sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==", + "dev": true, + "dependencies": { + "semver": "^6.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/make-dir/node_modules/semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==", + "dev": true, + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/makeerror": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/makeerror/-/makeerror-1.0.12.tgz", + "integrity": "sha512-JmqCvUhmt43madlpFzG4BQzG2Z3m6tvQDNKdClZnO3VbIudJYmxsT0FNJMeiB2+JTSlTQTSbU8QdesVmwJcmLg==", + "dev": true, + "dependencies": { + "tmpl": "1.0.5" + } + }, + "node_modules/map-cache": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz", + "integrity": "sha512-8y/eV9QQZCiyn1SprXSrCmqJN0yNRATe+PO8ztwqrvrbdRLA3eYJF0yaR0YayLWkMbsQSKWS9N2gPcGEc4UsZg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/map-visit": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", + "integrity": "sha512-4y7uGv8bd2WdM9vpQsiQNo41Ln1NvhvDRuVt0k2JZQ+ezN2uaQes7lZeZ+QQUHOLQAtDaBJ+7wCbi+ab/KFs+w==", + "dev": true, + "dependencies": { + "object-visit": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/marked": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-1.0.0.tgz", + "integrity": "sha512-Wo+L1pWTVibfrSr+TTtMuiMfNzmZWiOPeO7rZsQUY5bgsxpHesBEcIWJloWVTFnrMXnf/TL30eTFSGJddmQAng==", + "dev": true, + "bin": { + "marked": "bin/marked" + }, + "engines": { + "node": ">= 8.16.2" + } + }, + "node_modules/merge-stream": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", + "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", + "dev": true + }, + "node_modules/micromatch": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz", + "integrity": "sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA==", + "dev": true, + "dependencies": { + "braces": "^3.0.2", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dev": true, + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + 
"node": ">= 0.6" + } + }, + "node_modules/mimic-fn": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", + "integrity": "sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/minimatch": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", + "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", + "dev": true, + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/minimist": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.7.tgz", + "integrity": "sha512-bzfL1YUZsP41gmu/qjrEk0Q6i2ix/cVeAhbCbqH9u3zYutS1cLg00qhrD0M2MVdCcx4Sc0UpP2eBWo9rotpq6g==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mixin-deep": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/mixin-deep/-/mixin-deep-1.3.2.tgz", + "integrity": "sha512-WRoDn//mXBiJ1H40rqa3vH0toePwSsGb45iInWlTySa+Uu4k3tYUSxa2v1KqAiLtvlrSzaExqS1gtk96A9zvEA==", + "dev": true, + "dependencies": { + "for-in": "^1.0.2", + "is-extendable": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mixin-deep/node_modules/is-extendable": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", + "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", + "dev": true, + "dependencies": { + "is-plain-object": "^2.0.4" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mkdirp": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", + "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", + "dev": true, + "dependencies": { + "minimist": "^1.2.6" + }, + "bin": { + "mkdirp": "bin/cmd.js" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", + "dev": true + }, + "node_modules/mute-stream": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/mute-stream/-/mute-stream-0.0.8.tgz", + "integrity": "sha512-nnbWWOkoWyUsTjKrhgD0dcz22mdkSnpYqbEjIm2nhwhuxlSkpywJmBo8h0ZqJdkp73mb90SssHkN4rsRaBAfAA==", + "dev": true + }, + "node_modules/nanomatch": { + "version": "1.2.13", + "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz", + "integrity": "sha512-fpoe2T0RbHwBTBUOftAfBPaDEi06ufaUai0mE6Yn1kacc3SnTErfb/h+X94VXzI64rKFHYImXSvdwGGCmwOqCA==", + "dev": true, + "dependencies": { + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "fragment-cache": "^0.2.1", + "is-windows": "^1.0.2", + "kind-of": "^6.0.2", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/natural-compare": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", + "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", + "dev": true + }, + "node_modules/neo-async": { + 
"version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==", + "dev": true + }, + "node_modules/nice-try": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz", + "integrity": "sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ==", + "dev": true + }, + "node_modules/node-int64": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", + "integrity": "sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==", + "dev": true + }, + "node_modules/node-notifier": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/node-notifier/-/node-notifier-8.0.2.tgz", + "integrity": "sha512-oJP/9NAdd9+x2Q+rfphB2RJCHjod70RcRLjosiPMMu5gjIfwVnOUGq2nbTjTUbmy0DJ/tFIVT30+Qe3nzl4TJg==", + "dev": true, + "optional": true, + "dependencies": { + "growly": "^1.3.0", + "is-wsl": "^2.2.0", + "semver": "^7.3.2", + "shellwords": "^0.1.1", + "uuid": "^8.3.0", + "which": "^2.0.2" + } + }, + "node_modules/node-notifier/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "optional": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/node-releases": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.6.tgz", + "integrity": "sha512-PiVXnNuFm5+iYkLBNeq5211hvO38y63T0i2KKh2KnUs3RpzJ+JtODFjkD8yjLwnDkTYF1eKXheUwdssR+NRZdg==", + "dev": true + }, + "node_modules/normalize-package-data": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.5.0.tgz", + "integrity": "sha512-/5CMN3T0R4XTj4DcGaexo+roZSdSFW/0AOOTROrjxzCG1wrWXEsGbRKevjlIL+ZDE4sZlJr5ED4YW0yqmkK+eA==", + "dev": true, + "dependencies": { + "hosted-git-info": "^2.1.4", + "resolve": "^1.10.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "node_modules/normalize-package-data/node_modules/semver": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", + "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "dev": true, + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npm-run-path": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-2.0.2.tgz", + "integrity": "sha512-lJxZYlT4DW/bRUtFh1MQIWqmLwQfAxnqWG4HhEdjMlkrJYnJn0Jrr2u3mgxqaWsdiBc76TYkTG/mhrnYTuzfHw==", + "dev": true, + "dependencies": { + "path-key": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/nwsapi": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.2.tgz", + "integrity": 
"sha512-90yv+6538zuvUMnN+zCr8LuV6bPFdq50304114vJYJ8RDyK8D5O9Phpbd6SZWgI7PwzmmfN1upeOJlvybDSgCw==", + "dev": true + }, + "node_modules/object-copy": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/object-copy/-/object-copy-0.1.0.tgz", + "integrity": "sha512-79LYn6VAb63zgtmAteVOWo9Vdj71ZVBy3Pbse+VqxDpEP83XuujMrGqHIwAXJ5I/aM0zU7dIyIAhifVTPrNItQ==", + "dev": true, + "dependencies": { + "copy-descriptor": "^0.1.0", + "define-property": "^0.2.5", + "kind-of": "^3.0.3" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-copy/node_modules/define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==", + "dev": true, + "dependencies": { + "is-descriptor": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-copy/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-visit": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/object-visit/-/object-visit-1.0.1.tgz", + "integrity": "sha512-GBaMwwAVK9qbQN3Scdo0OyvgPW7l3lnaVMj84uTOZlswkX0KpF6fyDBJhtTthf7pymztoN36/KEr1DyhF96zEA==", + "dev": true, + "dependencies": { + "isobject": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object.pick": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz", + "integrity": "sha512-tqa/UMy/CCoYmj+H5qc07qvSL9dqcs/WZENZ1JbtWBlATP+iVOe778gE6MSijnyCnORzDuX6hU+LA4SZ09YjFQ==", + "dev": true, + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/onetime": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", + "integrity": "sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==", + "dev": true, + "dependencies": { + "mimic-fn": "^2.1.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/optionator": { + "version": "0.8.3", + "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz", + "integrity": "sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==", + "dev": true, + "dependencies": { + "deep-is": "~0.1.3", + "fast-levenshtein": "~2.0.6", + "levn": "~0.3.0", + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2", + "word-wrap": "~1.2.3" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/os-tmpdir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "integrity": "sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/p-each-series": { + "version": "2.2.0", 
+ "resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz", + "integrity": "sha512-ycIL2+1V32th+8scbpTvyHNaHe02z0sjgh91XXjAk+ZeXoPN4Z46DVUnzdso0aX4KckKw0FNNFHdjZ2UsZvxiA==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-locate": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-4.1.0.tgz", + "integrity": "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A==", + "dev": true, + "dependencies": { + "p-limit": "^2.2.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/p-try": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", + "dev": true, + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-json": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", + "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "error-ex": "^1.3.1", + "json-parse-even-better-errors": "^2.3.0", + "lines-and-columns": "^1.1.6" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/parse5": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz", + "integrity": "sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw==", + "dev": true + }, + "node_modules/pascalcase": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/pascalcase/-/pascalcase-0.1.1.tgz", + "integrity": "sha512-XHXfu/yOQRy9vYOtUDVMN60OEJjW013GoObG1o+xwQTpB9eYJX/BjXMsdW13ZDPruFhYYn0AG22w0xgQMwl3Nw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", 
+ "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-key": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz", + "integrity": "sha512-fEHGKCSmUSDPv4uoj8AlD+joPlq3peND+HRYyxFz4KPw4z926S/b8rIuFs2FYJg3BwsxJf6A9/3eIdLaYC+9Dw==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true + }, + "node_modules/picocolors": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", + "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==", + "dev": true + }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "dev": true, + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pirates": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.5.tgz", + "integrity": "sha512-8V9+HQPupnaXMA23c5hvl69zXvTwTzyAYasnkb0Tts4XvO4CliqONMOnvlq26rkhLC3nWDFBJf73LU1e1VZLaQ==", + "dev": true, + "engines": { + "node": ">= 6" + } + }, + "node_modules/pkg-dir": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-4.2.0.tgz", + "integrity": "sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==", + "dev": true, + "dependencies": { + "find-up": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/posix-character-classes": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/posix-character-classes/-/posix-character-classes-0.1.1.tgz", + "integrity": "sha512-xTgYBc3fuo7Yt7JbiuFxSYGToMoz8fLoE6TC9Wx1P/u+LfeThMOAqmuyECnlBaaJb+u1m9hHiXUEtwW4OzfUJg==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/prelude-ls": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz", + "integrity": "sha512-ESF23V4SKG6lVSGZgYNpbsiaAkdab6ZgOxe52p7+Kid3W3u3bxR4Vfd/o21dmN7jSt0IwgZ4v5MUd26FEtXE9w==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/pretty-format": { + "version": "26.6.2", + "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-26.6.2.tgz", + "integrity": "sha512-7AeGuCYNGmycyQbCqd/3PWH4eOoX/OiCa0uphp57NVTeAGdJGaAliecxwBDHYQCIvrW7aDBZCYeNTP/WX69mkg==", + "dev": true, + "dependencies": { + "@jest/types": "^26.6.2", + "ansi-regex": "^5.0.0", + "ansi-styles": "^4.0.0", + "react-is": "^17.0.1" + }, + "engines": { + "node": ">= 10" + } + }, + "node_modules/pretty-format/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + 
"node_modules/pretty-format/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/pretty-format/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/progress": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/prompts": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", + "integrity": "sha512-NxNv/kLguCA7p3jE8oL2aEBsrJWgAakBpgmgK6lpPWV+WuOmY6r2/zbAVnP+T8bQlA0nzHXSJSJW0Hq7ylaD2Q==", + "dev": true, + "dependencies": { + "kleur": "^3.0.3", + "sisteransi": "^1.0.5" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/psl": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz", + "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==", + "dev": true + }, + "node_modules/pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/punycode": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", + "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/querystringify": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz", + "integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==", + "dev": true + }, + "node_modules/react-is": { + "version": "17.0.2", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-17.0.2.tgz", + "integrity": "sha512-w2GsyukL62IJnlaff/nRegPQR94C/XXamvMWmSHRJ4y7Ts/4ocGRmTHvOs8PSE6pB3dWOrD/nueuU5sduBsQ4w==", + "dev": true + }, + "node_modules/read-pkg": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-5.2.0.tgz", + "integrity": "sha512-Ug69mNOpfvKDAc2Q8DRpMjjzdtrnv9HcSMX+4VsZxD1aZ6ZzrIE7rlzXBtWTyhULSMKg076AW6WR5iZpD0JiOg==", + "dev": true, + "dependencies": { + "@types/normalize-package-data": "^2.4.0", + "normalize-package-data": "^2.5.0", + "parse-json": "^5.0.0", + "type-fest": "^0.6.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/read-pkg-up": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-7.0.1.tgz", + "integrity": "sha512-zK0TB7Xd6JpCLmlLmufqykGE+/TlOePD6qKClNW7hHDKFh/J7/7gCWGR7joEQEW1bKq3a3yUZSObOoWLFQ4ohg==", + "dev": true, + "dependencies": { + "find-up": "^4.1.0", + "read-pkg": "^5.2.0", + "type-fest": "^0.8.1" + }, + 
"engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/read-pkg/node_modules/type-fest": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.6.0.tgz", + "integrity": "sha512-q+MB8nYR1KDLrgr4G5yemftpMC7/QLqVndBmEEdqzmNj5dcFOO4Oo8qlwZE3ULT3+Zim1F8Kq4cBnikNhlCMlg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/rechoir": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.6.2.tgz", + "integrity": "sha512-HFM8rkZ+i3zrV+4LQjwQ0W+ez98pApMGM3HUrN04j3CqzPOzl9nmP15Y8YXNm8QHGv/eacOVEjqhmWpkRV0NAw==", + "dev": true, + "dependencies": { + "resolve": "^1.1.6" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/regex-not": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz", + "integrity": "sha512-J6SDjUgDxQj5NusnOtdFxDwN/+HWykR8GELwctJ7mdqhcyy1xEc4SRFHUXvxTp661YaVKAjfRLZ9cCqS6tn32A==", + "dev": true, + "dependencies": { + "extend-shallow": "^3.0.2", + "safe-regex": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/regexpp": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-3.2.0.tgz", + "integrity": "sha512-pq2bWo9mVD43nbts2wGv17XLiNLya+GklZ8kaDLV2Z08gDCsGpnKn9BFMepvWuHCbyVvY7J5o5+BVvoQbmlJLg==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/mysticatea" + } + }, + "node_modules/remove-trailing-separator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/remove-trailing-separator/-/remove-trailing-separator-1.1.0.tgz", + "integrity": "sha512-/hS+Y0u3aOfIETiaiirUFwDBDzmXPvO+jAfKTitUngIPzdKc6Z0LoFjM/CK5PL4C+eKwHohlHAb6H0VFfmmUsw==", + "dev": true + }, + "node_modules/repeat-element": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.4.tgz", + "integrity": "sha512-LFiNfRcSu7KK3evMyYOuCzv3L10TW7yC1G2/+StMjK8Y6Vqd2MG7r/Qjw4ghtuCOjFvlnms/iMmLqpvW/ES/WQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/repeat-string": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", + "integrity": "sha512-PV0dzCYDNfRi1jCDbJzpW7jNNDRuCOG/jI5ctQcGKt/clZD+YcPS3yIlWuTJMmESC8aevCFmWJy5wjAFgNqN6w==", + "dev": true, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/require-main-filename": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz", + "integrity": "sha512-NKN5kMDylKuldxYLSUfrbo5Tuzh4hd+2E8NPPX02mZtn1VuREQToYe/ZdlJy+J3uCpfaiGF05e7B8W0iXbQHmg==", + "dev": true + }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==", + "dev": true + }, + "node_modules/resolve": { + "version": "1.22.1", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.1.tgz", + "integrity": 
"sha512-nBpuuYuY5jFsli/JIs1oldw6fOQCBioohqWZg/2hiaOybXOft4lonv85uDOKXdf8rhyK159cxU5cDcK/NKk8zw==", + "dev": true, + "dependencies": { + "is-core-module": "^2.9.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/resolve-cwd": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz", + "integrity": "sha512-OrZaX2Mb+rJCpH/6CpSqt9xFVpN++x01XnN2ie9g6P5/3xelLAkXWVADpdz1IHD/KFfEXyE6V0U01OQ3UO2rEg==", + "dev": true, + "dependencies": { + "resolve-from": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/resolve-cwd/node_modules/resolve-from": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", + "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/resolve-url": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz", + "integrity": "sha512-ZuF55hVUQaaczgOIwqWzkEcEidmlD/xl44x1UZnhOXcYuFN2S6+rcxpG+C1N3So0wvNI3DmJICUFfu2SxhBmvg==", + "deprecated": "https://github.com/lydell/resolve-url#deprecated", + "dev": true + }, + "node_modules/restore-cursor": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-3.1.0.tgz", + "integrity": "sha512-l+sSefzHpj5qimhFSE5a8nufZYAM3sBSVMAPtYkmC+4EH2anSGaEMXSD0izRQbu9nfyQ9y5JrVmp7E8oZrUjvA==", + "dev": true, + "dependencies": { + "onetime": "^5.1.0", + "signal-exit": "^3.0.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/ret": { + "version": "0.1.15", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz", + "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==", + "dev": true, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/rimraf": { + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", + "integrity": "sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==", + "dev": true, + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + } + }, + "node_modules/rollup": { + "version": "2.79.1", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-2.79.1.tgz", + "integrity": "sha512-uKxbd0IhMZOhjAiD5oAFp7BqvkA4Dv47qpOCtaNvng4HBwdbWtdOh8f5nZNuk2rp51PMGk3bzfWu5oayNEuYnw==", + "dev": true, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=10.0.0" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/rollup-plugin-typescript2": { + "version": "0.27.3", + "resolved": "https://registry.npmjs.org/rollup-plugin-typescript2/-/rollup-plugin-typescript2-0.27.3.tgz", + "integrity": "sha512-gmYPIFmALj9D3Ga1ZbTZAKTXq1JKlTQBtj299DXhqYz9cL3g/AQfUvbb2UhH+Nf++cCq941W2Mv7UcrcgLzJJg==", + "dev": true, + "dependencies": { + "@rollup/pluginutils": "^3.1.0", + "find-cache-dir": "^3.3.1", + "fs-extra": "8.1.0", + "resolve": "1.17.0", + "tslib": "2.0.1" + }, + 
"peerDependencies": { + "rollup": ">=1.26.3", + "typescript": ">=2.4.0" + } + }, + "node_modules/rollup-plugin-typescript2/node_modules/resolve": { + "version": "1.17.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.17.0.tgz", + "integrity": "sha512-ic+7JYiV8Vi2yzQGFWOkiZD5Z9z7O2Zhm9XMaTxdJExKasieFCr+yXZ/WmXsckHiKl12ar0y6XiXDx3m4RHn1w==", + "dev": true, + "dependencies": { + "path-parse": "^1.0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/rollup-plugin-typescript2/node_modules/tslib": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.0.1.tgz", + "integrity": "sha512-SgIkNheinmEBgx1IUNirK0TUD4X9yjjBRTqqjggWCU3pUEqIk3/Uwl3yRixYKT6WjQuGiwDv4NomL3wqRCj+CQ==", + "dev": true + }, + "node_modules/rsvp": { + "version": "4.8.5", + "resolved": "https://registry.npmjs.org/rsvp/-/rsvp-4.8.5.tgz", + "integrity": "sha512-nfMOlASu9OnRJo1mbEk2cz0D56a1MBNrJ7orjRZQG10XDyuvwksKbuXNp6qa+kbn839HwjwhBzhFmdsaEAfauA==", + "dev": true, + "engines": { + "node": "6.* || >= 7.*" + } + }, + "node_modules/run-async": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/run-async/-/run-async-2.4.1.tgz", + "integrity": "sha512-tvVnVv01b8c1RrA6Ep7JkStj85Guv/YrMcwqYQnwjsAS2cTmmPGBBjAjpCW7RrSodNSoE2/qg9O4bceNvUuDgQ==", + "dev": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/rxjs": { + "version": "6.6.7", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.6.7.tgz", + "integrity": "sha512-hTdwr+7yYNIT5n4AMYp85KA6yw2Va0FLa3Rguvbpa4W3I5xynaBZo41cM3XM+4Q6fRMj3sBYIR1VAmZMXYJvRQ==", + "dev": true, + "dependencies": { + "tslib": "^1.9.0" + }, + "engines": { + "npm": ">=2.0.0" + } + }, + "node_modules/safe-regex": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", + "integrity": "sha512-aJXcif4xnaNUzvUuC5gcb46oTS7zvg4jpMTnuqtrEPlR3vFr4pxtdTwaF1Qs3Enjn9HK+ZlwQui+a7z0SywIzg==", + "dev": true, + "dependencies": { + "ret": "~0.1.10" + } + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "dev": true + }, + "node_modules/sane": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/sane/-/sane-4.1.0.tgz", + "integrity": "sha512-hhbzAgTIX8O7SHfp2c8/kREfEn4qO/9q8C9beyY6+tvZ87EpoZ3i1RIEvp27YBswnNbY9mWd6paKVmKbAgLfZA==", + "deprecated": "some dependency vulnerabilities fixed, support for node < 10 dropped, and newer ECMAScript syntax/features added", + "dev": true, + "dependencies": { + "@cnakazawa/watch": "^1.0.3", + "anymatch": "^2.0.0", + "capture-exit": "^2.0.0", + "exec-sh": "^0.3.2", + "execa": "^1.0.0", + "fb-watchman": "^2.0.0", + "micromatch": "^3.1.4", + "minimist": "^1.1.1", + "walker": "~1.0.5" + }, + "bin": { + "sane": "src/cli.js" + }, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/sane/node_modules/anymatch": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", + "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", + "dev": true, + "dependencies": { + "micromatch": "^3.1.4", + "normalize-path": "^2.1.1" + } + }, + "node_modules/sane/node_modules/braces": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", + "integrity": 
"sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", + "dev": true, + "dependencies": { + "arr-flatten": "^1.1.0", + "array-unique": "^0.3.2", + "extend-shallow": "^2.0.1", + "fill-range": "^4.0.0", + "isobject": "^3.0.1", + "repeat-element": "^1.1.2", + "snapdragon": "^0.8.1", + "snapdragon-node": "^2.0.1", + "split-string": "^3.0.2", + "to-regex": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/braces/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/fill-range": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", + "integrity": "sha512-VcpLTWqWDiTerugjj8e3+esbg+skS3M9e54UuR3iCeIDMXCLTsAH8hTSzDQU/X6/6t3eYkOKoZSef2PlU6U1XQ==", + "dev": true, + "dependencies": { + "extend-shallow": "^2.0.1", + "is-number": "^3.0.0", + "repeat-string": "^1.6.1", + "to-regex-range": "^2.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/fill-range/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/is-number": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha512-4cboCqIpliH+mAvFNegjZQ4kgKc3ZUhQVr3HvWbSh5q3WH2v82ct+T2Y1hdU5Gdtorx/cLifQjqCbL7bpznLTg==", + "dev": true, + "dependencies": { + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/is-number/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/micromatch": { + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", + "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", + "dev": true, + "dependencies": { + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "braces": "^2.3.1", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "extglob": "^2.0.4", + "fragment-cache": "^0.2.1", + "kind-of": "^6.0.2", + "nanomatch": "^1.2.9", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/normalize-path": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz", + "integrity": "sha512-3pKJwH184Xo/lnH6oyP1q2pMd7HcypqqmRs91/6/i2CGtWwIKGCkOOMTm/zXbgTEWHw1uNpNi/igc3ePOYHb6w==", + "dev": true, + "dependencies": { + "remove-trailing-separator": "^1.0.1" + }, + "engines": { + 
"node": ">=0.10.0" + } + }, + "node_modules/sane/node_modules/to-regex-range": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-2.1.1.tgz", + "integrity": "sha512-ZZWNfCjUokXXDGXFpZehJIkZqq91BcULFq/Pi7M5i4JnxXdhMKAK682z8bCW3o8Hj1wuuzoKcW3DfVzaP6VuNg==", + "dev": true, + "dependencies": { + "is-number": "^3.0.0", + "repeat-string": "^1.6.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/saxes": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz", + "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==", + "dev": true, + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/semver": { + "version": "7.3.8", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", + "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/set-blocking": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", + "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==", + "dev": true + }, + "node_modules/set-value": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.1.tgz", + "integrity": "sha512-JxHc1weCN68wRY0fhCoXpyK55m/XPHafOmK4UWD7m2CI14GMcFypt4w/0+NV5f/ZMby2F6S2wwA7fgynh9gWSw==", + "dev": true, + "dependencies": { + "extend-shallow": "^2.0.1", + "is-extendable": "^0.1.1", + "is-plain-object": "^2.0.3", + "split-string": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/set-value/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shebang-command": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", + "integrity": "sha512-EV3L1+UQWGor21OmnvojK36mhg+TyIKDh3iFBKBohr5xeXIhNBcx8oWdgkTEEQ+BEFFYdLRuqMfd5L84N1V5Vg==", + "dev": true, + "dependencies": { + "shebang-regex": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shebang-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz", + "integrity": "sha512-wpoSFAxys6b2a2wHZ1XpDSgD7N9iVjg29Ph9uV/uaP9Ex/KXlkTZTeddxDPSYQpgvzKLGJke2UU0AzoGCjNIvQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shelljs": { + "version": "0.8.5", + "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.5.tgz", + "integrity": "sha512-TiwcRcrkhHvbrZbnRcFYMLl30Dfov3HKqzp5tO5b4pt6G/SezKcYhmDg15zXVBswHmctSAQKznqNW2LO5tTDow==", + "dev": true, + "dependencies": { + "glob": "^7.0.0", + "interpret": "^1.0.0", + "rechoir": "^0.6.2" + }, + "bin": { + "shjs": "bin/shjs" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/shellwords": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/shellwords/-/shellwords-0.1.1.tgz", + "integrity": 
"sha512-vFwSUfQvqybiICwZY5+DAWIPLKsWO31Q91JSKl3UYv+K5c2QRPzn0qzec6QPu1Qc9eHYItiP3NdJqNVqetYAww==", + "dev": true, + "optional": true + }, + "node_modules/signal-exit": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", + "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==", + "dev": true + }, + "node_modules/sisteransi": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/sisteransi/-/sisteransi-1.0.5.tgz", + "integrity": "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==", + "dev": true + }, + "node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/slice-ansi": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-2.1.0.tgz", + "integrity": "sha512-Qu+VC3EwYLldKa1fCxuuvULvSJOKEgk9pi8dZeCVK7TqBfUNTH4sFkk4joj8afVSfAYgJoSOetjx9QWOJ5mYoQ==", + "dev": true, + "dependencies": { + "ansi-styles": "^3.2.0", + "astral-regex": "^1.0.0", + "is-fullwidth-code-point": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/slice-ansi/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/snapdragon": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/snapdragon/-/snapdragon-0.8.2.tgz", + "integrity": "sha512-FtyOnWN/wCHTVXOMwvSv26d+ko5vWlIDD6zoUJ7LW8vh+ZBC8QdljveRP+crNrtBwioEUWy/4dMtbBjA4ioNlg==", + "dev": true, + "dependencies": { + "base": "^0.11.1", + "debug": "^2.2.0", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "map-cache": "^0.2.2", + "source-map": "^0.5.6", + "source-map-resolve": "^0.5.0", + "use": "^3.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-node": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/snapdragon-node/-/snapdragon-node-2.1.1.tgz", + "integrity": "sha512-O27l4xaMYt/RSQ5TR3vpWCAB5Kb/czIcqUFOM/C4fYcLnbZUc1PkjTAMjof2pBWaSTwOUd6qUHcFGVGj7aIwnw==", + "dev": true, + "dependencies": { + "define-property": "^1.0.0", + "isobject": "^3.0.0", + "snapdragon-util": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-node/node_modules/define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha512-cZTYKFWspt9jZsMscWo8sc/5lbPC9Q0N5nBLgb+Yd915iL3udB1uFgS3B8YCx66UVHq018DAVFoee7x+gxggeA==", + "dev": true, + "dependencies": { + "is-descriptor": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-node/node_modules/is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + 
"node_modules/snapdragon-node/node_modules/is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "dependencies": { + "kind-of": "^6.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-node/node_modules/is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "dependencies": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-util": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/snapdragon-util/-/snapdragon-util-3.0.1.tgz", + "integrity": "sha512-mbKkMdQKsjX4BAL4bRYTj21edOf8cN7XHdYUJEe+Zn99hVEYcMvKPct1IqNe7+AZPirn8BCDOQBHQZknqmKlZQ==", + "dev": true, + "dependencies": { + "kind-of": "^3.2.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon-util/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/snapdragon/node_modules/define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==", + "dev": true, + "dependencies": { + "is-descriptor": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon/node_modules/extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha512-zCnTtlxNoAiDc3gqY2aYAWFx7XWWiasuF2K8Me5WbN8otHKTUKBwjPtNpRs/rbUZm7KxWAaNj7P1a/p52GbVug==", + "dev": true, + "dependencies": { + "is-extendable": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/snapdragon/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "dev": true + }, + "node_modules/snapdragon/node_modules/source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + 
"dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map-resolve": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.3.tgz", + "integrity": "sha512-Htz+RnsXWk5+P2slx5Jh3Q66vhQj1Cllm0zvnaY98+NFx+Dv2CF/f5O/t8x+KaNdrdIAsruNzoh/KpialbqAnw==", + "deprecated": "See https://github.com/lydell/source-map-resolve#deprecated", + "dev": true, + "dependencies": { + "atob": "^2.1.2", + "decode-uri-component": "^0.2.0", + "resolve-url": "^0.2.1", + "source-map-url": "^0.4.0", + "urix": "^0.1.0" + } + }, + "node_modules/source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, + "node_modules/source-map-url": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/source-map-url/-/source-map-url-0.4.1.tgz", + "integrity": "sha512-cPiFOTLUKvJFIg4SKVScy4ilPPW6rFgMgfuZJPNoDuMs3nC1HbMUycBoJw77xFIp6z1UJQJOfx6C9GMH80DiTw==", + "deprecated": "See https://github.com/lydell/source-map-url#deprecated", + "dev": true + }, + "node_modules/sourcemap-codec": { + "version": "1.4.8", + "resolved": "https://registry.npmjs.org/sourcemap-codec/-/sourcemap-codec-1.4.8.tgz", + "integrity": "sha512-9NykojV5Uih4lgo5So5dtw+f0JgJX30KCNI8gwhz2J9A15wD0Ml6tjHKwf6fTSa6fAdVBdZeNOs9eJ71qCk8vA==", + "deprecated": "Please use @jridgewell/sourcemap-codec instead", + "dev": true + }, + "node_modules/spdx-correct": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", + "integrity": "sha512-cOYcUWwhCuHCXi49RhFRCyJEK3iPj1Ziz9DpViV3tbZOwXD49QzIN3MpOLJNxh2qwq2lJJZaKMVw9qNi4jTC0w==", + "dev": true, + "dependencies": { + "spdx-expression-parse": "^3.0.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-exceptions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.3.0.tgz", + "integrity": "sha512-/tTrYOC7PPI1nUAgx34hUpqXuyJG+DTHJTnIULG4rDygi4xu/tfgmq1e1cIRwRzwZgo4NLySi+ricLkZkw4i5A==", + "dev": true + }, + "node_modules/spdx-expression-parse": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.1.tgz", + "integrity": "sha512-cbqHunsQWnJNE6KhVSMsMeH5H/L9EpymbzqTQ3uLwNCLZ1Q481oWaofqH7nO6V07xlXwY6PhQdQ2IedWx/ZK4Q==", + "dev": true, + "dependencies": { + "spdx-exceptions": "^2.1.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-license-ids": { + "version": "3.0.12", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.12.tgz", + "integrity": "sha512-rr+VVSXtRhO4OHbXUiAF7xW3Bo9DuuF6C5jH+q/x15j2jniycgKbxU09Hr0WqlSLUs4i4ltHGXqTe7VHclYWyA==", + "dev": true + }, + "node_modules/split-string": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/split-string/-/split-string-3.1.0.tgz", + "integrity": "sha512-NzNVhJDYpwceVVii8/Hu6DKfD2G+NrQHlS/V/qgv763EYudVwEcMQNxd2lh+0VrUByXN/oJkl5grOhYWvQUYiw==", + "dev": true, + "dependencies": { + "extend-shallow": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": 
"sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", + "dev": true + }, + "node_modules/stack-utils": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.6.tgz", + "integrity": "sha512-XlkWvfIm6RmsWtNJx+uqtKLS8eqFbxUg0ZzLXqY0caEy9l7hruX8IpiDnjsLavoBgqCCR71TqWO8MaXYheJ3RQ==", + "dev": true, + "dependencies": { + "escape-string-regexp": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/stack-utils/node_modules/escape-string-regexp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-2.0.0.tgz", + "integrity": "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/static-extend": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz", + "integrity": "sha512-72E9+uLc27Mt718pMHt9VMNiAL4LMsmDbBva8mxWUCkT07fSzEGMYUCk0XWY6lp0j6RBAG4cJ3mWuZv2OE3s0g==", + "dev": true, + "dependencies": { + "define-property": "^0.2.5", + "object-copy": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/static-extend/node_modules/define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha512-Rr7ADjQZenceVOAKop6ALkkRAmH1A4Gx9hV/7ZujPUN2rkATqFO0JZLZInbAjpZYoJ1gUx8MRMQVkYemcbMSTA==", + "dev": true, + "dependencies": { + "is-descriptor": "^0.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/string-length": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", + "integrity": "sha512-+l6rNN5fYHNhZZy41RXsYptCjA2Igmq4EG7kZAYFQI1E1VTXarr6ZPXBg6eq7Y6eK4FEhY6AJlyuFIb/v/S0VQ==", + "dev": true, + "dependencies": { + "char-regex": "^1.0.2", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/string-length/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/string-width/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", + "integrity": "sha512-DuRs1gKbBqsMKIZlrffwlug8MHkcnpjs5VPmL1PAh+mA30U0DTotfDZ0d2UUsXpPmPmMMJ6W773MaA3J+lbiWA==", + "dev": true, + "dependencies": { + "ansi-regex": "^4.1.0" + }, + "engines": { + "node": ">=6" + } + }, + 
"node_modules/strip-ansi/node_modules/ansi-regex": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.1.tgz", + "integrity": "sha512-ILlv4k/3f6vfQ4OoP2AGvirOktlQ98ZEL1k9FaQjxa3L1abBgbuTDAdPOpvbGncC0BTVQrl+OM8xZGK6tWXt7g==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/strip-bom": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz", + "integrity": "sha512-3xurFv5tEgii33Zi8Jtp55wEIILR9eh34FAW00PZf+JnSsTmV/ioewSgQl97JHvgjoRGwPShsWm+IdrxB35d0w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-eof": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "integrity": "sha512-7FCwGGmx8mD5xQd3RPUvnSpUXHM3BWuzjtpD4TXsfcZ9EL4azvVVUscFYwD9nx8Kh+uCBC00XBtAykoMHwTh8Q==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/strip-final-newline": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz", + "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/supports-color": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", + "integrity": "sha512-QjVjwdXIt408MIiAqCX4oUKsgU2EqAGzs2Ppkm4aQYbjm+ZEWEcW4SfFNTr4uMNZma0ey4f5lgLrkB0aX0QMow==", + "dev": true, + "dependencies": { + "has-flag": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/supports-hyperlinks": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/supports-hyperlinks/-/supports-hyperlinks-2.3.0.tgz", + "integrity": "sha512-RpsAZlpWcDwOPQA22aCH4J0t7L8JmAvsCxfOSEwm7cQs3LshN36QaTkwd70DnBOXDWGssw2eUoc8CaRWT0XunA==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0", + "supports-color": "^7.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/supports-hyperlinks/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/supports-hyperlinks/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" 
+ } + }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "dev": true + }, + "node_modules/table": { + "version": "5.4.6", + "resolved": "https://registry.npmjs.org/table/-/table-5.4.6.tgz", + "integrity": "sha512-wmEc8m4fjnob4gt5riFRtTu/6+4rSe12TpAELNSqHMfF3IqnA+CH37USM6/YR3qRZv7e56kAEAtd6nKZaxe0Ug==", + "dev": true, + "dependencies": { + "ajv": "^6.10.2", + "lodash": "^4.17.14", + "slice-ansi": "^2.1.0", + "string-width": "^3.0.0" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/table/node_modules/emoji-regex": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", + "integrity": "sha512-CwBLREIQ7LvYFB0WyRvwhq5N5qPhc6PMjD6bYggFlI5YyDgl+0vxq5VHbMOFqLg7hfWzmu8T5Z1QofhmTIhItA==", + "dev": true + }, + "node_modules/table/node_modules/is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha512-VHskAKYM8RfSFXwee5t5cbN5PZeq1Wrh6qd5bkyiXIf6UQcN6w/A0eXM9r6t8d+GYOh+o6ZhiEnb88LN/Y8m2w==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/table/node_modules/string-width": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", + "integrity": "sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==", + "dev": true, + "dependencies": { + "emoji-regex": "^7.0.1", + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^5.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/terminal-link": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/terminal-link/-/terminal-link-2.1.1.tgz", + "integrity": "sha512-un0FmiRUQNr5PJqy9kP7c40F5BOfpGlYTrxonDChEZB7pzZxRNp/bt+ymiy9/npwXya9KH99nJ/GXFIiUkYGFQ==", + "dev": true, + "dependencies": { + "ansi-escapes": "^4.2.1", + "supports-hyperlinks": "^2.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/test-exclude": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/test-exclude/-/test-exclude-6.0.0.tgz", + "integrity": "sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w==", + "dev": true, + "dependencies": { + "@istanbuljs/schema": "^0.1.2", + "glob": "^7.1.4", + "minimatch": "^3.0.4" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/text-table": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", + "integrity": "sha512-N+8UisAXDGk8PFXP4HAzVR9nbfmVJ3zYLAWiTIoqC5v5isinhr+r5uaO8+7r3BMfuNIufIsA7RdpVgacC2cSpw==", + "dev": true + }, + "node_modules/throat": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/throat/-/throat-5.0.0.tgz", + "integrity": "sha512-fcwX4mndzpLQKBS1DVYhGAcYaYt7vsHNIvQV+WXMvnow5cgjPphq5CaayLaGsjRdSCKZFNGt7/GYAuXaNOiYCA==", + "dev": true + }, + "node_modules/through": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", + "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==", + "dev": true + }, + "node_modules/tmp": { + "version": "0.0.33", + "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", + "integrity": 
"sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==", + "dev": true, + "dependencies": { + "os-tmpdir": "~1.0.2" + }, + "engines": { + "node": ">=0.6.0" + } + }, + "node_modules/tmpl": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", + "integrity": "sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==", + "dev": true + }, + "node_modules/to-fast-properties": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", + "integrity": "sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/to-object-path": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/to-object-path/-/to-object-path-0.3.0.tgz", + "integrity": "sha512-9mWHdnGRuh3onocaHzukyvCZhzvr6tiflAy/JRFXcJX0TjgfWA9pk9t8CMbzmBE4Jfw58pXbkngtBtqYxzNEyg==", + "dev": true, + "dependencies": { + "kind-of": "^3.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/to-object-path/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dev": true, + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/to-regex": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/to-regex/-/to-regex-3.0.2.tgz", + "integrity": "sha512-FWtleNAtZ/Ki2qtqej2CXTOayOH9bHDQF+Q48VpWyDXjbYxA4Yz8iDB31zXOBUlOHHKidDbqGVrTUvQMPmBGBw==", + "dev": true, + "dependencies": { + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "regex-not": "^1.0.2", + "safe-regex": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/tough-cookie": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.2.tgz", + "integrity": "sha512-G9fqXWoYFZgTc2z8Q5zaHy/vJMjm+WV0AkAeHxVCQiEB1b+dGvWzFW6QV07cY5jQ5gRkeid2qIkzkxUnmoQZUQ==", + "dev": true, + "dependencies": { + "psl": "^1.1.33", + "punycode": "^2.1.1", + "universalify": "^0.2.0", + "url-parse": "^1.5.3" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tr46": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-2.1.0.tgz", + "integrity": "sha512-15Ih7phfcdP5YxqiB+iDtLoaTz4Nd35+IiAv0kQ5FNKHzXgdWqPoTIqEDDJmXceQt4JZk6lVPT8lnDlPpGDppw==", + "dev": true, + "dependencies": { + "punycode": "^2.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "dev": true + }, + "node_modules/tsutils": { + "version": "3.21.0", + "resolved": "https://registry.npmjs.org/tsutils/-/tsutils-3.21.0.tgz", + "integrity": 
"sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA==", + "dev": true, + "dependencies": { + "tslib": "^1.8.1" + }, + "engines": { + "node": ">= 6" + }, + "peerDependencies": { + "typescript": ">=2.8.0 || >= 3.2.0-dev || >= 3.3.0-dev || >= 3.4.0-dev || >= 3.5.0-dev || >= 3.6.0-dev || >= 3.6.0-beta || >= 3.7.0-dev || >= 3.7.0-beta" + } + }, + "node_modules/type-check": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz", + "integrity": "sha512-ZCmOJdvOWDBYJlzAoFkC+Q0+bUyEOS1ltgp1MGU03fqHG+dbi9tBFU2Rd9QKiDZFAYrhPh2JUf7rZRIuHRKtOg==", + "dev": true, + "dependencies": { + "prelude-ls": "~1.1.2" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/type-detect": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/type-detect/-/type-detect-4.0.8.tgz", + "integrity": "sha512-0fr/mIH1dlO+x7TlcMy+bIDqKPsw/70tVyeHW787goQjhmqaZe10uwLujubK9q9Lg6Fiho1KUKDYz0Z7k7g5/g==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/type-fest": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz", + "integrity": "sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/typedarray-to-buffer": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/typedarray-to-buffer/-/typedarray-to-buffer-3.1.5.tgz", + "integrity": "sha512-zdu8XMNEDepKKR+XYOXAVPtWui0ly0NtohUscw+UmaHiAWT8hrV1rr//H6V+0DvJ3OQ19S979M0laLfX8rm82Q==", + "dev": true, + "dependencies": { + "is-typedarray": "^1.0.0" + } + }, + "node_modules/typedoc": { + "version": "0.17.8", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.17.8.tgz", + "integrity": "sha512-/OyrHCJ8jtzu+QZ+771YaxQ9s4g5Z3XsQE3Ma7q+BL392xxBn4UMvvCdVnqKC2T/dz03/VXSLVKOP3lHmDdc/w==", + "dev": true, + "dependencies": { + "fs-extra": "^8.1.0", + "handlebars": "^4.7.6", + "highlight.js": "^10.0.0", + "lodash": "^4.17.15", + "lunr": "^2.3.8", + "marked": "1.0.0", + "minimatch": "^3.0.0", + "progress": "^2.0.3", + "shelljs": "^0.8.4", + "typedoc-default-themes": "^0.10.2" + }, + "bin": { + "typedoc": "bin/typedoc" + }, + "engines": { + "node": ">= 8.0.0" + }, + "peerDependencies": { + "typescript": ">=3.8.3" + } + }, + "node_modules/typedoc-default-themes": { + "version": "0.10.2", + "resolved": "https://registry.npmjs.org/typedoc-default-themes/-/typedoc-default-themes-0.10.2.tgz", + "integrity": "sha512-zo09yRj+xwLFE3hyhJeVHWRSPuKEIAsFK5r2u47KL/HBKqpwdUSanoaz5L34IKiSATFrjG5ywmIu98hPVMfxZg==", + "dev": true, + "dependencies": { + "lunr": "^2.3.8" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/typescript": { + "version": "3.9.10", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.10.tgz", + "integrity": "sha512-w6fIxVE/H1PkLKcCPsFqKE7Kv7QUwhU8qQY2MueZXWx5cPZdwFupLgKK3vntcK98BtNHZtAF4LA/yl2a7k8R6Q==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + }, + "node_modules/uglify-js": { + "version": "3.17.4", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.17.4.tgz", + "integrity": "sha512-T9q82TJI9e/C1TAxYvfb16xO120tMVFZrGA3f9/P4424DNu6ypK103y0GPFVa17yotwSyZW5iYXgjYHkGrJW/g==", + "dev": true, + "optional": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/union-value": { + "version": "1.0.1", + "resolved": 
"https://registry.npmjs.org/union-value/-/union-value-1.0.1.tgz", + "integrity": "sha512-tJfXmxMeWYnczCVs7XAEvIV7ieppALdyepWMkHkwciRpZraG/xwT+s2JN8+pr1+8jCRf80FFzvr+MpQeeoF4Xg==", + "dev": true, + "dependencies": { + "arr-union": "^3.1.0", + "get-value": "^2.0.6", + "is-extendable": "^0.1.1", + "set-value": "^2.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/universalify": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz", + "integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==", + "dev": true, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/unset-value": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unset-value/-/unset-value-1.0.0.tgz", + "integrity": "sha512-PcA2tsuGSF9cnySLHTLSh2qrQiJ70mn+r+Glzxv2TWZblxsxCC52BDlZoPCsz7STd9pN7EZetkWZBAvk4cgZdQ==", + "dev": true, + "dependencies": { + "has-value": "^0.3.1", + "isobject": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/unset-value/node_modules/has-value": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/has-value/-/has-value-0.3.1.tgz", + "integrity": "sha512-gpG936j8/MzaeID5Yif+577c17TxaDmhuyVgSwtnL/q8UUTySg8Mecb+8Cf1otgLoD7DDH75axp86ER7LFsf3Q==", + "dev": true, + "dependencies": { + "get-value": "^2.0.3", + "has-values": "^0.1.4", + "isobject": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/unset-value/node_modules/has-value/node_modules/isobject": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", + "integrity": "sha512-+OUdGJlgjOBZDfxnDjYYG6zp487z0JGNQq3cYQYg5f5hKR+syHMsaztzGeml/4kGG55CSpKSpWTY+jYGgsHLgA==", + "dev": true, + "dependencies": { + "isarray": "1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/unset-value/node_modules/has-values": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz", + "integrity": "sha512-J8S0cEdWuQbqD9//tlZxiMuMNmxB8PlEwvYwuxsTmR1G5RXUePEX/SJn7aD0GMLieuZYSwNH0cQuJGwnYunXRQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.0.10.tgz", + "integrity": "sha512-OztqDenkfFkbSG+tRxBeAnCVPckDBcvibKd35yDONx6OU8N7sqgwc7rCbkJ/WcYtVRZ4ba68d6byhC21GFh7sQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + } + ], + "dependencies": { + "escalade": "^3.1.1", + "picocolors": "^1.0.0" + }, + "bin": { + "browserslist-lint": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/uri-js": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", + "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==", + "dev": true, + "dependencies": { + "punycode": "^2.1.0" + } + }, + "node_modules/urix": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/urix/-/urix-0.1.0.tgz", + "integrity": "sha512-Am1ousAhSLBeB9cG/7k7r2R0zj50uDRlZHPGbazid5s9rlF1F/QKYObEKSIunSjIOkJZqwRRLpvewjEkM7pSqg==", + "deprecated": "Please see https://github.com/lydell/urix#deprecated", + "dev": true + }, + "node_modules/url-parse": { + "version": 
"1.5.10", + "resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz", + "integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==", + "dev": true, + "dependencies": { + "querystringify": "^2.1.1", + "requires-port": "^1.0.0" + } + }, + "node_modules/use": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/use/-/use-3.1.1.tgz", + "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/uuid": { + "version": "8.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", + "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==", + "dev": true, + "optional": true, + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/v8-compile-cache": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/v8-compile-cache/-/v8-compile-cache-2.3.0.tgz", + "integrity": "sha512-l8lCEmLcLYZh4nbunNZvQCJc5pv7+RCwa8q/LdUx8u7lsWvPDKmpodJAJNwkAhJC//dFY48KuIEmjtd4RViDrA==", + "dev": true + }, + "node_modules/v8-to-istanbul": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-7.1.2.tgz", + "integrity": "sha512-TxNb7YEUwkLXCQYeudi6lgQ/SZrzNO4kMdlqVxaZPUIUjCv6iSSypUQX70kNBSERpQ8fk48+d61FXk+tgqcWow==", + "dev": true, + "dependencies": { + "@types/istanbul-lib-coverage": "^2.0.1", + "convert-source-map": "^1.6.0", + "source-map": "^0.7.3" + }, + "engines": { + "node": ">=10.10.0" + } + }, + "node_modules/v8-to-istanbul/node_modules/source-map": { + "version": "0.7.4", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.4.tgz", + "integrity": "sha512-l3BikUxvPOcn5E74dZiq5BGsTb5yEwhaTSzccU6t4sDOH8NWJCstKO5QT2CvtFoK6F0saL7p9xHAqHOlCPJygA==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/validate-npm-package-license": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz", + "integrity": "sha512-DpKm2Ui/xN7/HQKCtpZxoRWBhZ9Z0kqtygG8XCgNQ8ZlDnxuQmWhj566j8fN4Cu3/JmbhsDo7fcAJq4s9h27Ew==", + "dev": true, + "dependencies": { + "spdx-correct": "^3.0.0", + "spdx-expression-parse": "^3.0.0" + } + }, + "node_modules/w3c-hr-time": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/w3c-hr-time/-/w3c-hr-time-1.0.2.tgz", + "integrity": "sha512-z8P5DvDNjKDoFIHK7q8r8lackT6l+jo/Ye3HOle7l9nICP9lf1Ci25fy9vHd0JOWewkIFzXIEig3TdKT7JQ5fQ==", + "deprecated": "Use your platform's native performance.now() and performance.timeOrigin.", + "dev": true, + "dependencies": { + "browser-process-hrtime": "^1.0.0" + } + }, + "node_modules/w3c-xmlserializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-2.0.0.tgz", + "integrity": "sha512-4tzD0mF8iSiMiNs30BiLO3EpfGLZUT2MSX/G+o7ZywDzliWQ3OPtTZ0PTC3B3ca1UAf4cJMHB+2Bf56EriJuRA==", + "dev": true, + "dependencies": { + "xml-name-validator": "^3.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/walker": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz", + "integrity": "sha512-ts/8E8l5b7kY0vlWLewOkDXMmPdLcVV4GmOQLyxuSswIJsweeFZtAsMF7k1Nszz+TYBQrlYRmzOnr398y1JemQ==", + "dev": true, + "dependencies": { + "makeerror": "1.0.12" + } + }, + "node_modules/webidl-conversions": { + "version": "6.1.0", + "resolved": 
"https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-6.1.0.tgz", + "integrity": "sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w==", + "dev": true, + "engines": { + "node": ">=10.4" + } + }, + "node_modules/whatwg-encoding": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-1.0.5.tgz", + "integrity": "sha512-b5lim54JOPN9HtzvK9HFXvBma/rnfFeqsic0hSpjtDbVxR3dJKLc+KB4V6GgiGOvl7CY/KNh8rxSo9DKQrnUEw==", + "dev": true, + "dependencies": { + "iconv-lite": "0.4.24" + } + }, + "node_modules/whatwg-mimetype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz", + "integrity": "sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g==", + "dev": true + }, + "node_modules/whatwg-url": { + "version": "8.7.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-8.7.0.tgz", + "integrity": "sha512-gAojqb/m9Q8a5IV96E3fHJM70AzCkgt4uXYX2O7EmuyOnLrViCQlsEBmF9UQIu3/aeAIp2U17rtbpZWNntQqdg==", + "dev": true, + "dependencies": { + "lodash": "^4.7.0", + "tr46": "^2.1.0", + "webidl-conversions": "^6.1.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/which": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", + "integrity": "sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "which": "bin/which" + } + }, + "node_modules/which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha512-B+enWhmw6cjfVC7kS8Pj9pCrKSc5txArRyaYGe088shv/FGWH+0Rjx/xPgtsWfsUtS27FkP697E4DDhgrgoc0Q==", + "dev": true + }, + "node_modules/word-wrap": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", + "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==", + "dev": true + }, + "node_modules/wrap-ansi": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", + "integrity": "sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/wrap-ansi/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/wrap-ansi/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": 
"sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/wrap-ansi/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/wrap-ansi/node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true + }, + "node_modules/write": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/write/-/write-1.0.3.tgz", + "integrity": "sha512-/lg70HAjtkUgWPVZhZcm+T4hkL8Zbtp1nFNOn3lRrxnlv50SRBv7cR7RqR+GMsd3hUXy9hWBo4CHTbFTcOYwig==", + "dev": true, + "dependencies": { + "mkdirp": "^0.5.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/write-file-atomic": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/write-file-atomic/-/write-file-atomic-3.0.3.tgz", + "integrity": "sha512-AvHcyZ5JnSfq3ioSyjrBkH9yW4m7Ayk8/9My/DD9onKeu/94fwrMocemO2QAJFAlnnDN+ZDS+ZjAR5ua1/PV/Q==", + "dev": true, + "dependencies": { + "imurmurhash": "^0.1.4", + "is-typedarray": "^1.0.0", + "signal-exit": "^3.0.2", + "typedarray-to-buffer": "^3.1.5" + } + }, + "node_modules/ws": { + "version": "7.5.9", + "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.9.tgz", + "integrity": "sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q==", + "dev": true, + "engines": { + "node": ">=8.3.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": "^5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, + "node_modules/xml-name-validator": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-3.0.0.tgz", + "integrity": "sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw==", + "dev": true + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "dev": true + }, + "node_modules/y18n": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.3.tgz", + "integrity": "sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ==", + "dev": true + }, + "node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/yargs": { + "version": "15.4.1", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-15.4.1.tgz", + "integrity": 
"sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==", + "dev": true, + "dependencies": { + "cliui": "^6.0.0", + "decamelize": "^1.2.0", + "find-up": "^4.1.0", + "get-caller-file": "^2.0.1", + "require-directory": "^2.1.1", + "require-main-filename": "^2.0.0", + "set-blocking": "^2.0.0", + "string-width": "^4.2.0", + "which-module": "^2.0.0", + "y18n": "^4.0.0", + "yargs-parser": "^18.1.2" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/yargs-parser": { + "version": "18.1.3", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-18.1.3.tgz", + "integrity": "sha512-o50j0JeToy/4K6OZcaQmW6lyXXKhq7csREXcDwk2omFPJEwUNOVtJKvmDr9EI1fAJZUyZcRF7kxGBWmRXudrCQ==", + "dev": true, + "dependencies": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + }, + "engines": { + "node": ">=6" + } + } + }, "dependencies": { "@ampproject/remapping": { "version": "2.2.0", @@ -1181,9 +9919,9 @@ } }, "@webgpu/types": { - "version": "0.0.31", - "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.0.31.tgz", - "integrity": "sha512-cvvCMSZBT4VsRNtt0lI6XQqvOIIWw6+NRUtnPUMDVDgsI4pCZColz3qzF5QcP9wIYOHEc3jssIBse8UWONKhlQ==", + "version": "0.1.24", + "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.24.tgz", + "integrity": "sha512-Mkz+SVJwHApTg6nCzqIuHDt3HsGRcCvHJNkWT2PgZTTC2Gy+LXvN4+7x6YvduAcx3F/pEDWW5OfAHs6VSo6J4Q==", "dev": true }, "abab": { diff --git a/web/package.json b/web/package.json index 4cc88dc4c59e..c9e6c7793a61 100644 --- a/web/package.json +++ b/web/package.json @@ -21,7 +21,7 @@ "@types/node": "^12.12.37", "@typescript-eslint/eslint-plugin": "^2.29.0", "@typescript-eslint/parser": "^2.29.0", - "@webgpu/types": "^0.0.31", + "@webgpu/types": "^0.1.24", "eslint": "^6.8.0", "jest": "^26.0.1", "rollup-plugin-typescript2": "^0.27.0", diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index 226797eb7d19..daaa122682e3 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -65,13 +65,13 @@ export class WebGPUContext { * Wait for all pending GPU tasks to complete */ async sync(): Promise { - const fence = this.device.defaultQueue.createFence(); - this.device.defaultQueue.signal(fence, 1); if (this.numPendingReads != 0) { - // eslint-disable-next-line @typescript-eslint/no-empty-function - await Promise.all([fence.onCompletion(1), this.pendingRead]); + await Promise.all([ + this.device.queue.onSubmittedWorkDone(), + this.pendingRead + ]) } else { - await fence.onCompletion(1); + await this.device.queue.onSubmittedWorkDone() } } @@ -89,8 +89,7 @@ export class WebGPUContext { if (dtype == "handle") { layoutEntries.push({ binding: i, - visibility: GPUShaderStage.COMPUTE, - type: "storage-buffer" + visibility: GPUShaderStage.COMPUTE }); } else { throw new Error("Cannot handle argument type " + dtype + " in WebGPU shader"); @@ -100,13 +99,16 @@ export class WebGPUContext { entries: layoutEntries }); + const textDecoder = new TextDecoder('utf-8') + const codeString = textDecoder.decode(data.buffer) + const pipeline = this.device.createComputePipeline({ layout: this.device.createPipelineLayout({ bindGroupLayouts: [ bindGroupLayout ] }), - computeStage: { + compute: { module: this.device.createShaderModule({ - code: new Uint32Array(data.buffer) + code: codeString }), entryPoint: "main" } @@ -153,10 +155,10 @@ export class WebGPUContext { for (let i = 0; i < dispatchToDim.length; ++i) { wl[dispatchToDim[i]] = args[layoutEntries.length + i]; } - compute.dispatch(wl[0], wl[1], wl[2]); - compute.endPass(); + 
compute.dispatchWorkgroups(wl[0], wl[1], wl[2]) + compute.end() const command = commandEncoder.finish(); - this.device.defaultQueue.submit([command]); + this.device.queue.submit([command]); }; return submitShader; @@ -256,7 +258,7 @@ export class WebGPUContext { nbytes ); const copyCommands = copyEncoder.finish(); - this.device.defaultQueue.submit([copyCommands]); + this.device.queue.submit([copyCommands]); gpuTemp.destroy(); } @@ -281,7 +283,7 @@ export class WebGPUContext { nbytes ); const copyCommands = copyEncoder.finish(); - this.device.defaultQueue.submit([copyCommands]); + this.device.queue.submit([copyCommands]); this.numPendingReads += 1; @@ -318,7 +320,7 @@ export class WebGPUContext { nbytes ); const copyCommands = copyEncoder.finish(); - this.device.defaultQueue.submit([copyCommands]); + this.device.queue.submit([copyCommands]); } private gpuBufferFromPtr(ptr: GPUPointer): GPUBuffer { From 77b5b4c0c9f6d4c34a95f734190b9492d2a9dda7 Mon Sep 17 00:00:00 2001 From: Chun-I Tsai Date: Tue, 10 Jan 2023 05:40:14 +0800 Subject: [PATCH 139/286] [Relay][Frontend] Span Filling TFLite (#13727) - Set tensor name as the source name of span during the conversion of TFLite model - Add structural_equal comparisons with and without set_span to the existing test cases. - Add span test cases for frequent conversions. Co-authored-by: Joey Tsai --- python/tvm/relay/frontend/tflite.py | 57 ++++-- tests/python/frontend/tflite/test_forward.py | 173 ++++++++++++++++++- 2 files changed, 214 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 09e6523534cf..95bdb0ce513c 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -34,6 +34,7 @@ from .common import ExprTable from .common import infer_shape as _infer_shape from .common import lstm_cell, to_int_list, shape_of, try_infer_value +from .common import set_span from .tflite_flexbuffer import FlexBufferDecoder __all__ = ["from_tflite"] @@ -275,6 +276,11 @@ def convert_op_to_relay(self): if ret is None: continue + output_names = ", ".join( + [get_tensor_name(self.subgraph, tensor.tensor_idx) for tensor in output_tensors] + ) + ret = set_span(ret, f"{output_names}") + if len(output_tensors) == 1: tensor_idx = output_tensors[0].tensor_idx self.exp_tab.set_expr(get_tensor_name(self.subgraph, tensor_idx), ret) @@ -1553,7 +1559,9 @@ def convert_gather(self, op): else: indices_val = self.get_tensor_value(indices) indices_expr = self.exp_tab.new_const( - indices_val, dtype=self.get_tensor_type_str(indices_type) + indices_val, + dtype=self.get_tensor_type_str(indices_type), + source_name=indices.tensor.Name(), ) indices_shape = list(indices_val.shape) indices_len = len(indices_shape) @@ -1954,7 +1962,9 @@ def convert_fully_connected(self, op): weight_expr = self.get_expr(weight_tensor.tensor_idx) else: weight_value = self.get_tensor_value(weight_tensor) - weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str) + weight_expr = self.exp_tab.new_const( + weight_value, dtype=weight_tensor_type_str, source_name=weight_tensor.tensor.Name() + ) weight_shape = _infer_shape(weight_expr) if input_tensor.qnn_params: @@ -1983,7 +1993,9 @@ def convert_fully_connected(self, op): bias_expr = self.get_expr(bias_tensor.tensor_idx) else: bias_expr = self.exp_tab.new_const( - self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str + self.get_tensor_value(bias_tensor), + dtype=bias_tensor_type_str, + source_name=bias_tensor.tensor.Name(), ) out 
= _op.nn.bias_add(out, bias_expr) @@ -2195,7 +2207,9 @@ def convert_conv(self, op, conv_type): else: weight_value = weight_value.transpose((1, 2, 3, 0)) - weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str) + weight_expr = self.exp_tab.new_const( + weight_value, dtype=weight_tensor_type_str, source_name=weight_tensor.tensor.Name() + ) if padding == Padding.VALID: pass @@ -2236,7 +2250,9 @@ def convert_conv(self, op, conv_type): bias_expr = self.get_expr(bias_tensor.tensor_idx) else: bias_expr = self.exp_tab.new_const( - self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str + self.get_tensor_value(bias_tensor), + dtype=bias_tensor_type_str, + source_name=bias_tensor.tensor.Name(), ) channel_axis = 3 out = _op.nn.bias_add(out, bias_expr, axis=channel_axis) @@ -3043,7 +3059,9 @@ def convert_prelu(self, op): alpha_tensor_type = alpha_tensor.tensor.Type() alpha_tensor_type_str = self.get_tensor_type_str(alpha_tensor_type) alpha_expr = self.exp_tab.new_const( - self.get_tensor_value(alpha_tensor), dtype=alpha_tensor_type_str + self.get_tensor_value(alpha_tensor), + dtype=alpha_tensor_type_str, + source_name=alpha_tensor.tensor.Name(), ) in_expr = self.get_expr(input_tensor.tensor_idx) data_shape = to_int_list(self.get_tensor_shape(input_tensor)) @@ -3119,7 +3137,9 @@ def convert_transpose_conv(self, op): # Relay weights layout should be different from kernel_layout - it should be IOHW weight_value_iohw = np.transpose(weight_value_ohwi, (3, 0, 1, 2)) weight_expr_iohw = self.exp_tab.new_const( - weight_value_iohw, dtype=weight_tensor_type_str + weight_value_iohw, + dtype=weight_tensor_type_str, + source_name=weights_tensor.tensor.Name(), ) # Output shape value @@ -3181,7 +3201,9 @@ def convert_transpose_conv(self, op): bias_expr = self.get_expr(bias_tensor.tensor_idx) else: bias_expr = self.exp_tab.new_const( - self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str + self.get_tensor_value(bias_tensor), + dtype=bias_tensor_type_str, + source_name=bias_tensor.tensor.Name(), ) channel_axis = 3 out = _op.nn.bias_add(out, bias_expr, axis=channel_axis) @@ -3258,7 +3280,9 @@ def convert_dequantize(self, op): if input_tensor.tensor.Type() == TensorType.FLOAT16: dtype = self.get_tensor_type_str(input_tensor.tensor.Type()) input_value = self.get_tensor_value(input_tensor) - in_expr = self.exp_tab.new_const(input_value, dtype=dtype) + in_expr = self.exp_tab.new_const( + input_value, dtype=dtype, source_name=input_tensor.tensor.Name() + ) out = relay.cast(in_expr, dtype="float32") return out @@ -3292,7 +3316,9 @@ def convert_detection_postprocess(self, op): anchor_values = self.get_tensor_value(inputs[2]) anchor_boxes = len(anchor_values) anchor_type = self.get_tensor_type_str(inputs[2].tensor.Type()) - anchor_expr = self.exp_tab.new_const(anchor_values, dtype=anchor_type) + anchor_expr = self.exp_tab.new_const( + anchor_values, dtype=anchor_type, source_name=inputs[2].tensor.Name() + ) if inputs[0].qnn_params: loc_prob = _qnn.op.dequantize( @@ -3685,7 +3711,11 @@ def get_tensor_expr(self, tensor, is_sparse=False): expr = self.get_expr(tensor.tensor_idx) else: type_str = self.get_tensor_type_str(tensor.tensor.Type()) - expr = self.exp_tab.new_const(self.get_tensor_value(tensor, is_sparse), dtype=type_str) + expr = self.exp_tab.new_const( + self.get_tensor_value(tensor, is_sparse), + dtype=type_str, + source_name=tensor.tensor.Name(), + ) return expr def get_tensor_shape(self, tensor_wrapper): @@ -4022,7 +4052,10 @@ def from_tflite(model, shape_dict=None, 
dtype_dict=None, op_converter=OperatorCo model_input_name = get_tensor_name(subgraph, model_input) shape = _shape_dict[model_input_name] if model_input_name in _shape_dict else None dtype = _dtype_dict[model_input_name] if model_input_name in _dtype_dict else "float32" - exp_tab.set_expr(model_input_name, _expr.var(model_input_name, shape=shape, dtype=dtype)) + input_var = set_span( + _expr.var(model_input_name, shape=shape, dtype=dtype), model_input_name + ) + exp_tab.set_expr(model_input_name, input_var) # op code in model op_converter = op_converter(model, subgraph, exp_tab) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 877406ae2a64..1d743ceb6938 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -35,9 +35,10 @@ import tvm import tvm.relay.testing.tf as tf_testing from tvm.contrib.download import download_testdata -from tvm import relay +from tvm import relay, ir from tvm.contrib import graph_executor from tflite.BuiltinOperator import BuiltinOperator +from relay.utils.tag_span import _set_span, _create_span, _verify_structural_equal_with_span try: @@ -213,9 +214,15 @@ def run_tvm_graph( shape_dict[node] = input_data[i].shape dtype_dict[node] = input_data[i].dtype.name - mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict, op_converter=op_converter - ) + with tvm.testing.disable_span_filling(): + mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict, op_converter=op_converter + ) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict, op_converter=op_converter + ) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"]) if mode in ["debug", "vm"]: inputs = [] @@ -5139,6 +5146,161 @@ def test_forward_nms_v5(): _test_nms_v5((1000, 4), (1000,), 0.7, 0.3, 50) +####################################################################### +# Test structural_equal and span of a model +# -------------------------------------- +def test_structure_and_span(): + """Test Structure and span of frequently-used models""" + + def _verify(res_fptr, golden_fptr): + with tvm.testing.enable_span_filling(): + with_span = res_fptr() + with tvm.testing.disable_span_filling(): + without_span = res_fptr() + assert tvm.ir.structural_equal(with_span, without_span) + _verify_structural_equal_with_span(with_span, golden_fptr()) + + def _tf_to_tflite( + input_tensors, output_tensors, init_global_variables=False, experimental_new_converter=False + ): + with tf.Session() as sess: + if init_global_variables: + sess.run(variables.global_variables_initializer()) + converter = tf.lite.TFLiteConverter.from_session(sess, input_tensors, output_tensors) + converter.experimental_new_converter = experimental_new_converter + + tflite_model_buffer = converter.convert() + + try: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0) + except AttributeError: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0) + except ImportError: + raise ImportError("The tflite package must be installed") + return tflite_model + + def _test_conv2d_bias_add_span(): + def _res(): + in_shape = (1, 5, 5, 1) + kernel_shpae = (2, 2, 1, 2) + kernel_in = np.ones(kernel_shpae) + + with tf.Graph().as_default(): + x = array_ops.placeholder(shape=in_shape, 
dtype="float32", name="input") + kernel = tf.constant(kernel_in, dtype=tf.float32, name="filter_weight") + tf_model = tf.nn.conv2d( + x, kernel, strides=[1, 1, 1, 1], padding="VALID", name="conv2d" + ) + tflite_model = _tf_to_tflite([x], [tf_model]) + + mod, _ = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": in_shape}, + dtype_dict={"input": "float32"}, + op_converter=relay.frontend.tflite.OperatorConverter, + ) + return mod["main"] + + def _golden(): + in_input = relay.var( + "input", relay.TensorType([1, 5, 5, 1]), span=_create_span("input") + ) + weight = relay.var( + "_param_1", relay.TensorType([2, 2, 1, 2]), span=_create_span("filter_weight") + ) + bias = relay.var("_param_2", relay.TensorType([2]), span=_create_span("conv2d_bias")) + conv2d = _set_span( + relay.nn.conv2d( + in_input, + weight, + channels=2, + kernel_size=[2, 2], + data_layout="NHWC", + kernel_layout="HWIO", + ), + "conv2d", + ) + bias_add = _set_span(relay.nn.bias_add(conv2d, bias, axis=3), "conv2d") + attrs = ir.make_node("DictAttrs", **{"output_tensor_names": ["conv2d"]}) + func = relay.Function([in_input, weight, bias], bias_add, attrs=attrs) + mod = ir.IRModule.from_expr(func) + return mod["main"] + + _verify(_res, _golden) + + def _test_fully_connected_bias_add_span(): + def _res(): + in_shape = (1, 10) + kernel_shpae = (10, 10) + kernel_in = np.ones(kernel_shpae) + + with tf.Graph().as_default(): + x = array_ops.placeholder(shape=in_shape, dtype="float32", name="input") + weight = tf.constant(kernel_in, dtype=tf.float32, name="filter_weight") + tf_model = math_ops.mat_mul(x, weight, name="dense") + tflite_model = _tf_to_tflite([x], [tf_model]) + + mod, _ = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": in_shape}, + dtype_dict={"input": "float32"}, + op_converter=relay.frontend.tflite.OperatorConverter, + ) + return mod["main"] + + def _golden(): + in_input = relay.var("input", relay.TensorType([1, 10]), span=_create_span("input")) + weight = relay.var( + "_param_1", relay.TensorType([10, 10]), span=_create_span("filter_weight/transpose") + ) + bias = relay.var("_param_2", relay.TensorType([10]), span=_create_span("dense_bias")) + reshape = _set_span(relay.reshape(in_input, [-1, 10]), "dense") + dense = _set_span(relay.nn.dense(reshape, weight, units=10), "dense") + bias_add = _set_span(relay.nn.bias_add(dense, bias), "dense") + attrs = ir.make_node("DictAttrs", **{"output_tensor_names": ["dense"]}) + func = relay.Function([in_input, weight, bias], bias_add, attrs=attrs) + mod = ir.IRModule.from_expr(func) + return mod["main"] + + _verify(_res, _golden) + + def _test_reshape_span(): + def _res(): + in_shape = (1, 10) + output_shape = (2, 5) + + with tf.Graph().as_default(): + x = array_ops.placeholder(shape=in_shape, dtype="float32", name="input") + tf_model = array_ops.reshape(x, output_shape, "reshape") + tflite_model = _tf_to_tflite([x], [tf_model]) + + mod, _ = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": in_shape}, + dtype_dict={"input": "float32"}, + op_converter=relay.frontend.tflite.OperatorConverter, + ) + return mod["main"] + + def _golden(): + in_input = relay.var("input", relay.TensorType([1, 10]), span=_create_span("input")) + reshape = _set_span(relay.reshape(in_input, [2, 5]), "reshape") + attrs = ir.make_node("DictAttrs", **{"output_tensor_names": ["reshape"]}) + func = relay.Function([in_input], reshape, attrs=attrs) + mod = ir.IRModule.from_expr(func) + return mod["main"] + + _verify(_res, _golden) + + _test_conv2d_bias_add_span() + 
_test_fully_connected_bias_add_span() + _test_reshape_span() + + ####################################################################### # Main # ---- @@ -5239,6 +5401,9 @@ def test_forward_nms_v5(): # Overwrite Converter test_custom_op_converter() + # test structural_equal and span information + test_structure_and_span() + # End to End test_forward_mobilenet_v1() test_forward_mobilenet_v2() From e61f60b172d52463d86987d0212d0d542b8a48b6 Mon Sep 17 00:00:00 2001 From: Chun-I Tsai Date: Tue, 10 Jan 2023 05:40:41 +0800 Subject: [PATCH 140/286] [Relay][Frontend] Span Filling TensorFlow 1 (#13728) - Set node name as the source name of span during the conversion of Tensorflow1 model - Add structural_equal comparison with and without set_span to the existing test cases. - Add span test cases for frequent conversions. Co-authored-by: Joey Tsai --- python/tvm/relay/frontend/tensorflow.py | 56 ++-- .../frontend/tensorflow/test_bn_dynamic.py | 6 +- .../frontend/tensorflow/test_control_flow.py | 10 +- .../frontend/tensorflow/test_debugging.py | 11 +- .../frontend/tensorflow/test_forward.py | 258 ++++++++++++++++-- .../python/frontend/tensorflow/test_no_op.py | 9 +- 6 files changed, 292 insertions(+), 58 deletions(-) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index d35e0e1c203d..1e2a2d4f826f 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -37,6 +37,7 @@ from .common import infer_type as _infer_type from .common import infer_shape as _infer_shape from .common import infer_value as _infer_value +from .common import set_span from .tensorflow_ops import _convert_map from .tensorflow_ops import _need_prelude_for_shape_inference @@ -328,7 +329,7 @@ def _while_loop(self): `while_loop` construct. 
""" bind_map = {} - wl = tvm.relay.var("while_loop") + wl = set_span(tvm.relay.var("while_loop"), self._loop_name) sb = tvm.relay.scope_builder.ScopeBuilder() lv_list = [] @@ -345,7 +346,7 @@ def _while_loop(self): if lv not in self._lvar2expr[self._loop_name]: var_name = "{}_loop_var_{}".format(self._loop_name, i) var_type = _infer_type(lv, self._mod).checked_type - loop_var = tvm.relay.var(var_name, type_annotation=var_type) + loop_var = set_span(tvm.relay.var(var_name, type_annotation=var_type), var_name) self._lvar2expr[self._loop_name][loop_var] = lv bind_map[lv] = loop_var self.loop_vars[i] = loop_var @@ -358,7 +359,7 @@ def _while_loop(self): self.cond = rewrite_subgraph(self.cond, bind_map) self.body = [rewrite_subgraph(b, bind_map) for b in self.body] - cond = tvm.relay.op.min(self.cond) + cond = set_span(tvm.relay.op.min(self.cond), self.cond.span) for lv, exp in self._lvar2expr[self._loop_name].items(): if lv not in self.loop_vars: @@ -517,8 +518,11 @@ def _get_relay_func(self, graph, layout="NHWC", shape=None, outputs=None): self._output_shapes[node.name] = [self._input_shapes[node.name]] attr = self._parse_attr(node.attr) self._nodes[node.name] = [ - _expr.var( - node.name, shape=self._input_shapes[node.name], dtype=attr["dtype"].name + set_span( + _expr.var( + node.name, shape=self._input_shapes[node.name], dtype=attr["dtype"].name + ), + node.name, ) ] @@ -708,16 +712,23 @@ def _parse_param(self, key, value, name, shape): var_shape = shape[name] else: var_shape = tensor_util.TensorShapeProtoToList(value.tensor.tensor_shape) - self._nodes[name] = [_expr.var(name, shape=var_shape, dtype="uint8")] + self._nodes[name] = [ + set_span(_expr.var(name, shape=var_shape, dtype="uint8"), span=name) + ] return array_ndim = len(np_array.shape) if array_ndim == 0: - self._nodes[name] = [tvm.relay.const(np_array, np_array.dtype)] + self._nodes[name] = [set_span(tvm.relay.const(np_array, np_array.dtype), name)] else: self._params[name] = tvm.nd.array(np_array) self._nodes[name] = [ - _expr.var(name, shape=self._params[name].shape, dtype=self._params[name].dtype) + set_span( + _expr.var( + name, shape=self._params[name].shape, dtype=self._params[name].dtype + ), + name, + ) ] else: if key not in ("dtype", "_output_shapes", "_class"): @@ -998,6 +1009,8 @@ def _convert_operator( ---------- op_name : str Operator name, such as Conv2D, AvgPool + node_name : str + Node name, predefined by user or default setting of TF inputs : list of relay.op List of input symbols. 
attrs : dict @@ -1028,22 +1041,8 @@ def _convert_operator( else: raise NotImplementedError("Operator {} not implemented.".format(op_name)) - sym = self._set_span(sym, node_name) - - return sym + sym = set_span(sym, node_name) - @staticmethod - def _set_span(sym, node_name): - span = tvm.relay.Span(tvm.relay.SourceName(node_name), 0, 0, 0, 0) - if isinstance(sym, _expr.Call) and sym.span is None: - sym = _expr.Call(sym.op, sym.args, sym.attrs, sym.type_args, span) - elif isinstance(sym, _expr.TupleWrapper): - tuple_value = sym.tuple_value - if isinstance(tuple_value, _expr.Call) and tuple_value.span is None: - tuple_value = _expr.Call( - tuple_value.op, tuple_value.args, tuple_value.attrs, tuple_value.type_args, span - ) - sym = _expr.TupleWrapper(tuple_value, sym.size) return sym def _licm_construct(self, loop_name, node_name): @@ -1079,7 +1078,7 @@ def _licm_construct(self, loop_name, node_name): if node_name not in self._lname_map[loop_name]: var_name = "{}_loop_var".format(node_name) var_type = _infer_type(actual_expr, self._mod).checked_type - loop_var = tvm.relay.var(var_name, type_annotation=var_type) + loop_var = set_span(tvm.relay.var(var_name, type_annotation=var_type), var_name) try: extra_param = _infer_value(actual_expr, self._params, self._mod) self._params[var_name] = extra_param @@ -1183,10 +1182,13 @@ def _backtrack_construct(self, node_name): if isinstance(op, np.ndarray): self._params[node.name] = tvm.nd.array(op) op = [ - _expr.var( + set_span( + _expr.var( + node.name, + shape=self._params[node.name].shape, + dtype=self._params[node.name].dtype, + ), node.name, - shape=self._params[node.name].shape, - dtype=self._params[node.name].dtype, ) ] diff --git a/tests/python/frontend/tensorflow/test_bn_dynamic.py b/tests/python/frontend/tensorflow/test_bn_dynamic.py index 55555e885a60..df7052008821 100644 --- a/tests/python/frontend/tensorflow/test_bn_dynamic.py +++ b/tests/python/frontend/tensorflow/test_bn_dynamic.py @@ -65,7 +65,11 @@ def verify_fused_batch_norm(shape): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue - mod, params = relay.frontend.from_tensorflow(constant_graph, outputs=["output"]) + with tvm.testing.disable_span_filling(): + mod, params = relay.frontend.from_tensorflow(constant_graph, outputs=["output"]) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow(constant_graph, outputs=["output"]) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"]) with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build(mod, target=device, params=params) from tvm.contrib import graph_executor diff --git a/tests/python/frontend/tensorflow/test_control_flow.py b/tests/python/frontend/tensorflow/test_control_flow.py index 49dc5170c52f..494deb46835f 100644 --- a/tests/python/frontend/tensorflow/test_control_flow.py +++ b/tests/python/frontend/tensorflow/test_control_flow.py @@ -25,13 +25,17 @@ import tensorflow as tf from tensorflow.python.ops import control_flow_ops import numpy as np -from tvm import nd -from tvm import relay +from tvm import nd, relay, ir, testing from tvm.relay.frontend.tensorflow import from_tensorflow def check_equal(graph, tf_out, input_map=None): - mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True)) + with testing.disable_span_filling(): + mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True)) + with testing.enable_span_filling(): + mod_with_span, _ = 
from_tensorflow(graph.as_graph_def(add_shapes=True)) + assert ir.structural_equal(mod["main"], mod_with_span["main"]) + if input_map is not None: params.update(input_map) relay_out = relay.create_executor("vm", mod=mod).evaluate()(**params) diff --git a/tests/python/frontend/tensorflow/test_debugging.py b/tests/python/frontend/tensorflow/test_debugging.py index 0e08840e56ee..0f7c4dd7d65a 100644 --- a/tests/python/frontend/tensorflow/test_debugging.py +++ b/tests/python/frontend/tensorflow/test_debugging.py @@ -22,12 +22,19 @@ except ImportError: import tensorflow as tf import numpy as np -from tvm import relay +from tvm import relay, ir, testing from tvm.relay.frontend.tensorflow import from_tensorflow def run_relay(graph, shape_dict=None, *vars): - mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True), shape=shape_dict) + with testing.disable_span_filling(): + mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True), shape=shape_dict) + with testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow( + graph.as_graph_def(add_shapes=True), shape=shape_dict + ) + assert ir.structural_equal(mod["main"], mod_with_span["main"]) + return relay.create_executor("debug", mod=mod).evaluate()(*vars) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index dce18ee231d3..2fb7c74f60a1 100755 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -31,11 +31,12 @@ import pytest from PIL import Image -from tvm import relay +from tvm import relay, ir from tvm.runtime.vm import VirtualMachine from tvm.relay.frontend.tensorflow import from_tensorflow from tvm.contrib import graph_executor from tvm.contrib import utils +from relay.utils.tag_span import _set_span, _create_span, _verify_structural_equal_with_span import tvm import tvm.relay.testing.tf as tf_testing @@ -149,13 +150,23 @@ def run_tvm_graph( shape_dict = { e: i.shape if hasattr(i, "shape") else () for e, i in zip(input_node, input_data) } - mod, params = relay.frontend.from_tensorflow( - graph_def, - layout=layout, - shape=shape_dict, - outputs=out_names, - convert_config=convert_config, - ) + with tvm.testing.disable_span_filling(): + mod, params = relay.frontend.from_tensorflow( + graph_def, + layout=layout, + shape=shape_dict, + outputs=out_names, + convert_config=convert_config, + ) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow( + graph_def, + layout=layout, + shape=shape_dict, + outputs=out_names, + convert_config=convert_config, + ) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"], map_free_vars=True) dev = tvm.device(target, 0) if mode == "debug": @@ -1804,9 +1815,15 @@ def test_read_variable_op(target, dev): shape_dict = {e: i.shape for e, i in zip(in_name, in_data)} with pytest.raises(Exception) as execinfo: - _, _ = relay.frontend.from_tensorflow( - final_graph_def, layout=None, shape=shape_dict, outputs=None - ) + with tvm.testing.disable_span_filling(): + mod, _ = relay.frontend.from_tensorflow( + final_graph_def, layout=None, shape=shape_dict, outputs=None + ) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow( + final_graph_def, layout=None, shape=shape_dict, outputs=None + ) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"]) assert execinfo.value.args[0].startswith("Graph is not frozen. 
Provide a frozen graph") @@ -4072,17 +4089,31 @@ def _get_tvm_graph_module(graph_def): # Cell inputs 'c and 'h' consist of all layers values shape_dict = {"Model/Placeholder": (batch_size, num_steps)} - mod, params = relay.frontend.from_tensorflow( - graph_def, - shape=shape_dict, - outputs=[ - "Model/Softmax:0", - "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:1", - "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:6", - "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:1", - "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:6", - ], - ) + with tvm.testing.disable_span_filling(): + mod, params = relay.frontend.from_tensorflow( + graph_def, + shape=shape_dict, + outputs=[ + "Model/Softmax:0", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:1", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:6", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:1", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:6", + ], + ) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow( + graph_def, + shape=shape_dict, + outputs=[ + "Model/Softmax:0", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:1", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell:6", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:1", + "Model/RNN/RNN/multi_rnn_cell/cell_0/lstm_cell/LSTMBlockCell_1:6", + ], + ) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"]) target = "llvm" with tvm.transform.PassContext(opt_level=0): @@ -5723,7 +5754,12 @@ def test_moments(): mean, variance = tf.nn.moments(A, [1], keep_dims=True) _ = (A - mean) / tf.sqrt(variance + 0.0005) - mod, _ = from_tensorflow(g.as_graph_def(add_shapes=True)) + with tvm.testing.disable_span_filling(): + mod, _ = from_tensorflow(g.as_graph_def(add_shapes=True)) + with tvm.testing.enable_span_filling(): + mod_with_span, _ = from_tensorflow(g.as_graph_def(add_shapes=True)) + assert tvm.ir.structural_equal(mod["main"], mod_with_span["main"], map_free_vars=True) + program = """ def @main(%A: Tensor[(4, 176, 8, 8), float32]) { %527 = mean(%A, axis=[1], keepdims=True) /* moments/mean */; @@ -5834,5 +5870,181 @@ def test_forward_dense_bincount(): _test_dense_bincount((10,), 20, None, binary_output) +####################################################################### +# Test structural_equal and span of a model +# -------------------------------------- +class TestSetSpan: + """Test Structure and span of frequently-used models""" + + def _verify(self, res_fptr, golden_fptr): + with tvm.testing.enable_span_filling(): + with_span = res_fptr() + with tvm.testing.disable_span_filling(): + without_span = res_fptr() + assert tvm.ir.structural_equal(with_span, without_span) + _verify_structural_equal_with_span(with_span, golden_fptr()) + + def test_conv2d_bias_add_span(self): + """Test Structure and span of conv2d and bias add model match to the expected result""" + + def _res(): + in_shape = (1, 5, 5, 1) + kernel_shpae = (2, 2, 1, 2) + kernel_in = np.ones(kernel_shpae) + bias_val_shape = tuple([2]) + bias_val_in = np.ones(bias_val_shape) + + with tf.Graph().as_default() as g: + x = array_ops.placeholder(shape=in_shape, dtype="float32", name="input") + kernel = tf.constant(kernel_in, dtype=tf.float32, name="filter_weight") + bias_val_tensor = tf.constant(bias_val_in, dtype=tf.float32, name="conv2d_bias") + conv2d = tf.nn.conv2d( + x, kernel, strides=[1, 1, 1, 1], padding="VALID", 
name="conv2d" + ) + _ = tf.nn.bias_add(conv2d, bias_val_tensor, name="bias_add") + + mod, _ = relay.frontend.from_tensorflow( + g.as_graph_def(), shape={"input": in_shape}, outputs=["bias_add"] + ) + return mod["main"] + + def _golden(): + model_in = relay.var( + "input", relay.TensorType([1, 5, 5, 1]), span=_create_span("input") + ) + weight = relay.var( + "filter_weight", relay.TensorType([2, 2, 1, 2]), span=_create_span("filter_weight") + ) + bias = relay.var("conv2d_bias", relay.TensorType([2]), span=_create_span("conv2d_bias")) + conv2d = _set_span( + relay.nn.conv2d( + model_in, + weight, + channels=2, + kernel_size=[2, 2], + data_layout="NHWC", + kernel_layout="HWIO", + ), + "conv2d", + ) + add = _set_span(relay.op.add(conv2d, bias), "bias_add") + mod = ir.IRModule.from_expr(add) + return mod["main"] + + self._verify(_res, _golden) + + def test_fully_connected_bias_add_span(self): + """Test Structure and span of fully connected model match to the expected result""" + + def _res(): + in_shape = (1, 10) + kernel_shpae = (10, 10) + kernel_in = np.ones(kernel_shpae) + bias_val_shape = tuple([10]) + bias_val_in = np.ones(bias_val_shape) + + with tf.Graph().as_default() as g: + x = array_ops.placeholder(shape=in_shape, dtype="float32", name="input") + in_filter = tf.constant(kernel_in, dtype=tf.float32, name="filter_weight") + bias_val_tensor = tf.constant(bias_val_in, dtype=tf.float32, name="dense_bias") + mat_mul = math_ops.mat_mul(x, in_filter, name="dense") + _ = tf.nn.bias_add(mat_mul, bias_val_tensor, name="bias_add") + + mod, _ = relay.frontend.from_tensorflow( + g.as_graph_def(), + shape={"input": in_shape}, + outputs=["bias_add"], + convert_config={"use_dense": True}, + ) + return mod["main"] + + def _golden(): + model_in = relay.var("input", relay.TensorType([1, 10]), span=_create_span("input")) + weight = relay.var( + "filter_weight", relay.TensorType([10, 10]), span=_create_span("filter_weight") + ) + bias = relay.var("dense_bias", relay.TensorType([10]), span=_create_span("dense_bias")) + transpose = _set_span(relay.transpose(weight, [1, 0]), "dense") + dense = _set_span(relay.nn.dense(model_in, transpose, units=10), "dense") + add = _set_span(relay.op.add(dense, bias), "bias_add") + mod = ir.IRModule.from_expr(add) + return mod["main"] + + self._verify(_res, _golden) + + def test_reshape_span(self): + """Test Structure and span of reshape model match to the expected result""" + + def _res(): + in_shape = (1, 10) + output_shape = (2, 5) + + with tf.Graph().as_default() as g: + x = array_ops.placeholder(shape=in_shape, dtype="float32", name="input") + _ = array_ops.reshape(x, output_shape, "reshape") + + mod, _ = relay.frontend.from_tensorflow( + g.as_graph_def(), shape={"input": in_shape}, outputs=["reshape"] + ) + return mod["main"] + + def _golden(): + model_in = relay.var("input", relay.TensorType([1, 10]), span=_create_span("input")) + reshape = _set_span(relay.reshape(model_in, [2, 5]), "reshape") + mod = ir.IRModule.from_expr(reshape) + return mod["main"] + + self._verify(_res, _golden) + + def test_batch_norm_span(self): + """Test Structure and span of batchnorm model match to the expected result""" + + def _res(): + in_shape = (1, 12, 12, 32) + with tf.Graph().as_default() as g: + input_tensor = tf.placeholder(tf.float32, shape=in_shape, name="input") + alpha = tf.constant( + np.ones( + in_shape[-1], + ), + dtype=tf.float32, + name="alpha", + ) + beta = tf.constant( + np.ones( + in_shape[-1], + ), + dtype=tf.float32, + name="beta", + ) + _ = 
tf.nn.fused_batch_norm(x=input_tensor, offset=beta, scale=alpha, name="bn") + mod, _ = relay.frontend.from_tensorflow( + g.as_graph_def(), shape={"input": in_shape}, outputs=["bn"] + ) + return mod["main"] + + def _golden(): + model_in = relay.var( + "input", relay.TensorType([1, 12, 12, 32]), span=_create_span("input") + ) + alpha = relay.var("alpha", relay.TensorType([32]), span=_create_span("alpha")) + beta = relay.var("beta", relay.TensorType([32]), span=_create_span("beta")) + mean = _set_span(relay.op.mean(model_in, axis=[3], exclude=True), "bn") + variance_mean = _set_span( + relay.op.mean(model_in, axis=[3], keepdims=True, exclude=True), "bn" + ) + variance = _set_span( + relay.op._make._variance(model_in, variance_mean, [3], False, True, False), "bn" + ) + bn = _set_span( + relay.nn.batch_norm(model_in, alpha, beta, mean, variance, axis=3, epsilon=0.001), + "bn", + ) + mod = ir.IRModule.from_expr(bn[0]) + return mod["main"] + + self._verify(_res, _golden) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/frontend/tensorflow/test_no_op.py b/tests/python/frontend/tensorflow/test_no_op.py index 4f8583f71cff..bc6be5c3059c 100644 --- a/tests/python/frontend/tensorflow/test_no_op.py +++ b/tests/python/frontend/tensorflow/test_no_op.py @@ -22,12 +22,17 @@ except ImportError: import tensorflow as tf import numpy as np -from tvm import relay +from tvm import relay, ir, testing from tvm.relay.frontend.tensorflow import from_tensorflow def run_relay(graph): - mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True)) + with testing.disable_span_filling(): + mod, params = from_tensorflow(graph.as_graph_def(add_shapes=True)) + with testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_tensorflow(graph.as_graph_def(add_shapes=True)) + assert ir.structural_equal(mod["main"], mod_with_span["main"]) + return relay.create_executor("debug", mod=mod).evaluate()(**params) From e41a89c6cc070bc403fd2952cf7ed5e32a95ff4d Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Mon, 9 Jan 2023 13:49:48 -0800 Subject: [PATCH 141/286] Add DisallowAsyncStridedMemCopy post processor to rem (#13720) * [MetaScheduler] Add DisallowAsyncStridedMemCopy post processor to remove schedules that use async strided mem copies. * [MetaScheduler] Add test for DisallowAsyncStridedMemCopy --- include/tvm/meta_schedule/postproc.h | 6 + python/tvm/meta_schedule/postproc/__init__.py | 1 + .../disallow_async_strided_mem_copy.py | 38 ++++ .../disallow_async_strided_mem_copy.cc | 189 ++++++++++++++++++ src/tir/transforms/lower_async_dma.cc | 11 +- ...ostproc_disallow_async_strided_mem_copy.py | 110 ++++++++++ 6 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 python/tvm/meta_schedule/postproc/disallow_async_strided_mem_copy.py create mode 100644 src/meta_schedule/postproc/disallow_async_strided_mem_copy.cc create mode 100644 tests/python/unittest/test_meta_schedule_postproc_disallow_async_strided_mem_copy.py diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 76f8d71ad65b..24cfd4cb2167 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -109,6 +109,12 @@ class Postproc : public runtime::ObjectRef { * \return The postprocessor created */ TVM_DLL static Postproc DisallowDynamicLoop(); + /*! + * \brief Create a postprocessor that checks if all async mem copies are not strided. + * \param merge_async_commit_queue_scope Whether or not to merge async commit queue scope. 
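+ * \note A strided (non-contiguous) copy cannot be lowered to an asynchronous
+ *       DMA transfer, so schedules that would produce one are rejected.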
+ * \return The postprocessor created + */ + TVM_DLL static Postproc DisallowAsyncStridedMemCopy(bool merge_async_commit_queue_scope = true); /*! * \brief Create a postprocessor that rewrites the cooperative fetch annotation to * actual vectorized cooperative fetching in loop bindings. diff --git a/python/tvm/meta_schedule/postproc/__init__.py b/python/tvm/meta_schedule/postproc/__init__.py index 0598a53e2ac1..93842200712d 100644 --- a/python/tvm/meta_schedule/postproc/__init__.py +++ b/python/tvm/meta_schedule/postproc/__init__.py @@ -16,6 +16,7 @@ # under the License. """The tvm.meta_schedule.postproc package.""" from .disallow_dynamic_loop import DisallowDynamicLoop +from .disallow_async_strided_mem_copy import DisallowAsyncStridedMemCopy from .postproc import Postproc, PyPostproc from .rewrite_cooperative_fetch import RewriteCooperativeFetch from .rewrite_layout import RewriteLayout diff --git a/python/tvm/meta_schedule/postproc/disallow_async_strided_mem_copy.py b/python/tvm/meta_schedule/postproc/disallow_async_strided_mem_copy.py new file mode 100644 index 000000000000..7e0e00de2949 --- /dev/null +++ b/python/tvm/meta_schedule/postproc/disallow_async_strided_mem_copy.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A postprocessor that checks if the IRModule has any strided memory copies""" + +from tvm._ffi.registry import register_object +from .. import _ffi_api +from .postproc import Postproc + + +@register_object("meta_schedule.DisallowAsyncStridedMemCopy") +class DisallowAsyncStridedMemCopy(Postproc): + """A postprocessor that disallows schedules that use async strided mem copies. + + Parameters + ---------- + merge_async_commit_queue_scope : bool + Whether or not to merge the async commit queue scope. + """ + + def __init__(self, merge_async_commit_queue_scope=True) -> None: + self.__init_handle_by_constructor__( + _ffi_api.PostprocDisallowAsyncStridedMemCopy, # type: ignore # pylint: disable=no-member + merge_async_commit_queue_scope, + ) diff --git a/src/meta_schedule/postproc/disallow_async_strided_mem_copy.cc b/src/meta_schedule/postproc/disallow_async_strided_mem_copy.cc new file mode 100644 index 000000000000..952810a47aee --- /dev/null +++ b/src/meta_schedule/postproc/disallow_async_strided_mem_copy.cc @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace tir { + +/*! \brief Check if an IRModule has any async strided mem copies. */ +struct AsyncStridedMemCopyFinder : private StmtExprVisitor { + public: + static bool Find(const IRModule& mod) { + AsyncStridedMemCopyFinder finder; + for (const auto& kv : mod->functions) { + if (const auto* prim_func = kv.second.as()) { + finder(prim_func->body); + if (finder.found_) { + return true; + } + } + } + return false; + } + + private: + void VisitStmt_(const ForNode* loop) final { + if (!found_) { + input_iters.Set(loop->loop_var, Range(loop->min, loop->extent)); + StmtExprVisitor::VisitStmt_(loop); + } + } + + void VisitStmt_(const AttrStmtNode* attrStmt) final { + if (!found_) { + if (attrStmt->attr_key == tir::attr::async_commit_queue_scope) { + auto async_scope = attrStmt->body.as(); + if (!async_scope) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + + auto for_loop = async_scope->body.as(); + if (!for_loop) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + + input_iters.Set(for_loop->loop_var, Range(for_loop->min, for_loop->extent)); + + auto bufferstorenode = for_loop->body.as(); + if (!bufferstorenode) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + + auto bufferloadnode = bufferstorenode->value.as(); + if (!bufferloadnode) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + + // get store buffer; assert it exists and is contiguous given it uses a single index + auto bufferstore = bufferstorenode->buffer.as(); + + // get load buffer; assert it exists and is contiguous given it uses a single index + auto bufferload = bufferloadnode->buffer.as(); + + if (!bufferstore || !bufferload) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + + // map loop variable to zero for the store index & simplify + Array store_index = bufferstorenode->indices; + + // Use DetectIterMap to detect whether store index is non-contiguous. + arith::Analyzer analyzer; + auto store_iter_map = DetectIterMap(store_index, input_iters, 1, + arith::IterMapLevel::Surjective, &analyzer, false); + if (!store_iter_map->errors.empty()) { + found_ = true; + } + + // map loop variable to zero for the load index & simplify + Array load_index = bufferloadnode->indices; + + // Use DetectIterMap to detect whether load index is non-contiguous. + auto load_iter_map = DetectIterMap(load_index, input_iters, 1, + arith::IterMapLevel::Surjective, &analyzer, false); + if (!load_iter_map->errors.empty()) { + found_ = true; + } + } + if (!found_) { + StmtExprVisitor::VisitStmt_(attrStmt); + } + } + } + + bool found_ = false; + Map input_iters = Map(); +}; + +} // namespace tir + +namespace meta_schedule { + +/*! \brief Check if the IRModule has any loop with non-constant extent. 
*/ +class DisallowAsyncStridedMemCopyNode : public PostprocNode { + public: + // Inherited from PostprocNode + void InitializeWithTuneContext(const TuneContext& context) final {} + // Inherited from PostprocNode + bool Apply(const tir::Schedule& sch) final { + IRModule mod = sch->mod(); + for (const auto& kv : mod->functions) { + const GlobalVar& g_var = kv.first; + const BaseFunc& base_func = kv.second; + if (const auto* prim_func = base_func.as()) { + IRModule lowered{nullptr}; + try { + auto pass_list = Array(); + pass_list.push_back(tir::transform::LowerInitBlock()); + pass_list.push_back(tir::transform::PlanAndUpdateBufferAllocationLocation()); + pass_list.push_back(tir::transform::ConvertBlocksToOpaque()); + pass_list.push_back(tir::transform::CompactBufferAllocation()); + pass_list.push_back(tir::transform::LowerMatchBuffer()); + pass_list.push_back(tir::transform::InjectSoftwarePipeline()); + pass_list.push_back(tir::transform::LowerOpaqueBlock()); + pass_list.push_back(tir::transform::FlattenBuffer()); + pass_list.push_back(tir::transform::BF16Legalize()); + pass_list.push_back(tir::transform::NarrowDataType(32)); + pass_list.push_back(tir::transform::Simplify()); + pass_list.push_back(tir::transform::InjectVirtualThread()); + pass_list.push_back(tir::transform::InjectDoubleBuffer()); + pass_list.push_back(tir::transform::VectorizeLoop(true)); + pass_list.push_back(tir::transform::StorageRewrite()); + transform::PassContext pass_ctx = transform::PassContext::Current(); + pass_ctx->config.Set("tir.merge_async_commit_queue_scope", + Bool(merge_async_commit_queue_scope)); + tir::PrimFunc f = WithAttr(GetRef(prim_func), "global_symbol", + runtime::String(g_var->name_hint)); + IRModule mod = IRModule(Map({{GlobalVar(g_var->name_hint), f}})); + lowered = tvm::transform::Sequential(pass_list)(std::move(mod)); + } catch (const dmlc::Error& e) { + return false; + } + if (tir::AsyncStridedMemCopyFinder::Find(lowered)) { + return false; + } + } + } + return true; + } + // Inherited from PostprocNode + Postproc Clone() const { + ObjectPtr n = + make_object(*this); + return Postproc(n); + } + + bool merge_async_commit_queue_scope = true; + + static constexpr const char* _type_key = "meta_schedule.DisallowAsyncStridedMemCopy"; + TVM_DECLARE_FINAL_OBJECT_INFO(DisallowAsyncStridedMemCopyNode, PostprocNode); +}; + +Postproc Postproc::DisallowAsyncStridedMemCopy(bool merge_async_commit_queue_scope) { + ObjectPtr n = make_object(); + n->merge_async_commit_queue_scope = merge_async_commit_queue_scope; + return Postproc(n); +} + +TVM_REGISTER_NODE_TYPE(DisallowAsyncStridedMemCopyNode); +TVM_REGISTER_GLOBAL("meta_schedule.PostprocDisallowAsyncStridedMemCopy") + .set_body_typed(Postproc::DisallowAsyncStridedMemCopy); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 5abdc5da84d7..57cff7985f1c 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -119,6 +119,9 @@ class AsyncDMALowerer : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } + // Add the current loop to the input iters mapping. 
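+  // Without it, DetectIterMap below could not reason about accesses indexed
+  // by this loop variable and would misreport them as non-contiguous.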
+ input_iters.Set(for_loop->loop_var, Range(for_loop->min, for_loop->extent)); + // 3) for loop contains buffer store with single index auto bufferstorenode = for_loop->body.as(); if (!bufferstorenode || bufferstorenode->indices.size() != 1) { @@ -156,8 +159,8 @@ class AsyncDMALowerer : public StmtExprMutator { // Use DetectIterMap to detect whether store index is non-contiguous. arith::Analyzer analyzer; - auto store_iter_map = DetectIterMap(store_index, input_iters, 1, arith::IterMapLevel::NoCheck, - &analyzer, false); + auto store_iter_map = DetectIterMap(store_index, input_iters, 1, + arith::IterMapLevel::Surjective, &analyzer, false); if (!store_iter_map->errors.empty()) { LOG(FATAL) << "Unable to lower async dma for non contiguous memory access with store index: " @@ -173,8 +176,8 @@ class AsyncDMALowerer : public StmtExprMutator { Array load_index = bufferloadnode->indices; // Use DetectIterMap to detect whether load index is non-contiguous. - auto load_iter_map = - DetectIterMap(load_index, input_iters, 1, arith::IterMapLevel::NoCheck, &analyzer, false); + auto load_iter_map = DetectIterMap(load_index, input_iters, 1, + arith::IterMapLevel::Surjective, &analyzer, false); if (!load_iter_map->errors.empty()) { LOG(FATAL) << "Unable to lower async dma for non contiguous memory access with load index: " << load_index; diff --git a/tests/python/unittest/test_meta_schedule_postproc_disallow_async_strided_mem_copy.py b/tests/python/unittest/test_meta_schedule_postproc_disallow_async_strided_mem_copy.py new file mode 100644 index 000000000000..046bd7220f01 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_postproc_disallow_async_strided_mem_copy.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
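+# The first test keeps a schedule whose async cache read is contiguous; the
+# second forces a strided copy and expects the postprocessor to reject it.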
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring + +import tvm +from tvm import meta_schedule as ms +from tvm import tir +from tvm.script import tir as T +from tvm.target import Target + + +def _target() -> Target: + return Target("hexagon", host="llvm") + + +def _create_context(mod, target) -> ms.TuneContext: + ctx = ms.TuneContext( + mod=mod, + target=target, + space_generator=ms.space_generator.PostOrderApply( + sch_rules=[], + postprocs=[ + ms.postproc.DisallowAsyncStridedMemCopy(), + ], + mutator_probs={}, + ), + task_name="test", + ) + return ctx + + +# pylint: disable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument +# fmt: off + +@tvm.script.ir_module +class Matmul: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main"}) + A = T.match_buffer(a, (1024, 1024), "float32") + B = T.match_buffer(b, (1024, 1024), "float32") + C = T.match_buffer(c, (1024, 1024), "float32") + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + +# fmt: on +# pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument + + +def test_postproc_disallow_async_strided_mem_copy_allows(): + mod = Matmul + sch = tir.Schedule(mod, debug_mask="all") + + matmul_block = sch.get_block("matmul") + + loops = sch.get_loops(matmul_block) + cache_read = sch.cache_read(matmul_block, 0, "global.vtcm") + + sch.compute_at(cache_read, loops[1]) + + sch.annotate(loops[1], "software_pipeline_stage", [0, 1]) + sch.annotate(loops[1], "software_pipeline_order", [0, 1]) + sch.annotate(loops[1], "software_pipeline_async_stages", [0]) + + ctx = _create_context(sch.mod, target=_target()) + sch.mod.show() + assert ctx.space_generator.postprocs[0].apply(sch) + + +def test_postproc_disallow_async_strided_mem_copy_disallows(): + mod = Matmul + sch = tir.Schedule(mod, debug_mask="all") + + matmul_block = sch.get_block("matmul") + + loops = sch.get_loops(matmul_block) + # Make it a strided mem copy. + cache_read = sch.cache_read(matmul_block, 1, "global.vtcm") + + sch.compute_at(cache_read, loops[1]) + sch.annotate(loops[1], "software_pipeline_stage", [0, 1]) + sch.annotate(loops[1], "software_pipeline_order", [0, 1]) + sch.annotate(loops[1], "software_pipeline_async_stages", [0]) + + sch.mod.show() + ctx = _create_context(sch.mod, target=_target()) + assert not ctx.space_generator.postprocs[0].apply(sch) + + +if __name__ == "__main__": + test_postproc_disallow_async_strided_mem_copy_allows() + test_postproc_disallow_async_strided_mem_copy_disallows() From 0ddeb28d25a0fadd63ae74d4712817068f50faae Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 9 Jan 2023 14:24:32 -0800 Subject: [PATCH 142/286] [microTVM][Zephyr] Fix flash command for nrfjprog (#13723) This PR adds serial number to flash command (nrfjprog) to fix cases where multiple devices are available. 
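For illustration, the mismatch this normalization resolves looks as follows (a minimal Python sketch, not part of the patch; the variable names are illustrative, the values are the example from the code comment in the diff):

    # The USB descriptor reports iSerial as a zero-padded string, while
    # `nrfjprog --ids` prints the same number without leading zeros.
    reported_iserial = "001050007848"  # as read from the USB descriptor
    nrfjprog_id = "1050007848"         # as printed by `nrfjprog --ids`

    assert reported_iserial != nrfjprog_id              # naive comparison fails
    assert reported_iserial.lstrip("0") == nrfjprog_id  # normalized form matches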
--- .../template_project/microtvm_api_server.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index e93918e44844..0bafd7066d70 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -262,7 +262,11 @@ def _get_openocd_device_args(serial_number: str = None): return ["--serial", generic_find_serial_port(serial_number)] -def _get_nrf_device_args(serial_number: str = None): +def _get_nrf_device_args(serial_number: str = None) -> list: + # iSerial has string type which could mistmatch with + # the output of `nrfjprog --ids`. Example: 001050007848 vs 1050007848 + serial_number = serial_number.lstrip("0") + nrfjprog_args = ["nrfjprog", "--ids"] nrfjprog_ids = subprocess.check_output(nrfjprog_args, encoding="utf-8") if not nrfjprog_ids.strip("\n"): @@ -276,9 +280,7 @@ def _get_nrf_device_args(serial_number: str = None): ) if serial_number not in boards: - raise BoardError( - f"serial number ({serial_number}) not found in {nrfjprog_args}: {boards}" - ) + raise BoardError(f"serial number ({serial_number}) not found in {boards}") return ["--snr", serial_number] @@ -721,23 +723,27 @@ def flash(self, options): if _find_platform_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME): return # NOTE: qemu requires no flash step--it is launched from open_transport. + flash_runner = _get_flash_runner() # The nRF5340DK requires an additional `nrfjprog --recover` before each flash cycle. # This is because readback protection is enabled by default when this device is flashed. # Otherwise, flashing may fail with an error such as the following: # ERROR: The operation attempted is unavailable due to readback protection in # ERROR: your device. Please use --recover to unlock the device. 
zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME) - if zephyr_board.startswith("nrf5340dk") and _get_flash_runner() == "nrfjprog": + if zephyr_board.startswith("nrf5340dk") and flash_runner == "nrfjprog": recover_args = ["nrfjprog", "--recover"] recover_args.extend(_get_nrf_device_args(serial_number)) check_call(recover_args, cwd=API_SERVER_DIR / "build") flash_extra_args = [] - if _get_flash_runner() == "openocd" and serial_number: - flash_extra_args = ["--cmd-pre-init", f"""hla_serial {serial_number}"""] + if flash_runner == "openocd" and serial_number: + flash_extra_args += ["--cmd-pre-init", f"""hla_serial {serial_number}"""] + + if flash_runner == "nrfjprog": + flash_extra_args += _get_nrf_device_args(serial_number) check_call( - west_cmd_list + ["flash", "-r", _get_flash_runner()] + flash_extra_args, + west_cmd_list + ["flash", "-r", flash_runner] + flash_extra_args, cwd=API_SERVER_DIR / "build", ) From 7f74170ffcc6af5c8575d78cb9f8cadafe64e22f Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 9 Jan 2023 17:10:04 -0800 Subject: [PATCH 143/286] [CI] Fix MLF input and output name map (#13740) This PR fixes error introduced by #13704 --- python/tvm/micro/model_library_format.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 2e4b00da289c..df5170f0e025 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -468,12 +468,6 @@ def _export_graph_model_library_format( input_names = list(inputs.keys()) output_names = list(outputs.keys()) - input_name_to_size_map = { - name: property_map["size"] for name, property_map in inputs_sizes.items() - } - output_name_to_size_map = { - name: property_map["size"] for name, property_map in output_sizes.items() - } generate_c_interface_header( mod.libmod_name, input_names, @@ -483,8 +477,8 @@ def _export_graph_model_library_format( devices, workspace_size, include_path, - input_name_to_size_map, - output_name_to_size_map, + inputs_sizes, + output_sizes, ) is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule) From 9d0caeb3f929cf648c5c54b1645328f56045e07c Mon Sep 17 00:00:00 2001 From: wrongtest Date: Tue, 10 Jan 2023 15:45:38 +0800 Subject: [PATCH 144/286] [TE][PrimFunc] Fix create primfunc from te extern with explicit buffer load (#13729) fix create primfunc from te extern with explicit buffer load --- src/te/operation/create_primfunc.cc | 43 ++++++++++++++++++- src/tir/ir/specialize.cc | 6 ++- .../unittest/test_te_create_primfunc.py | 30 +++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 21456af1bdf4..92186a4ffea4 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -60,6 +60,44 @@ class ProducerToBufferTransformer : public StmtExprMutator { const std::unordered_map& tensor2buffers_; }; +/*! 
\brief The helper mutator to rewrite buffer and buffer var accessed by block body */ +class BufferSubstituter : public StmtExprMutator { + public: + explicit BufferSubstituter(const std::unordered_map& var_map, + const std::unordered_map& buffer_map) + : var_map_(var_map), buffer_map_(buffer_map) {} + + PrimExpr VisitExpr_(const VarNode* op) final { + auto it = var_map_.find(op); + if (it != var_map_.end()) { + return it->second; + } + return StmtExprMutator::VisitExpr_(op); + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + auto load = Downcast(StmtExprMutator::VisitExpr_(op)); + auto it = buffer_map_.find(load->buffer.get()); + if (it != buffer_map_.end()) { + return BufferLoad(it->second, load->indices, load->span); + } + return load; + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + auto store = Downcast(StmtExprMutator::VisitStmt_(op)); + auto it = buffer_map_.find(store->buffer.get()); + if (it != buffer_map_.end()) { + return BufferStore(it->second, store->value, store->indices, store->span); + } + return store; + } + + private: + const std::unordered_map& var_map_; + const std::unordered_map& buffer_map_; +}; + /*! \brief Helper data structure to store information. */ struct CreateFuncInfo { /*! \brief The Tensor arg_list. */ @@ -364,6 +402,7 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in Stmt GenerateStmtFromExternOp(const te::ExternOp& extern_op, CreateFuncInfo* info) { // Step 1. Check all inputs are visited before and update var_map. std::unordered_map var_map; + std::unordered_map input_buffer_map; ICHECK_EQ(extern_op->inputs.size(), extern_op->input_placeholders.size()); for (size_t i = 0; i < extern_op->inputs.size(); ++i) { const Buffer& placeholder = extern_op->input_placeholders[i]; @@ -371,6 +410,7 @@ Stmt GenerateStmtFromExternOp(const te::ExternOp& extern_op, CreateFuncInfo* inf auto it = info->tensor2buffers.find(input_tensor); ICHECK(it != info->tensor2buffers.end()); var_map[placeholder->data.get()] = it->second->data; + input_buffer_map[placeholder.get()] = it->second; } // Step 2. Update info with its output tensor and placeholder buffer. @@ -394,7 +434,8 @@ Stmt GenerateStmtFromExternOp(const te::ExternOp& extern_op, CreateFuncInfo* inf writes.push_back(BufferRegion::FullRegion(buffer)); } - Stmt body = Substitute(extern_op->body, var_map); + BufferSubstituter substituter(var_map, input_buffer_map); + Stmt body = substituter(extern_op->body); // Step 4. Generate opaque block as body. return BlockRealize(/*iter_values=*/{}, diff --git a/src/tir/ir/specialize.cc b/src/tir/ir/specialize.cc index ea68015bc73b..7ead6e6ae6fb 100644 --- a/src/tir/ir/specialize.cc +++ b/src/tir/ir/specialize.cc @@ -128,7 +128,8 @@ class PrimFuncSpecializer : public StmtExprMutator { Array writes = op->writes.Map( std::bind(&PrimFuncSpecializer::MutateBufferRegion, this, std::placeholders::_1)); - if (alloc_buffers.same_as(op->alloc_buffers) && reads.same_as(op->reads)) { + if (alloc_buffers.same_as(op->alloc_buffers) && reads.same_as(op->reads) && + writes.same_as(op->writes)) { return GetRef(op); } else { ObjectPtr n = CopyOnWrite(op); @@ -238,12 +239,13 @@ class PrimFuncSpecializer : public StmtExprMutator { BufferRegion MutateBufferRegion(const BufferRegion& buffer_region) { auto it = buffer_map_.find(buffer_region->buffer); + const Buffer& buffer = it != buffer_map_.end() ? 
it->second : buffer_region->buffer; Array region = buffer_region->region.Map( std::bind(&PrimFuncSpecializer::MutateRange, this, std::placeholders::_1)); if (it == buffer_map_.end() && region.same_as(buffer_region->region)) { return buffer_region; } else { - return BufferRegion(it->second, std::move(region)); + return BufferRegion(buffer, std::move(region)); } } diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py index c13ede08313d..f78dc458d9d3 100644 --- a/tests/python/unittest/test_te_create_primfunc.py +++ b/tests/python/unittest/test_te_create_primfunc.py @@ -689,5 +689,35 @@ def test_argmax(): tvm.ir.assert_structural_equal(prim_func, argmax_expected) +def test_extern_with_explicit_buffer_access(): + def te_extern(): + A = te.placeholder((128, 128), name="A") + B = te.placeholder((128, 128), name="B") + P = te.placeholder((1,), name="P") + C = te.extern( + (128, 128), + [A, B, P], + lambda ins, outs: tvm.tir.call_extern( + "", "myfunc", ins[0].data, ins[1].data, outs[0].data, ins[2][0] + ), + name="C", + ) + return [A, B, P, C] + + @T.prim_func + def tir_extern(var_A: T.handle, var_B: T.handle, var_P: T.handle, var_C: T.handle): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(var_A, [128, 128], dtype="float32", offset_factor=1) + B = T.match_buffer(var_B, [128, 128], dtype="float32", offset_factor=1) + P = T.match_buffer(var_P, [1], dtype="float32", offset_factor=1) + C = T.match_buffer(var_C, [128, 128], dtype="float32", offset_factor=1) + with T.block("C"): + T.reads(A[0:128, 0:128], B[0:128, 0:128], P[0]) + T.writes(C[0:128, 0:128]) + T.call_extern("myfunc", A.data, B.data, C.data, P[0], dtype="") + + _check_workload(te_extern, tir_extern) + + if __name__ == "__main__": tvm.testing.main() From 46546e08d5fbbbb87c53c74fd6d1ebbf94423e79 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Tue, 10 Jan 2023 09:52:15 +0000 Subject: [PATCH 145/286] Add Name Transforms for Rust style (#13706) Adding transformations to Rust-style alongside the existing C-style transforms. 
Co-authored-by: Ashutosh Parkhi --- src/relay/backend/name_transforms.cc | 51 ++++++++++++++++++++++------ src/relay/backend/name_transforms.h | 24 +++++++++++++ tests/cpp/name_transforms_test.cc | 50 +++++++++++++++++++++++++-- 3 files changed, 113 insertions(+), 12 deletions(-) diff --git a/src/relay/backend/name_transforms.cc b/src/relay/backend/name_transforms.cc index 4f364b811bcc..a527d38fb84e 100644 --- a/src/relay/backend/name_transforms.cc +++ b/src/relay/backend/name_transforms.cc @@ -29,27 +29,34 @@ namespace tvm { namespace relay { namespace backend { -std::string ToCFunctionStyle(const std::string& original_name) { - ICHECK(!original_name.empty()) << "Function name is empty"; - ICHECK_EQ(original_name.find("TVM"), 0) << "Function not TVM prefixed"; - - int tvm_prefix_length = 3; - std::string function_name("TVM"); +std::string ToCamel(const std::string& original_name) { + std::string camel_name; + camel_name.reserve(original_name.size()); bool new_block = true; - for (const char& symbol : original_name.substr(tvm_prefix_length)) { + for (const char& symbol : original_name) { if (std::isalpha(symbol)) { if (new_block) { - function_name.push_back(std::toupper(symbol)); + camel_name.push_back(std::toupper(symbol)); new_block = false; } else { - function_name.push_back(std::tolower(symbol)); + camel_name.push_back(std::tolower(symbol)); } } else if (symbol == '_') { new_block = true; } } - return function_name; + return camel_name; +} + +std::string ToCFunctionStyle(const std::string& original_name) { + ICHECK(!original_name.empty()) << "Function name is empty"; + ICHECK_EQ(original_name.find("TVM"), 0) << "Function not TVM prefixed"; + + int tvm_prefix_length = 3; + std::string function_prefix("TVM"); + + return function_prefix + ToCamel(original_name.substr(tvm_prefix_length)); } std::string ToCVariableStyle(const std::string& original_name) { @@ -71,6 +78,30 @@ std::string ToCConstantStyle(const std::string& original_name) { return constant_name; } +std::string ToRustStructStyle(const std::string& original_name) { + ICHECK(!original_name.empty()) << "Struct name is empty"; + return ToCamel(original_name); +} + +std::string ToRustMacroStyle(const std::string& original_name) { + ICHECK(!original_name.empty()) << "Macro name is empty"; + + std::string macro_name; + macro_name.resize(original_name.size()); + + std::transform(original_name.begin(), original_name.end(), macro_name.begin(), ::tolower); + return macro_name; +} + +std::string ToRustConstantStyle(const std::string& original_name) { + ICHECK(!original_name.empty()) << "Constant name is empty"; + std::string constant_name; + constant_name.resize(original_name.size()); + + std::transform(original_name.begin(), original_name.end(), constant_name.begin(), ::toupper); + return constant_name; +} + std::string CombineNames(const Array& names) { std::stringstream combine_stream; ICHECK(!names.empty()) << "Name segments empty"; diff --git a/src/relay/backend/name_transforms.h b/src/relay/backend/name_transforms.h index f59280af2222..fab518debc63 100644 --- a/src/relay/backend/name_transforms.h +++ b/src/relay/backend/name_transforms.h @@ -79,6 +79,30 @@ std::string ToCVariableStyle(const std::string& original_name); */ std::string ToCConstantStyle(const std::string& original_name); +/*! 
+ * \brief Transform a name to the Rust struct style assuming it is + * appropriately constructed using the combining functions + * \param name Original name + * \return Transformed function in the Rust struct style + */ +std::string ToRustStructStyle(const std::string& original_name); + +/*! + * \brief Transform a name to the Rust macro style assuming it is + * appropriately constructed using the combining functions + * \param name Original name + * \return Transformed function in the Rust macro style + */ +std::string ToRustMacroStyle(const std::string& original_name); + +/*! + * \brief Transform a name to the Rust constant style assuming it is + * appropriately constructed using the combining functions + * \param name Original name + * \return Transformed function in the Rust constant style + */ +std::string ToRustConstantStyle(const std::string& original_name); + /*! * \brief Combine names together for use as a generated name * \param names Vector of strings to combine diff --git a/tests/cpp/name_transforms_test.cc b/tests/cpp/name_transforms_test.cc index 12a2ce1d0761..7e3cfe1d779c 100644 --- a/tests/cpp/name_transforms_test.cc +++ b/tests/cpp/name_transforms_test.cc @@ -23,15 +23,20 @@ #include #include -using namespace tvm::relay::backend; +namespace tvm { +namespace relay { +namespace backend { + using namespace tvm::runtime; +std::string ToCamel(const std::string& original_name); + TEST(NameTransforms, ToCFunctionStyle) { ASSERT_EQ(ToCFunctionStyle("TVM_Woof"), "TVMWoof"); ASSERT_EQ(ToCFunctionStyle("TVM_woof"), "TVMWoof"); ASSERT_EQ(ToCFunctionStyle("TVM_woof_woof"), "TVMWoofWoof"); ASSERT_EQ(ToCFunctionStyle("TVMGen_woof_woof"), "TVMGenWoofWoof"); - EXPECT_THROW(ToCVariableStyle("Cake_Bakery"), InternalError); // Incorrect prefix + EXPECT_THROW(ToCFunctionStyle("Cake_Bakery"), InternalError); // Incorrect prefix EXPECT_THROW(ToCFunctionStyle(""), InternalError); } @@ -51,6 +56,27 @@ TEST(NameTransforms, ToCConstantStyle) { EXPECT_THROW(ToCConstantStyle(""), InternalError); } +TEST(NameTransforms, ToRustStructStyle) { + ASSERT_EQ(ToRustStructStyle("Woof"), "Woof"); + ASSERT_EQ(ToRustStructStyle("woof"), "Woof"); + ASSERT_EQ(ToRustStructStyle("woof_woof"), "WoofWoof"); + EXPECT_THROW(ToRustStructStyle(""), InternalError); +} + +TEST(NameTransforms, ToRustMacroStyle) { + ASSERT_EQ(ToRustMacroStyle("Woof"), "woof"); + ASSERT_EQ(ToRustMacroStyle("woof"), "woof"); + ASSERT_EQ(ToRustMacroStyle("woof_Woof"), "woof_woof"); + EXPECT_THROW(ToRustMacroStyle(""), InternalError); +} + +TEST(NameTransforms, ToRustConstantStyle) { + ASSERT_EQ(ToRustConstantStyle("Woof"), "WOOF"); + ASSERT_EQ(ToRustConstantStyle("woof"), "WOOF"); + ASSERT_EQ(ToRustConstantStyle("woof_Woof"), "WOOF_WOOF"); + EXPECT_THROW(ToRustConstantStyle(""), InternalError); +} + TEST(NameTransforms, PrefixName) { ASSERT_EQ(PrefixName({"Woof"}), "TVM_Woof"); ASSERT_EQ(PrefixName({"woof"}), "TVM_woof"); @@ -94,3 +120,23 @@ TEST(NameTransforms, CombinedLogic) { ASSERT_EQ(ToCVariableStyle(PrefixName({"Device", "target", "t"})), "tvm_device_target_t"); ASSERT_EQ(ToCVariableStyle(PrefixGeneratedName({"model", "Devices"})), "tvmgen_model_devices"); } + +TEST(NameTransforms, Internal_ToCamel) { + ASSERT_EQ(ToCamel("Woof"), "Woof"); + ASSERT_EQ(ToCamel("woof"), "Woof"); + ASSERT_EQ(ToCamel("woof_woof"), "WoofWoof"); +} + +TEST(NameTransforms, Internal_ToCamel_Allocation) { + std::string woof = "Woof_woof_woof_woof"; + std::string camel = ToCamel(woof); + std::string check; + check.reserve(woof.size()); + + // Check that the 
pre-allocation happens
+  ASSERT_EQ(camel.capacity(), check.capacity());
+}
+
+}  // namespace backend
+}  // namespace relay
+}  // namespace tvm

From 28c09f686ca44bdc1c35d2563ba73deb519f308d Mon Sep 17 00:00:00 2001
From: krishnaraj36 <45380557+krishnaraj36@users.noreply.github.com>
Date: Tue, 10 Jan 2023 17:09:30 +0530
Subject: [PATCH 146/286] [COLLAGE] Add more customization to support more
 targets (#13450)

* [COLLAGE] Add more customization to support more targets

1. Added a custom cost module that makes it possible to plug in a custom
cost estimator python function instead of using the default cost function.
eg: cost_estimator = CustomCostEstimator(py_fn_estimator="tvm.relay.collage.opencl_cost_estimator")
    mod = CollagePartition(config, cost_estimator=cost_estimator)(mod)

2. Added a provision to select the BYOC fusion style for all compiler targets.
eg: config = { "relay.collage.byoc_fusion_style": ["compiler.NoFusion", "compiler.TVMFusion"]}
    ctxt = tvm.transform.PassContext(config=config)

* Fix lint errors (whitespace, tabs)
* Move the clml collage test case to test_clml
* Fix the import error and the environment var
* Add comments
* Add clml preprocess module in cost estimator
* Fix the comments and remove unwanted code
* Remove TODO comments
* Update naming convention and fix typo errors
* Remove unused and redundant code, prints, and the runner template in the test script

Co-authored-by: kvegiraj 
---
 python/tvm/relay/collage/__init__.py          |   1 +
 python/tvm/relay/collage/collage.py           |   8 +
 python/tvm/relay/op/contrib/clml.py           |  20 +
 src/relay/collage/collage_partitioner.cc      |   2 +-
 src/relay/collage/custom_cost_estimator.cc    |  60 +++
 src/relay/collage/custom_cost_estimator.h     |  67 ++++
 src/relay/collage/gather_partition_specs.cc   |  35 +-
 src/relay/collage/utils.cc                    |  13 +
 src/relay/collage/utils.h                     |   6 +
 .../test_clml/test_adreno_collage_targets.py  | 354 ++++++++++++++++++
 .../relay/collage/demo_collage_partitioner.py |   6 +
 11 files changed, 567 insertions(+), 5 deletions(-)
 create mode 100644 src/relay/collage/custom_cost_estimator.cc
 create mode 100644 src/relay/collage/custom_cost_estimator.h
 create mode 100644 tests/python/contrib/test_clml/test_adreno_collage_targets.py

diff --git a/python/tvm/relay/collage/__init__.py b/python/tvm/relay/collage/__init__.py
index ff0d4866069e..b3b485ead40b 100644
--- a/python/tvm/relay/collage/__init__.py
+++ b/python/tvm/relay/collage/__init__.py
@@ -21,4 +21,5 @@
     WARMUP_MIN_REPEAT_MS,
     CostEstimator,
     MockCostEstimator,
+    CustomCostEstimator,
 )
diff --git a/python/tvm/relay/collage/collage.py b/python/tvm/relay/collage/collage.py
index 632ab1746f51..cfc527c2b977 100644
--- a/python/tvm/relay/collage/collage.py
+++ b/python/tvm/relay/collage/collage.py
@@ -52,6 +52,14 @@ def __init__(self, target_costs, max_estimates=0):
         self.__init_handle_by_constructor__(_ffi_api.MockCostEstimator, target_costs, max_estimates)
 
 
+@register_object("relay.collage.CustomCostEstimator")
+class CustomCostEstimator(Object):
+    
"""CustomEstimator class""" + + def __init__(self, py_fn_estimator="tvm.relay.collage.estimate_seconds_custom"): + self.__init_handle_by_constructor__(_ffi_api.CustomCostEstimator, py_fn_estimator) + + def arg_for(arg_type, device): """Returns a test argument of Relay arg_type on device""" assert isinstance(arg_type, tvm.ir.TensorType) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 77882917b1ad..e6e535edc068 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -23,6 +23,7 @@ from tvm._ffi import register_func from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name +from tvm.relay import function as _function from tvm.relay.expr_functor import ExprMutator from tvm.relay.expr import Call, TupleGetItem @@ -161,6 +162,25 @@ def alter_conv(attrs, inputs, tinfos, out_type): return preprocessed_mod +def preprocess_for_clml(mod): + """Preprocessing pass to alter the layouts for CLML compiler target""" + + for _var in mod.get_global_vars(): + if _var.name_hint == "main": + continue + fn = mod[_var.name_hint] + if "Compiler" in fn.attrs.keys() and fn.attrs["Compiler"] == "clml": + new_fn = fn.body + clml_mod = tvm.IRModule.from_expr(new_fn) + with tvm.transform.PassContext(opt_level=3): + clml_mod = preprocess_module(clml_mod) + new_body = clml_mod["main"].body + mod[_var.name_hint] = _function.Function( + fn.params, new_body, fn.ret_type, fn.type_params, fn.attrs + ) + return mod + + @register_pattern_table("clml") def clml_pattern_table(): """Get the CLML pattern table.""" diff --git a/src/relay/collage/collage_partitioner.cc b/src/relay/collage/collage_partitioner.cc index ac038fba2a8c..54fc6c45ca70 100644 --- a/src/relay/collage/collage_partitioner.cc +++ b/src/relay/collage/collage_partitioner.cc @@ -55,7 +55,7 @@ namespace { TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.tvm_max_depth", Integer); TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.byoc_max_depth", Integer); - +TVM_REGISTER_PASS_CONFIG_OPTION("relay.collage.byoc_fusion_style", Array); /*! * \brief Represents the overall expression after some number of non-overlapping candidate * partitions have been applied. diff --git a/src/relay/collage/custom_cost_estimator.cc b/src/relay/collage/custom_cost_estimator.cc new file mode 100644 index 000000000000..dea4df072cac --- /dev/null +++ b/src/relay/collage/custom_cost_estimator.cc @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/custom_cost_estimator.cc + * \brief A custom CostEstimator to support alternative cost functions. 
+ */ + +#include "./custom_cost_estimator.h" + +#include + +namespace tvm { +namespace relay { +namespace collage { + +TVM_REGISTER_OBJECT_TYPE(CustomCostEstimatorNode); + +Cost CustomCostEstimatorNode::Estimate(const IRModule& mod, const Target& target) const { + static const runtime::PackedFunc* estimate_seconds = runtime::Registry::Get(py_fn_estimator_); + ICHECK(estimate_seconds); + const double value = (*estimate_seconds)(mod, target); + if (std::isinf(value)) { + return Cost::Invalid(); + } else if (std::isnan(value)) { + return Cost::Unknown(); + } else { + return Cost::Value(value); + } +} + +CustomCostEstimator::CustomCostEstimator(String py_fn_estimator) { + auto node = make_object(); + node->py_fn_estimator_ = std::move(py_fn_estimator); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("relay.collage.CustomCostEstimator").set_body_typed([](String py_fn_estimator) { + return CustomCostEstimator(std::move(py_fn_estimator)); +}); + +} // namespace collage +} // namespace relay +} // namespace tvm diff --git a/src/relay/collage/custom_cost_estimator.h b/src/relay/collage/custom_cost_estimator.h new file mode 100644 index 000000000000..4e6b45832eb2 --- /dev/null +++ b/src/relay/collage/custom_cost_estimator.h @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/collage/custom_cost_estimator.cc + * \brief A custom CostEstimator to support target-specific cost functions. + */ + +#ifndef TVM_RELAY_COLLAGE_CUSTOM_COST_ESTIMATOR_H_ +#define TVM_RELAY_COLLAGE_CUSTOM_COST_ESTIMATOR_H_ + +#include + +#include "./cost.h" +#include "./cost_estimator.h" + +namespace tvm { +namespace relay { +namespace collage { + +/*! + * \brief A cost estimator that uses a target-specific cost function. + */ +class CustomCostEstimatorNode : public CostEstimatorNode { + public: + Cost Estimate(const IRModule& mod, const Target& target) const override; + + static constexpr const char* _type_key = "relay.collage.CustomCostEstimator"; + TVM_DECLARE_FINAL_OBJECT_INFO(CustomCostEstimatorNode, CostEstimatorNode); + + protected: + /*! + * \brief Python implemented cost function name. 
+ */ + String py_fn_estimator_; + + friend class CustomCostEstimator; +}; + +class CustomCostEstimator : public CostEstimator { + public: + explicit CustomCostEstimator(String py_fn_estimator); + + TVM_DEFINE_OBJECT_REF_METHODS(CustomCostEstimator, CostEstimator, CustomCostEstimatorNode); +}; + +} // namespace collage +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_COLLAGE_CUSTOM_COST_ESTIMATOR_H_ diff --git a/src/relay/collage/gather_partition_specs.cc b/src/relay/collage/gather_partition_specs.cc index 7e2836790892..ad451673341d 100644 --- a/src/relay/collage/gather_partition_specs.cc +++ b/src/relay/collage/gather_partition_specs.cc @@ -89,11 +89,9 @@ PartitionRule MakeTVMPartitionRule() { } /*! - * \brief Returns the fusion style for \p compiler. - * - * TODO(mbs): Defer to per-BYOC integration definition. + * \brief Returns the fusion style for default compiler. */ -BYOCStyle BYOCFusionStyleForCompiler(const String& compiler) { +BYOCStyle DefaultBYOCFusionStyleForCompiler(const String& compiler) { if (compiler == "cutlass" || compiler == "cublas" || compiler == "cudnn") { return kNoFusionBYOCStyle; } else if (compiler == "tensorrt") { @@ -103,6 +101,35 @@ BYOCStyle BYOCFusionStyleForCompiler(const String& compiler) { } } +/*! + * \brief Returns the fusion style for given compiler. + */ +BYOCStyle BYOCFusionStyleForCompiler(const String& compiler) { + tvm::transform::PassContext ctxt = tvm::transform::PassContext::Current(); + std::string config_key = "relay.collage.byoc_fusion_style"; + Optional> byoc_configs = ctxt->GetConfig(config_key, Optional>()); + BYOCStyle byoc_fusion_style = DefaultBYOCFusionStyleForCompiler(compiler); + if (!byoc_configs) { + return byoc_fusion_style; + } + for (auto config_ : byoc_configs.value()) { + std::vector byoc_cfg = SplitString(config_, "."); + if (byoc_cfg[0] == compiler) { + if (byoc_cfg[1] == "NoFusion") { + byoc_fusion_style = kNoFusionBYOCStyle; + } else if (byoc_cfg[1] == "TVMFusion") { + byoc_fusion_style = kTVMFusionBYOCStyle; + } else if (byoc_cfg[1] == "ArbitraryFusion") { + byoc_fusion_style = kArbitraryFusionBYOCStyle; + } else { + ICHECK(false) << "Invalid fusion name for compiler " << byoc_cfg[0] << " in pass context"; + } + break; + } + } + return byoc_fusion_style; +} + /*! * \brief Returns the primitive combiner rules which allow for any touching candidates * to be fused provided they don't have kind \p kOpaque. diff --git a/src/relay/collage/utils.cc b/src/relay/collage/utils.cc index cad29c4f6e6c..451e18c219d6 100644 --- a/src/relay/collage/utils.cc +++ b/src/relay/collage/utils.cc @@ -134,6 +134,19 @@ bool MustBeLowered(const Expr& expr) { return false; } +std::vector SplitString(std::string stmt, const char* del) { + std::vector str_tokens; + int start = 0; + int end = stmt.find(del, 0); + str_tokens.emplace_back(stmt.substr(start, end)); + while (end != -1) { + stmt = stmt.substr(end + 1, stmt.size()); + end = stmt.find(del, 0); + str_tokens.emplace_back(stmt.substr(start, end)); + } + return str_tokens; +} + } // namespace collage } // namespace relay } // namespace tvm diff --git a/src/relay/collage/utils.h b/src/relay/collage/utils.h index 4c0493cdd675..630b3b22f199 100644 --- a/src/relay/collage/utils.h +++ b/src/relay/collage/utils.h @@ -31,6 +31,7 @@ #include #include +#include namespace tvm { namespace relay { @@ -79,6 +80,11 @@ bool IsSpecialOp(const OpNode* op_node); */ bool MustBeLowered(const Expr& expr); +/*! + * \brief Returns the list of split strings of given statement with delimiter. 
+ */ +std::vector SplitString(std::string stmt, const char* del); + } // namespace collage } // namespace relay } // namespace tvm diff --git a/tests/python/contrib/test_clml/test_adreno_collage_targets.py b/tests/python/contrib/test_clml/test_adreno_collage_targets.py new file mode 100644 index 000000000000..d08b76c3b582 --- /dev/null +++ b/tests/python/contrib/test_clml/test_adreno_collage_targets.py @@ -0,0 +1,354 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Compares Collage with various other baselines.""" + +import tvm +import logging +import tempfile +import os +import shutil +import numpy as np +from tvm.relay import testing +from tvm import rpc +from tvm.contrib import utils, ndk +from tvm.relay.build_module import bind_params_by_name + +# The following are necessary to force global functions or pattern tables to be registered +from tvm.relay.collage.collage import * +from tvm.relay.op.contrib import clml +import pytest + +logging.basicConfig(level=logging.INFO) + + +########### Configuration ########### + +### +### TVM Opencl AutoTvm log file name +### +TUNING_LOG = "" + +### +### If true, run all models +### +ALL_MODELS = False + +### +### If true, run all configurations +### +ALL_CONFIGS = False + +### +### How aggressively to look for candidates? +### +TVM_MAX_DEPTH = 8 +BYOC_MAX_DEPTH = 8 + +### +### AutoTVM tuning parameters. 
+### +AUTOTVM_NUM_TRIALS = 1024 +AUTOTVM_EARLY_STOPPING = 600 +TIMEOUT = 10 +MEASURE_NUMBER = tvm.relay.collage.MEASURE_NUMBER +MEASURE_REPEAT = tvm.relay.collage.MEASURE_REPEAT +WARMUP_MIN_REPEAT_MS = tvm.relay.collage.WARMUP_MIN_REPEAT_MS + +## +## RPC Build configuration +## +HOST = tvm.target.Target("llvm -mtriple=arm64-linux-android") +OPENCL = tvm.target.Target("opencl", HOST) +RPC_TRACKER_HOST = os.getenv("TVM_TRACKER_HOST", "localhost") +RPC_TRACKER_PORT = int(os.getenv("TVM_TRACKER_PORT", 9090)) +RPC_KEY = os.getenv("RPC_DEVICE_KEY", "android") +NDK_CROSS_COMPILER = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + + +########### AutoTVM tuning helpers ########### + + +def extract_autotvm_tasks(mod, target): + """Returns TVM kernels to tune for mod and target.""" + return tvm.autotvm.task.extract_from_program(mod, target=target, params=None) + + +def optional_tuning_records(log_filename): + """Returns existing tuning records, if any.""" + if log_filename == "" or not os.path.exists(log_filename): + return tvm.autotvm.task.FallbackContext() + else: + return tvm.autotvm.task.ApplyHistoryBest(log_filename) + + +def is_already_tuned(task, log_filename): + """Returns True if we already have a tuning record for task in turning logs in log_filename""" + if not os.path.exists(log_filename): + return False + + dispatch_context = tvm.autotvm.task.ApplyHistoryBest(log_filename) + return dispatch_context._query_inside(task.target, task.workload) + + +def tune_autotvm_tasks(tasks, log_filename): + """Appends to log filename the best strategies for tasks""" + if len(tasks) == 0: + return + + measure_option = tvm.autotvm.measure_option( + builder=tvm.autotvm.LocalBuilder(build_func=ndk.create_shared, timeout=15), + runner=tvm.autotvm.RPCRunner( + RPC_KEY, host=RPC_TRACKER_HOST, port=RPC_TRACKER_PORT, number=100, timeout=15 + ), + ) + + logging.info( + f"Using autotvm tuning for {len(tasks)} tasks with {AUTOTVM_NUM_TRIALS} trials, logging to {log_filename}" + ) + + # create tmp log file, starting with contents from existing log file + tmp_log_filename = log_filename + ".tmp" + if os.path.exists(tmp_log_filename): + os.remove(tmp_log_filename) + if os.path.exists(log_filename): + logging.info(f"Copying existing log {log_filename} to {tmp_log_filename}") + shutil.copy(log_filename, tmp_log_filename) + + for i, task in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + logging.info(f"Considering task {task.name} {prefix}") + if is_already_tuned(task, tmp_log_filename): + logging.info(f"Re-using existing record for {task.name}") + continue + + logging.info(f"Using autotvm to tune {task.name}") + tuner_obj = tvm.autotvm.tuner.XGBTuner(task, loss_type="rank") + if os.path.exists(tmp_log_filename): + tuner_obj.load_history(tvm.autotvm.record.load_from_file(tmp_log_filename)) + + # do tuning + n_trial = min(AUTOTVM_NUM_TRIALS, len(task.config_space)) + tuner_obj.tune( + n_trial=n_trial, + early_stopping=AUTOTVM_EARLY_STOPPING, + measure_option=measure_option, + callbacks=[ + tvm.autotvm.callback.progress_bar(n_trial, prefix=prefix), + tvm.autotvm.callback.log_to_file(tmp_log_filename), + ], + ) + + # Pick best records and copy back to main log file + tvm.autotvm.record.pick_best(tmp_log_filename, log_filename) + os.remove(tmp_log_filename) + + logging.info("Done with autotvm tuning") + + +def autotvm_tune_module(mod, target, log_filename): + if log_filename == "": + logging.info("Not tuning with autotvm since disabled") + return + # Extract and tune any TVM kernels. 
BYOC partitions will have no tasks extracted. + logging.info("Extracting tasks from overall module") + tasks = extract_autotvm_tasks(mod, target) + logging.info(f"Auto-tuning {len(tasks)} tasks from overall module") + tune_autotvm_tasks(tasks, log_filename) + + +########### Drivers ########### + + +def compile_and_benchmark(label, model, targets, tmp_dir): + """Compile model for target and run it with profiling.""" + logging.info(f"Compiling {model['name']} using {label} with {targets}...") + mod = model["mod"] + mod = clml.preprocess_for_clml(mod) + exe = tvm.relay.vm.compile(mod, target=targets, params=model["params"]) + lib = exe.mod + lib_path = os.path.join(tmp_dir, "lib.so") + logging.info(f"Exporting library to {lib_path}...") + lib.export_library(lib_path, cc=NDK_CROSS_COMPILER) + tracker = rpc.connect_tracker(RPC_TRACKER_HOST, RPC_TRACKER_PORT) + remote = tracker.request(RPC_KEY, priority=0, session_timeout=600) + ctx = remote.cl(0) + remote_path = "lib.so" + remote.upload(lib_path, target=remote_path) + lib = remote.load_module(remote_path) + vm_factory = tvm.runtime.vm.VirtualMachine(lib, ctx) + args = {v.name_hint: arg_for(v.checked_type, ctx) for v in mod["main"].params} + logging.info(f"Benchmarking for {model['name']} generated by {label}...") + profile = vm_factory.benchmark( + ctx, repeat=MEASURE_REPEAT, number=MEASURE_NUMBER, min_repeat_ms=0, **args + ) + logging.info(f"Benchmarked for {model['name']} generated by {label}: {profile}") + logging.info(f"RESULT: {label} | {model['name']} | {profile.median * 1e3}ms") + + +# Custom cost function for Opencl RPC targets. +@register_func("tvm.relay.collage.opencl_cost_estimator") +def opencl_cost_estimator(mod, target): + mod = clml.preprocess_for_clml(mod) if "clml" == target.kind.name else mod + try: + # Build the module. + logging.info("Compiling module to estimate") + exe = tvm.relay.vm.compile(mod, target) + except RuntimeError as err: + # A build failure indicates the partition is not supported. + # eg trying to build an nn.batch_norm on GPU, which has no schedule since we assume it + # is only ever used with a tuple projection which is rewritten away. 
+ logging.info("Assigning module infinite cost since unable to build: %s", err) + return math.inf + + lib = exe.mod + tracker = rpc.connect_tracker(RPC_TRACKER_HOST, RPC_TRACKER_PORT) + remote = tracker.request(RPC_KEY, priority=0, session_timeout=600) + temp = utils.tempdir() + dso_binary = "dev_lib_cl.so" + dso_binary_path = temp.relpath(dso_binary) + ctx = remote.cl(0) + lib.export_library(dso_binary_path, cc=NDK_CROSS_COMPILER) + remote_path = dso_binary + remote.upload(dso_binary_path, target=remote_path) + lib = remote.load_module(remote_path) + + vm_factory = tvm.runtime.vm.VirtualMachine(lib, ctx) + func_name = "main" + main_args = {v.name_hint: arg_for(v.checked_type, ctx) for v in mod[func_name].params} + cost = vm_factory.benchmark( + ctx, repeat=5, number=20, min_repeat_ms=0, func_name=func_name, **main_args + ) + return cost.mean + + +def collage(model): + """Run the Collage partitioner for a set of Opencl Adreno related targets and profile the result""" + logging.info(f"collage | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + autotvm_tune_module(model["mod"], OPENCL, TUNING_LOG) + with optional_tuning_records(TUNING_LOG): + targets = [] + targets.append(OPENCL) + use_fp16 = model["main_dtype"] == "float16" + tmp_dir = tempfile.mkdtemp() + targets.append(tvm.target.Target("clml", HOST)) + + # Register byoc fusion style for compiler with available + # options [compiler.NoFusion | compiler.TVMFusion | compiler.MaxDepthFusion] + config = { + "relay.collage.tvm_max_depth": TVM_MAX_DEPTH, + "relay.collage.byoc_max_depth": BYOC_MAX_DEPTH, + "relay.collage.byoc_fusion_style": ["clml.NoFusion"], + } + logging.info(f"Using PassContext(config={config}") + ctxt = tvm.transform.PassContext(config=config) + config = tvm.target.make_compilation_config(ctxt, targets) + with ctxt: + mod = model["mod"] + mod = tvm.relay.transform.CapturePostDfsIndexInSpans()(mod) + logging.info("-------------- BEGIN INDEXED --------------") + logging.info(mod) + logging.info("-------------- END INDEXED ----------------") + # Register python custom cost function for targets in + # custom cost estimator module. 
+ cost_estimator = CustomCostEstimator( + py_fn_estimator="tvm.relay.collage.opencl_cost_estimator" + ) + mod = tvm.relay.transform.CollagePartition(config, cost_estimator=cost_estimator)(mod) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + compile_and_benchmark("collage", partitioned_model, targets, tmp_dir) + + +def just_clml(model): + """Run partition_for_clml, complete the compilation with TVM, and profile the result.""" + logging.info(f"just_clml | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + logging.info("Partitioning for CLML...") + mod = tvm.relay.op.contrib.clml.partition_for_clml(model["mod"], model["params"]) + partitioned_model = model.copy() + partitioned_model["mod"] = mod + logging.info("-------------- BEGIN PARTITIONED --------------") + logging.info(partitioned_model["mod"]) + logging.info("-------------- END PARTITIONED ----------------") + targets = [] + targets.append(OPENCL) + targets.append(tvm.target.Target("clml", HOST)) + compile_and_benchmark("just_clml", partitioned_model, targets, tmp_dir) + + +def just_tvm(model): + """Compile and profile using vanilla TVM.""" + logging.info(f"just_tvm | {model['name']}") + logging.info("-------------- BEGIN ORIGINAL --------------") + logging.info(model["mod"]) + logging.info("-------------- END ORIGINAL ----------------") + tmp_dir = tempfile.mkdtemp() + autotvm_tune_module(model["mod"], OPENCL, TUNING_LOG) + with optional_tuning_records(TUNING_LOG): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + compile_and_benchmark("just_tvm", model, OPENCL, tmp_dir) + + +def get_model(model_name, dtype): + + if "mobilenet" in model_name: + mod, params = testing.mobilenet.get_workload(batch_size=1, dtype=dtype) + elif "resnet" in model_name: + mod, params = testing.resnet.get_workload(num_layers=50, batch_size=1, dtype=dtype) + if params: + mod["main"] = bind_params_by_name(mod["main"], params) + mod = tvm.relay.transform.FoldConstant()(mod) + return { + "name": model_name, + "input_shapes": {"data": [1, 3, 224, 224]}, + "input_dtypes": {"data": dtype}, + "mod": mod, + "params": params, + "main_dtype": dtype, + } + + +########### Runners ########### +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def run_resnet50(dtype): + + just_clml(get_model("resnet-50", dtype)) + just_tvm(get_model("resnet-50", dtype)) + """Run Collage for tvm and clml compiler target.""" + collage(get_model("resnet-50", dtype)) + + +@pytest.mark.parametrize("dtype", ["float32"]) +@tvm.testing.requires_openclml +def run_mobilenetv1(dtype): + + just_clml(get_model("mobilenet", dtype)) + just_tvm(get_model("mobilenet", dtype)) + """Run Collage for tvm and clml compiler target.""" + collage(get_model("mobilenet", dtype)) diff --git a/tests/python/relay/collage/demo_collage_partitioner.py b/tests/python/relay/collage/demo_collage_partitioner.py index 47f2612d7f16..2c9314516746 100644 --- a/tests/python/relay/collage/demo_collage_partitioner.py +++ b/tests/python/relay/collage/demo_collage_partitioner.py @@ -264,6 +264,12 @@ def collage(model): config = { 
"relay.collage.tvm_max_depth": TVM_MAX_DEPTH, "relay.collage.byoc_max_depth": BYOC_MAX_DEPTH, + "relay.collage.byoc_fusion_style": [ + "cutlass.NoFusion", + "cublas.NoFusion", + "cudnn.NoFusion", + "tensorrt.TVMFusion", + ], } logging.info(f"Using PassContext(config={config}") ctxt = tvm.transform.PassContext(config=config) From dcf05ef54dd3c6e0b92d26bdc140d9c042212231 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Tue, 10 Jan 2023 15:08:37 +0000 Subject: [PATCH 147/286] [Docker update] Update ci_cpu tag to the latest from tlcpackstaging (#13748) --- ci/jenkins/docker-images.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index 40e1b8a1313f..76f8a5cf3e38 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -19,7 +19,7 @@ [jenkins] ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea ci_cortexm: tlcpack/ci-cortexm:20221013-060115-61c9742ea -ci_cpu: tlcpack/ci-cpu:20221013-060115-61c9742ea +ci_cpu: tlcpack/ci-cpu:20230110-070003-d00168ffb ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea ci_i386: tlcpack/ci-i386:20221013-060115-61c9742ea From b53706e5d192f51d0d58ff2237da62c1d869e960 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Tue, 10 Jan 2023 17:44:38 +0000 Subject: [PATCH 148/286] [CMSIS-NN] Support CMSIS NN from new GitHub location (#13656) CMSIS NN has been moved out of the CMSIS project into a new GitHub project. This pr adds support for CMSIS NN from this new GitHub location. Both CMSIS and CMSIS NN are now downloaded in /opt/arm/ethosu/cmsis I updated the pr with three commits: - First, I supported CMSIS NN from the new GitHub location, as previously done. - Second, I prevented the definition of the `cmsis_path` project API option from an environment variable. Before, thanks to the environment variable, the `cmsis_path` option was always enabled. That was not a problem, but now CMSIS NN uses a new header (`arm_acle.h`) which is not always present, so we need to explicitly enable CMSIS when we need it. - In the end, I re-added support for the old location of CMSIS NN because the docker image is not yet updated, and we need the tests to pass to accept this pr. In this way, tvm will use the new CMSIS NN project when we will update the docker image, but for now, it uses the old one. I'll create a pr that reverts this last commit when the docker image is updated. 
--- apps/microtvm/cmsisnn/Makefile | 9 +++- apps/microtvm/cmsisnn/README.md | 1 + apps/microtvm/ethosu/Makefile | 9 +++- apps/microtvm/ethosu/README.md | 1 + .../template_project/CMakeLists.txt.template | 26 ++++++----- apps/microtvm/zephyr_cmsisnn/CMakeLists.txt | 44 ++++++++++++------- apps/microtvm/zephyr_cmsisnn/README.md | 5 +++ docker/install/ubuntu_install_cmsis.sh | 4 ++ .../ubuntu_install_ethosu_driver_stack.sh | 3 ++ python/tvm/micro/project_api/server.py | 2 +- tests/micro/zephyr/test_zephyr.py | 3 +- tests/python/relay/aot/corstone300.mk | 9 +++- 12 files changed, 82 insertions(+), 34 deletions(-) diff --git a/apps/microtvm/cmsisnn/Makefile b/apps/microtvm/cmsisnn/Makefile index db72ab889663..e7d1b7081d54 100644 --- a/apps/microtvm/cmsisnn/Makefile +++ b/apps/microtvm/cmsisnn/Makefile @@ -31,6 +31,11 @@ CMAKE ?= cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib +ifeq ($(shell [ -d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0) + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN +else + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN +endif PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ @@ -38,7 +43,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${CORSTONE_300_PATH} \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ - -I${CMSIS_PATH}/CMSIS/NN/Include \ + -I${CMSIS_NN_PATH}/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include CMSIS_NN_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ @@ -56,7 +61,7 @@ DEMO_MAIN = src/demo_bare_metal.c CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) -CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/*/*.c) +CMSIS_NN_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/*/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo diff --git a/apps/microtvm/cmsisnn/README.md b/apps/microtvm/cmsisnn/README.md index befcda6bb063..3f32c44baed5 100644 --- a/apps/microtvm/cmsisnn/README.md +++ b/apps/microtvm/cmsisnn/README.md @@ -35,6 +35,7 @@ If the demo is not run in the ci_cpu Docker container, then you will need the fo - [GCC toolchain from Arm(R)](https://developer.arm.com/-/media/Files/downloads/gnu-rm/10-2020q4/gcc-arm-none-eabi-10-2020-q4-major-x86_64-linux.tar.bz2) - [Arm(R) Ethos(TM)-U NPU driver stack](https://review.mlplatform.org) - [CMSIS](https://github.com/ARM-software/CMSIS_5) + - [CMSIS NN](https://github.com/ARM-software/CMSIS-NN) - The python libraries listed in the requirements.txt of this directory - These can be installed by running the following from the current directory: ```bash diff --git a/apps/microtvm/ethosu/Makefile b/apps/microtvm/ethosu/Makefile index 1b79548eaf62..63f8adbc2790 100644 --- a/apps/microtvm/ethosu/Makefile +++ b/apps/microtvm/ethosu/Makefile @@ -32,6 +32,11 @@ CMAKE ?= cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib +ifeq ($(shell [ -d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0) + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN +else + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN +endif PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ @@ -40,7 +45,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${ETHOSU_PATH}/core_driver/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ 
\ -I${CMSIS_PATH}/CMSIS/Core/Include \ - -I${CMSIS_PATH}/CMSIS/NN/Include \ + -I${CMSIS_NN_PATH}/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include \ -DETHOSU_TEST_RUNNER_TOL=${ETHOSU_TEST_RUNNER_TOL} @@ -78,7 +83,7 @@ endif CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) -CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c) +CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo diff --git a/apps/microtvm/ethosu/README.md b/apps/microtvm/ethosu/README.md index 1f08928a6ee4..69834837ac42 100644 --- a/apps/microtvm/ethosu/README.md +++ b/apps/microtvm/ethosu/README.md @@ -35,6 +35,7 @@ If the demo is not run in the ci_cpu Docker container, then you will need the fo - [GCC toolchain from Arm(R)](https://developer.arm.com/-/media/Files/downloads/gnu-rm/10-2020q4/gcc-arm-none-eabi-10-2020-q4-major-x86_64-linux.tar.bz2) - [Arm(R) Ethos(TM)-U NPU driver stack](https://review.mlplatform.org) - [CMSIS](https://github.com/ARM-software/CMSIS_5) + - [CMSIS NN](https://github.com/ARM-software/CMSIS-NN) - The python libraries listed in the requirements.txt of this directory - These can be installed by running the following from the current directory: ```bash diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template index a41d68a134ef..17200f7e9704 100644 --- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template +++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template @@ -31,21 +31,27 @@ find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) project(microtvm_autogenerated_project) if(DEFINED CMSIS_PATH) + if (EXISTS ${CMSIS_PATH}/CMSIS-NN) + set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS-NN) + else() + set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS/NN) + endif() + file(GLOB_RECURSE cmsis_lib_srcs - ${CMSIS_PATH}/CMSIS/NN/Source/ActivationFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/BasicMathFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConcatenationFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/FullyConnectedFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/PoolingFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/ReshapeFunctions/*.c - ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c + ${CMSIS_NN_PATH}/Source/ActivationFunctions/*.c + ${CMSIS_NN_PATH}/Source/BasicMathFunctions/*.c + ${CMSIS_NN_PATH}/Source/ConcatenationFunctions/*.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/*.c + ${CMSIS_NN_PATH}/Source/FullyConnectedFunctions/*.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/*.c + ${CMSIS_NN_PATH}/Source/PoolingFunctions/*.c + ${CMSIS_NN_PATH}/Source/ReshapeFunctions/*.c + ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/*.c ) set(cmsis_includes ${CMSIS_PATH}/CMSIS/Core/Include - ${CMSIS_PATH}/CMSIS/NN/Include + ${CMSIS_NN_PATH}/Include ${CMSIS_PATH}/CMSIS/DSP/Include ${CMSIS_PATH}/CMSIS/DSP/Include/dsp ) diff --git a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt index 0ca000d48e6a..9dec75dc5030 100644 --- a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt +++ b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt @@ -51,22 +51,34 @@ set(DATA_FILES 
${CMAKE_CURRENT_BINARY_DIR}/outputs.c ${CMAKE_CURRENT_BINARY_DIR}/labels.c ) + +if (EXISTS ${CMSIS_PATH}/CMSIS-NN) + set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS-NN) + set(CMSIS_NN_ADDITIONAL_SOURCES + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c + ) +else() + set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS/NN) +endif() + set(CMSIS_SOURCES - ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/arm_softmax_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c - ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c + ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/arm_softmax_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_s8.c + ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c + ${CMSIS_NN_PATH}/Source/FullyConnectedFunctions/arm_fully_connected_s8.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c + ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c + ${CMSIS_NN_ADDITIONAL_SOURCES} ) add_custom_command( @@ -93,5 +105,5 @@ target_sources(app PRIVATE target_include_directories(app PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/runtime/include ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/include - PUBLIC ${CMSIS_PATH}/CMSIS/NN/Include/ ${CMSIS_PATH}/CMSIS/DSP/Include + PUBLIC ${CMSIS_NN_PATH}/Include/ ${CMSIS_PATH}/CMSIS/DSP/Include ) diff --git a/apps/microtvm/zephyr_cmsisnn/README.md b/apps/microtvm/zephyr_cmsisnn/README.md index df54acfbc736..334af71d0c77 100644 --- a/apps/microtvm/zephyr_cmsisnn/README.md +++ b/apps/microtvm/zephyr_cmsisnn/README.md @@ -37,6 +37,11 @@ Checkout [CMSIS_5](https://github.com/ARM-software/CMSIS_5.git) (default is `/op git clone "https://github.com/ARM-software/CMSIS_5.git" cmsis ``` +Checkout [CMSIS 
NN](https://github.com/ARM-software/CMSIS-NN.git) (default is `/opt/arm/ethosu/cmsis/CMSIS-NN` to reflect `tlcpack/ci_cortexm`): +``` +git clone "https://github.com/ARM-software/CMSIS-NN.git" cmsis/CMSIS-NN +``` + And run the demo using `west`, with the path to CMSIS: ``` west build -t run -- -DCMSIS_PATH=/opt/arm/ethosu/cmsis diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh index a41e3df0ae55..1561cd3dfda7 100755 --- a/docker/install/ubuntu_install_cmsis.sh +++ b/docker/install/ubuntu_install_cmsis.sh @@ -48,4 +48,8 @@ wget ${CMSIS_URL} -O "${DOWNLOAD_PATH}" echo "$CMSIS_SHASUM" ${DOWNLOAD_PATH} | sha512sum -c tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1 touch "${INSTALLATION_PATH}"/"${CMSIS_SHA}".sha + +CMSIS_NN_TAG="v4.0.0" +CMSIS_NN_URL="https://github.com/ARM-software/CMSIS-NN.git" +git clone ${CMSIS_NN_URL} --branch ${CMSIS_NN_TAG} --single-branch ${INSTALLATION_PATH}/CMSIS-NN echo "SUCCESS" diff --git a/docker/install/ubuntu_install_ethosu_driver_stack.sh b/docker/install/ubuntu_install_ethosu_driver_stack.sh index da2f955d3fb7..0fb35b13e797 100755 --- a/docker/install/ubuntu_install_ethosu_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosu_driver_stack.sh @@ -85,6 +85,9 @@ cmake -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-non make # Build NN Library +mkdir ${CMSIS_PATH}/CMSIS-NN/build/ && cd ${CMSIS_PATH}/CMSIS-NN/build/ +cmake .. -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55 -DBUILD_CMSIS_NN_FUNCTIONS=YES -DCMSIS_PATH=${CMSIS_PATH} + mkdir ${CMSIS_PATH}/CMSIS/NN/build/ && cd ${CMSIS_PATH}/CMSIS/NN/build/ cmake .. -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55 -DBUILD_CMSIS_NN_FUNCTIONS=YES make diff --git a/python/tvm/micro/project_api/server.py b/python/tvm/micro/project_api/server.py index 2d5db09f4bbe..5aed3a896241 100644 --- a/python/tvm/micro/project_api/server.py +++ b/python/tvm/micro/project_api/server.py @@ -804,7 +804,7 @@ def default_project_options(**kw) -> typing.List[ProjectOption]: "cmsis_path", optional=["generate_project"], type="str", - default=os.environ.get("CMSIS_PATH", None), + default=None, help="Path to the CMSIS directory.", ), ProjectOption( diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 6b49c043cc3d..a053c905aa34 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -605,6 +605,7 @@ def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_deb "project_type": "host_driven", "verbose": bool(build_config.get("debug")), "board": board, + "cmsis_path": os.getenv("CMSIS_PATH"), "use_fvp": bool(use_fvp), } @@ -623,7 +624,7 @@ def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_deb assert "CMSIS/DSP/Include" in cmake_content assert "CMSIS/DSP/Include/dsp" in cmake_content assert "CMSIS/DSP/Include" in cmake_content - assert "CMSIS/NN/Include" in cmake_content + # assert "CMSIS-NN/Include" in cmake_content if __name__ == "__main__": diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk index cb1db5ea9995..ebda50d9cfef 100644 --- a/tests/python/relay/aot/corstone300.mk +++ b/tests/python/relay/aot/corstone300.mk @@ -48,6 +48,11 @@ CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib CC_OPTS = CC=$(CC) AR=$(AR) RANLIB=$(RANLIB) +ifeq ($(shell [ 
-d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0) + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN +else + CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN +endif PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ ${CFLAGS} \ -I$(build_dir)/../include \ @@ -57,7 +62,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${DRIVER_PATH}/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ - -I${CMSIS_PATH}/CMSIS/NN/Include \ + -I${CMSIS_NN_PATH}/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -isystem$(STANDALONE_CRT_DIR)/include DRIVER_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(ETHOSU_TEST_ROOT)/arm-none-eabi-gcc.cmake \ @@ -78,7 +83,7 @@ CC_CODEGEN_SRCS = $(shell find $(abspath $(CODEGEN_ROOT)/host/src/*.cc)) C_CODEGEN_OBJS = $(subst .c,.o,$(C_CODEGEN_SRCS)) CC_CODEGEN_OBJS = $(subst .cc,.o,$(CC_CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(shell find ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) -CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/*/*.c) +CMSIS_NN_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/*/*.c) UART_SRCS = $(shell find ${PLATFORM_PATH}/*.c) ifdef ETHOSU_TEST_ROOT From 4c1da541af865c1bddd05d690e9f854da6f4adb6 Mon Sep 17 00:00:00 2001 From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com> Date: Tue, 10 Jan 2023 21:46:43 +0400 Subject: [PATCH 149/286] [microNPU] Add a legalization test for TFLite PAD (#13750) Added a legalization test for stand-alone pad operation which is legalized to depthwise operation on the NPU. --- .../contrib/test_ethosu/test_legalize.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py index 9b4dd467ff9f..5ddc7565f20c 100644 --- a/tests/python/contrib/test_ethosu/test_legalize.py +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -674,6 +674,106 @@ def verify(ext_func): verify(mod["tvmgen_default_ethos_u_main_0"]) +@pytest.mark.parametrize("ifm_shape", [(1, 55, 55, 3), (1, 23, 32, 7)]) +@pytest.mark.parametrize("padding", [(0, 1, 0, 0), (1, 1, 1, 1), (1, 1, 5, 5)]) +@pytest.mark.parametrize("const_value", [0, 5, 125, -5]) +def test_tflite_separate_padding_legalize(ifm_shape, padding, const_value): + dtype = "int8" + kernel_shape = (1, 1) + strides = (1, 1) + dilation = (1, 1) + + def create_tflite_graph(): + class Model(tf.Module): + @tf.function + def tf_function(self, x): + return tf.pad( + x, + [[0, 0], [padding[0], padding[2]], [padding[1], padding[3]], [0, 0]], + "CONSTANT", + const_value, + ) + + model = Model() + concrete_func = model.tf_function.get_concrete_function( + tf.TensorSpec(ifm_shape, dtype=tf.float32) + ) + # Convert the model + def representative_dataset(): + for _ in range(100): + data = np.random.rand(*tuple(ifm_shape)) + yield [data.astype(np.float32)] + + converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return tflite_model + + def verify(ext_func): + op = ext_func.body + ofm_channels = op.attrs.ofm_channels + + # check IFM + ifm = op.args[0].checked_type + assert list(ifm.shape) == list(ifm_shape) + assert str(ifm.dtype) == dtype + assert ifm.shape[3] == ofm_channels + + # check OFM + ofm = op.checked_type + expected_ofm_shape = infra.compute_ofm_shape( + 
ifm_shape, padding, kernel_shape, strides, dilation + ) + assert list(ofm.shape) == list(expected_ofm_shape) + assert str(ofm.dtype) == dtype + assert ofm.shape[3] == ofm_channels + + # check weights + weights_ohwi = op.args[1].data.asnumpy() + assert str(weights_ohwi.dtype) == dtype + assert weights_ohwi.shape[0] == ofm_channels + assert weights_ohwi.shape[1] == kernel_shape[0] + assert weights_ohwi.shape[2] == kernel_shape[1] + assert weights_ohwi.shape[3] == 1 # only depth multiplier 1 is supported + + # Check that scale_bias matches weight tensor + assert list(op.args[2].checked_type.shape)[0] == ofm_channels + + assert list(op.attrs.padding) == list(padding) + assert op.attrs.ofm_channels == ofm_channels + assert list(op.attrs.strides) == list(strides) + assert list(op.attrs.dilation) == list(dilation) + + pad_pattern_table = [ + ( + ethosu.PadParams.composite_name, + ethosu.pad_pattern(), + lambda pat: ethosu.PadParams(pat).is_valid(), + ) + ] + + tflite_graph = create_tflite_graph() + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + + mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": ifm_shape}, + dtype_dict={"input": dtype}, + ) + + mod["main"] = bind_params_by_name(mod["main"], params) + mod = partition_ethosu_by_table(mod, pad_pattern_table) + + mod["tvmgen_default_ethos_u_main_0"] = dataflow_pattern.rewrite( + legalize.PadRewriter(), mod["tvmgen_default_ethos_u_main_0"] + ) + verify(mod["tvmgen_default_ethos_u_main_0"]) + + @pytest.mark.parametrize("pooling_type", ["MAX", "AVG"]) @pytest.mark.parametrize("ifm_shape", [[1, 3, 4, 3], [1, 4, 5, 2]]) @pytest.mark.parametrize( From 77d9574d013dbbb7e24a9b49abf62001a8a6dc09 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 10 Jan 2023 11:47:49 -0600 Subject: [PATCH 150/286] [Arith] Use ConstIntBound to remove negative numerator when lowering (#13724) * [Arith] Use ConstIntBound to remove negative numerator when lowering Negative numerators to modulo/remainder operations are not supported by the Vulkan API. While the SPIR-V instructions [`OpSRem`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSRem) and [`OpSMod`](https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSMod) have identical semantics to `tir::Mod` and `tir::FloorMod`, respectively, use of either instruction within Vulkan results in undefined behavior. From the [Vulkan spec](https://registry.khronos.org/vulkan/specs/1.3/html/chap37.html#spirvenv-op-prec): > For the OpSRem and OpSMod instructions, if either operand is > negative the result is undefined. > > Note: While the OpSRem and OpSMod instructions are supported by the > Vulkan environment, they require non-negative values and thus do not > enable additional functionality beyond what OpUMod provides. This issue was first noticed in https://github.com/apache/tvm/pull/13530, where use of integer arithmetic resulted in negative numerators. This hadn't caused issues previously, because most use of div/mod use a denominator that is a power of two. In these cases, `tir.LowerIntrin` implements floordiv and floormod using only bitwise operations. When the denominator isn't a power of two, both `tir::FloorDiv` and `tir::FloorMod` are implemented in terms of `tir::Mod`, which triggers the undefined behavior for negative numerators. This commit alters the lowering of FloorDiv/FloorMod to TruncDiv/TruncMod, in cases where the denominator is positive, the numerator is sometimes negative, and the range of the numerator is known. 
In these cases, the FloorDiv/FloorMod is now implemented by offsetting
the numerator such that it is always positive.

* Add check to avoid -INT32_MIN

* Updated to use `tvm::min_value(DataType)`

* Added derivation for floordiv/floormod in terms of truncdiv/truncmod
---
 src/tir/transforms/lower_intrin.cc | 136 ++++++++++++++----
 .../unittest/test_target_codegen_vulkan.py | 42 ++++++
 2 files changed, 153 insertions(+), 25 deletions(-)

diff --git a/src/tir/transforms/lower_intrin.cc b/src/tir/transforms/lower_intrin.cc
index 2555002d29b0..8c850f0dea41 100644
--- a/src/tir/transforms/lower_intrin.cc
+++ b/src/tir/transforms/lower_intrin.cc
@@ -27,6 +27,7 @@
 #include
 #include
+#include
 #include

 #include "../../arith/ir_mutator_with_analyzer.h"
@@ -112,20 +113,63 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // Common path, positive divisor
       if (analyzer_->CanProveGreaterEqual(op->a, 0) || analyzer_->CanProveGreaterEqual(e, 0)) {
         return truncdiv(op->a, op->b);
+      }
+
+      // If the numerator's lower bound is known, express the floordiv
+      // in terms of truncdiv using only positive operands.
+      arith::ConstIntBound const_int_bound = analyzer_->const_int_bound(op->a);
+      if (const_int_bound->min_value != arith::ConstIntBound::kNegInf &&
+          const_int_bound->min_value < 0 &&
+          const_int_bound->min_value > Downcast(tvm::min_value(op->a->dtype))->value) {
+        // The goal is to write floordiv(a,b) in terms of truncdiv, without using
+        // negative operands.
+        //
+        // For any integer c
+        //
+        //     floordiv(a,b) == floordiv(a + b*c - b*c, b)
+        //                   == floordiv(a + b*c, b) - c
+        //
+        // Choosing `c = ceildiv(-a_min, b)`. This can be rewritten in terms of
+        // truncdiv as follows.
+        //
+        //     c == ceildiv(-a_min,b)
+        //       == floordiv(-a_min + (b-1), b)
+        //       == truncdiv(-a_min + (b-1), b)
+        //
+        // When substituted into `a + b*c`, this results in a positive argument.
+        //
+        //     a + b*c
+        //        == a + b*ceildiv(-a_min,b)
+        //        == a - b*floordiv(a_min,b)
+        //        >= a - b*floordiv(a,b)
+        //        == floormod(a, b)
+        //        >= 0
+        //
+        // Since the argument is positive, this allows floordiv to be written as
+        // follows.
+        //
+        //     floordiv(a,b)
+        //        == floordiv(a + b*c, b) - c
+        //        == truncdiv(a + b*c, b) - c
+        IntImm min(op->a->dtype, const_int_bound->min_value);
+        PrimExpr ceildiv = truncdiv((op->b - 1) - min, op->b);
+        PrimExpr offset_numerator = analyzer_->Simplify(op->a + op->b * ceildiv);
+        return truncdiv(offset_numerator, op->b) - ceildiv;
+      }
+
+      DLOG(INFO) << "LowerFloorDiv: Cannot decide the sign of dividend";
+      PrimExpr rdiv = truncdiv(op->a, op->b);
+      PrimExpr rmod = truncmod(op->a, op->b);
+      // condition on b >= 0.
+      // truncmod(a, b) < 0 will imply ceildiv,
+      // So we need to correct these cases.
+      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+        // equivalent to rdiv + (rmod >= 0 ? 0: -1);
+        return rdiv + (rmod >> make_const(dtype, dtype.bits() - 1));
       } else {
-        DLOG(INFO) << "LowerFloorDiv: Cannot decide the sign of divident";
-        PrimExpr rdiv = truncdiv(op->a, op->b);
-        PrimExpr rmod = truncmod(op->a, op->b);
-        // condition on b >= 0.
-        // truncmod(a, b) < 0 will implies ceildiv,
-        // So we need to correct these cases.
-        if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
-          // equivalent to rdiv + (rmod >= 0 ? 0: -1);
-          return rdiv + (rmod >> make_const(dtype, dtype.bits() - 1));
-        } else {
-          return tir::Select(rmod >= 0, rdiv, rdiv - make_const(dtype, 1));
-        }
+        return tir::Select(rmod >= 0, rdiv, rdiv - make_const(dtype, 1));
       }
+
     } else {
       if (dtype.is_float()) {
         // floor(a / b)
@@ -165,21 +209,63 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // Common pass, positive divisor
       if (analyzer_->CanProveGreaterEqual(op->a, 0)) {
         return truncmod(op->a, op->b);
+      }
+
+      // If the numerator's lower bound is known, express the floormod
+      // in terms of truncmod using only positive operands.
+      arith::ConstIntBound const_int_bound = analyzer_->const_int_bound(op->a);
+      if (const_int_bound->min_value != arith::ConstIntBound::kNegInf &&
+          const_int_bound->min_value < 0 &&
+          const_int_bound->min_value > Downcast(tvm::min_value(op->a->dtype))->value) {
+        // The goal is to write floormod(a,b) in terms of truncdiv and truncmod,
+        // without using negative operands.
+        //
+        // For any integer c
+        //
+        //     floormod(a, b) == floormod(a + b*c, b)
+        //
+        // Choosing `c = ceildiv(-a_min, b)`. This can be rewritten in terms of
+        // truncdiv as follows.
+        //
+        //     c == ceildiv(-a_min,b)
+        //       == floordiv(-a_min + (b-1), b)
+        //       == truncdiv(-a_min + (b-1), b)
+        //
+        // When substituted into `a + b*c`, this results in a positive argument.
+        //
+        //     a + b*c
+        //        == a + b*ceildiv(-a_min,b)
+        //        == a - b*floordiv(a_min,b)
+        //        >= a - b*floordiv(a,b)
+        //        == floormod(a, b)
+        //        >= 0
+        //
+        // Since the argument is positive, this allows floormod to be written as
+        // follows.
+        //
+        //     floormod(a,b)
+        //        == floormod(a + b*c, b)
+        //        == truncmod(a + b*c, b)
+        IntImm min(op->a->dtype, const_int_bound->min_value);
+        PrimExpr ceildiv = truncdiv(-min + (op->b - 1), op->b);
+        PrimExpr offset_numerator = analyzer_->Simplify(op->a + op->b * ceildiv);
+        return truncmod(offset_numerator, op->b);
+      }
+
+      DLOG(INFO) << "LowerFloorMod: Cannot decide the sign of dividend";
+      // NOTE: condition on b >= 0.
+      // mod(a, b) < 0 will imply we are doing ceildiv,
+      // So we need to correct these cases.
+      PrimExpr rmod = truncmod(op->a, op->b);
+      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+        // (rmod >> shift) & b
+        // -> (rmod >= 0 ? 0: -1) & b
+        // -> rmod >= 0 ? 0 : b
+        return rmod + (op->b & (rmod >> make_const(dtype, dtype.bits() - 1)));
       } else {
-        DLOG(INFO) << "LowerFloorMod: Cannot decide the sign of divident";
-        // NOTE:condition on b >= 0.
-        // mod(a, b) < 0 will imply we are doing ceildiv,
-        // So we need to correct these cases.
-        PrimExpr rmod = truncmod(op->a, op->b);
-        if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
-          // (rmod >> shift) & b
-          // -> (rmod >= 0 ? 0: -1) & b
-          // -> rmod >= 0 ? 0 : b
-          return rmod + (op->b & (rmod >> make_const(dtype, dtype.bits() - 1)));
-        } else {
-          return tir::Select(rmod >= 0, rmod, rmod + op->b);
-        }
+        return tir::Select(rmod >= 0, rmod, rmod + op->b);
       }
+
     } else {
       if (dtype.is_float()) {
         // a - floor(a / b) * b
diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py
index 76cad250e053..7b71f4d4ab17 100644
--- a/tests/python/unittest/test_target_codegen_vulkan.py
+++ b/tests/python/unittest/test_target_codegen_vulkan.py
@@ -28,6 +28,7 @@
 import tvm.testing
 from tvm import relay, te
 from tvm.topi.math import cast
+from tvm.script import tir as T

 dtype = tvm.testing.parameter("float32", "int32", "float16", "int8")
@@ -558,5 +559,46 @@ def do_compute(ins, outs):
         tvm.build(s, [Out], target)


+def test_negative_operand_divmod(target, dev):
+    """Test handling of negative offsets to floormod/floordiv
+
+    Even though the SPIR-V spec states that OpSRem and OpSMod can give
+    the signed modulo, the Vulkan spec states that any use of negative
+    operands is undefined behavior. This test starts with negative
+    operands to floordiv, validating that they are simplified into the
+    corresponding positive operands, such that the final TIR can be
+    expressed using only positive operands.
+
+    SPIR-V: https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpSRem
+    Vulkan: https://registry.khronos.org/vulkan/specs/1.3/html/chap37.html#spirvenv-op-prec
+    """
+
+    N = 32
+    offset = 16
+    divisor = 5
+
+    @T.prim_func
+    def func(A: T.Buffer[(N, 2), "int32"]):
+        for i in T.serial(N):
+            with T.block("A"):
+                v_i = T.axis.spatial(N, i)
+                A[v_i, 0] = T.floordiv(v_i - offset, divisor)
+                A[v_i, 1] = T.floormod(v_i - offset, divisor)
+
+    if "gpu" in tvm.target.Target(target).keys:
+        sch = tvm.tir.Schedule(func)
+        sch.bind(sch.get_loops("A")[0], "threadIdx.x")
+        func = sch.mod["main"]
+
+    built = tvm.build(func, target=target)
+
+    a_dev = tvm.nd.empty([N, 2], "int32", dev)
+    built(a_dev)
+    a = a_dev.numpy()
+
+    np.testing.assert_array_equal(a[:, 0], (np.arange(N) - offset) // divisor)
+    np.testing.assert_array_equal(a[:, 1], (np.arange(N) - offset) % divisor)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 00e81e9eafe9b7b7a94f6f683096e1fcf2de6182 Mon Sep 17 00:00:00 2001
From: Zihao Ye
Date: Wed, 11 Jan 2023 02:17:24 +0800
Subject: [PATCH 151/286] [Profiler] Allow user to flush L2 cache in
 `time_evaluator` function for profiling CUDA kernels (#13726)

Currently, our default profiler (time_evaluator) does not flush the L2
cache between executions, which can lead to incorrect time measurements:
the input data from the last run may still reside in the L2 cache and
reduce the data fetching time of the next run. Both Triton and nvbench
account for this effect and thus report more accurate measurements.

Solution: time_evaluator has an argument f_preproc through which the user
can specify a pre-processing function to run before each execution of the
kernel being evaluated. Currently, TVM supports
cache_flush_cpu_non_first_arg, which flushes the CPU cache, but similar
functionality for GPUs is missing. This PR borrows the design of nvbench's
l2flush struct and allows the user to specify "l2_cache_flush_cuda" as a
pre-processing function that flushes the NVIDIA GPU's L2 cache.
l2_cache_flush_cuda is not the default, so the behavior of existing
programs is not affected.
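With this in place, a CUDA kernel can be timed with a cold L2 cache on
every measurement. A minimal usage sketch (assuming `mod` is a built CUDA
module and `args` are device arrays prepared by the caller):

    dev = tvm.cuda(0)
    timer = mod.time_evaluator(
        mod.entry_name, dev, number=1, repeat=100, f_preproc="l2_cache_flush_cuda"
    )
    print(timer(*args).mean)  # mean latency, with the L2 cache flushed per measurement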
--- 3rdparty/nvbench/l2_cache_flush.h | 74 ++++++ LICENSE | 2 +- licenses/LICENSE.l2_cache_flush.txt | 218 ++++++++++++++++++ src/runtime/cuda/l2_cache_flush.cc | 42 ++++ .../unittest/test_evaluator_with_preproc.py | 60 +++++ 5 files changed, 395 insertions(+), 1 deletion(-) create mode 100644 3rdparty/nvbench/l2_cache_flush.h create mode 100644 licenses/LICENSE.l2_cache_flush.txt create mode 100644 src/runtime/cuda/l2_cache_flush.cc create mode 100644 tests/python/unittest/test_evaluator_with_preproc.py diff --git a/3rdparty/nvbench/l2_cache_flush.h b/3rdparty/nvbench/l2_cache_flush.h new file mode 100644 index 000000000000..3d0211564535 --- /dev/null +++ b/3rdparty/nvbench/l2_cache_flush.h @@ -0,0 +1,74 @@ +/* + * Copyright 2021 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * \file l2_cache_flush.h + * \brief Functions to flush L2 cache using CUDA's API, adopted from nvbench. + */ +#ifndef L2_CACHE_FLUSH_H_ +#define L2_CACHE_FLUSH_H_ + +#include +#include +#include + +namespace tvm { +namespace runtime { + +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ + } + +class L2Flush { + public: + L2Flush() : initialized_(false), l2_size_(0), l2_buffer_(nullptr) {} + + ~L2Flush() { + if (l2_size_ > 0) { + CUDA_CALL(cudaFree(l2_buffer_)); + } + } + + void Flush(cudaStream_t stream) { + if (!initialized_) { + // initialize l2_buffer_ and l2_size_ + initialized_ = true; + int device_id; + CUDA_CALL(cudaGetDevice(&device_id)); + CUDA_CALL(cudaDeviceGetAttribute(&l2_size_, cudaDevAttrL2CacheSize, device_id)); + if (l2_size_ > 0) { + CUDA_CALL(cudaMalloc(reinterpret_cast(&l2_buffer_), l2_size_)); + } + } + if (l2_size_ > 0) { + CUDA_CALL(cudaMemsetAsync(l2_buffer_, 0, l2_size_, stream)); + } + } + + static L2Flush* ThreadLocal(); + + private: + bool initialized_ = false; + int l2_size_; + int* l2_buffer_; +}; + +} // namespace runtime +} // namespace tvm + +#endif // L2_CACHE_FLUSH_H_ diff --git a/LICENSE b/LICENSE index 6524d530deca..fbc11be2deb5 100644 --- a/LICENSE +++ b/LICENSE @@ -212,6 +212,7 @@ Apache Software Foundation License 2.0 3rdparty/dlpack 3rdparty/dmlc-core 3rdparty/OpenCL-Headers +3rdparty/nvbench (with LLVM exception) BSD 2-clause License @@ -234,7 +235,6 @@ MIT License 3rdparty/cma 3rdparty/compiler-rt/builtin_fp16.h - The Unlicense ------------- diff --git a/licenses/LICENSE.l2_cache_flush.txt b/licenses/LICENSE.l2_cache_flush.txt new file mode 100644 index 000000000000..bd8b243dfa02 --- /dev/null +++ b/licenses/LICENSE.l2_cache_flush.txt @@ -0,0 +1,218 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. diff --git a/src/runtime/cuda/l2_cache_flush.cc b/src/runtime/cuda/l2_cache_flush.cc new file mode 100644 index 000000000000..6b2c4665301c --- /dev/null +++ b/src/runtime/cuda/l2_cache_flush.cc @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../../../3rdparty/nvbench/l2_cache_flush.h" + +#include +#include +#include + +#include "cuda_common.h" + +namespace tvm { + +namespace runtime { + +typedef dmlc::ThreadLocalStore L2FlushStore; + +L2Flush* L2Flush::ThreadLocal() { return L2FlushStore::Get(); } + +TVM_REGISTER_GLOBAL("l2_cache_flush_cuda").set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK(L2Flush::ThreadLocal() != nullptr) << "L2Flush::ThreadLocal do not exist."; + cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream; + L2Flush::ThreadLocal()->Flush(stream); +}); + +} // namespace runtime +} // namespace tvm diff --git a/tests/python/unittest/test_evaluator_with_preproc.py b/tests/python/unittest/test_evaluator_with_preproc.py new file mode 100644 index 000000000000..fc6eec25b8da --- /dev/null +++ b/tests/python/unittest/test_evaluator_with_preproc.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import te
+from tvm.script import tir as T
+import tvm.testing
+import numpy as np
+import pytest
+
+
+@T.prim_func
+def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [128, 128])
+    B = T.match_buffer(b, [128, 128])
+    C = T.match_buffer(c, [128, 128])
+    for i, j, k in T.grid(128, 128, 128):
+        with T.block("matmul"):
+            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+            with T.init():
+                C[vi, vj] = 0.0
+            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_cuda
+@pytest.mark.parametrize("f_preproc", ["", "l2_cache_flush_cuda"])
+def test_time_evaluator_with_preproc(f_preproc: str):
+    mod = tvm.IRModule.from_expr(matmul)
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("matmul")
+    i, j, k = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target="cuda")
+    dev = tvm.cuda(0)
+    evaluator = f.time_evaluator(f.entry_name, dev, repeat=1000, number=1, f_preproc=f_preproc)
+
+    a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+    b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+    c = tvm.nd.array(np.zeros((128, 128)).astype("float32"), device=dev)
+    args = [a, b, c]
+    print("Evaluator (f_preproc={}):\t{:.5f}ms".format(f_preproc, evaluator(*args).mean * 1000))
+
+
+if __name__ == "__main__":
+    test_time_evaluator_with_preproc("l2_cache_flush_cuda")

From 8c7bbdd949e3e0d9c88c5e7b5f3e46cc23e29960 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Tue, 10 Jan 2023 16:09:26 -0600
Subject: [PATCH 152/286] [CMake][OpenCL] Remove warning for OpenCL wrapper
 (#13683)

* [CMake][OpenCL] Remove warning for OpenCL wrapper

Previously, setting `set(USE_OPENCL ON)` would result in a warning,
stating that the runtime wrapper for OpenCL would be used. Since this is
the desired behavior when OpenCL support is enabled, and is not something
that a user should fix, this commit removes the warning.

* Added "STATUS" message instead of "WARNING", with more detail

* "installation location" -> "library location"

Co-authored-by: Egor Churaev

Co-authored-by: Egor Churaev
---
 cmake/modules/OpenCL.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index ced2da2d17e3..53199f19cb25 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -43,7 +43,7 @@ if(USE_OPENCL)
     tvm_file_glob(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)

   if(${USE_OPENCL} MATCHES ${IS_TRUE_PATTERN})
-    message(WARNING "Build with OpenCL wrapper")
+    message(STATUS "Enabled runtime search for OpenCL library location")
     file_glob_append(RUNTIME_OPENCL_SRCS
       "src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc"
     )

From 12fe0e9abdf789f2dd6695b95b8d18e46b1512d7 Mon Sep 17 00:00:00 2001
From: Janet Schneider <21978033+janetsc@users.noreply.github.com>
Date: Tue, 10 Jan 2023 16:32:21 -0800
Subject: [PATCH 153/286] [AOT] AOT module post-test error workaround (#13685)

The deletion of the AotModule Python object results in the device
shutting down RPC too early, causing an assertion in the DSP. This is a
temporary workaround while the issue is still being investigated. The
workaround is to keep a reference to the AotModule object in the
HexagonSession object, so that it stays alive until the session has been
shut down.
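The gist of the change, as a minimal sketch (mirroring the diff below):

    # Keep the loaded module referenced on the session object rather than a
    # local, so it is not garbage-collected (and RPC shut down) before the
    # session itself is closed.
    def get_aot_executor(self, module_file):
        self.aot_mod = self.load_module(module_file)  # lives as long as the session
        return tvm.runtime.executor.AotModule(self.aot_mod["default"](self.device))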
--- python/tvm/contrib/hexagon/session.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py index 466103f6e2c9..506f1d968d70 100644 --- a/python/tvm/contrib/hexagon/session.py +++ b/python/tvm/contrib/hexagon/session.py @@ -248,8 +248,9 @@ def get_aot_executor( GraphModule : Runtime graph module that can be used to execute the graph. """ - aot_mod = self.load_module(module_file) - return tvm.runtime.executor.AotModule(aot_mod["default"](self.device)) + # Temporary workaround for https://github.com/apache/tvm/issues/13741 + self.aot_mod = self.load_module(module_file) + return tvm.runtime.executor.AotModule(self.aot_mod["default"](self.device)) def get_graph_debug_executor( self, From a13648f02fcf63f53d909cd9682828a3572e8c8c Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Tue, 10 Jan 2023 17:53:57 -0800 Subject: [PATCH 154/286] [microTVM] tuning on micro targets with meta-schedule (#13514) adds support for tuning microTVM models using meta-schedule. Summary of the changes: adds "c" to the targets supported by meta-schedule implements a builder and runner for micro devices runs a simple tuning job for verification Co-authored-by: Mohamad --- include/tvm/meta_schedule/mutator.h | 2 + include/tvm/meta_schedule/postproc.h | 2 + include/tvm/meta_schedule/schedule_rule.h | 2 + .../meta_schedule/local_builder_micro.py | 84 +++++++ .../micro/meta_schedule/rpc_runner_micro.py | 233 ++++++++++++++++++ python/tvm/meta_schedule/relay_integration.py | 30 ++- python/tvm/relay/backend/executor.py | 13 + python/tvm/rpc/server.py | 30 ++- python/tvm/rpc/tracker.py | 25 +- src/meta_schedule/mutator/mutator.cc | 8 + src/meta_schedule/postproc/postproc.cc | 8 + .../schedule_rule/schedule_rule.cc | 27 ++ .../space_generator/space_generator.cc | 7 + src/meta_schedule/utils.h | 2 + src/target/source/codegen_c_host.cc | 1 + tests/micro/zephyr/test_ms_tuning.py | 171 +++++++++++++ tests/python/unittest/test_micro_ms_tuning.py | 126 ++++++++++ 17 files changed, 760 insertions(+), 11 deletions(-) create mode 100644 python/tvm/contrib/micro/meta_schedule/local_builder_micro.py create mode 100644 python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py create mode 100644 tests/micro/zephyr/test_ms_tuning.py create mode 100644 tests/python/unittest/test_micro_ms_tuning.py diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index 4095d6ca0397..498b2797ada5 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -139,6 +139,8 @@ class Mutator : public runtime::ObjectRef { TVM_DLL static Map DefaultCUDATensorCore(); /*! \brief Create default mutators for Hexagon */ TVM_DLL static Map DefaultHexagon(); + /*! \brief Create default mutators for Micro */ + TVM_DLL static Map DefaultMicro(); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Mutator, ObjectRef, MutatorNode); }; diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 24cfd4cb2167..06fa086c4bca 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -171,6 +171,8 @@ class Postproc : public runtime::ObjectRef { TVM_DLL static Array DefaultCUDATensorCore(); /*! \brief Create default postprocessors for Hexagon */ TVM_DLL static Array DefaultHexagon(); + /*! 
\brief Create default postprocessors for Micro */ + TVM_DLL static Array DefaultMicro(); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Postproc, ObjectRef, PostprocNode); }; diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 879dd076a8b5..16202e18bf95 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -298,6 +298,8 @@ class ScheduleRule : public runtime::ObjectRef { TVM_DLL static Array DefaultCUDATensorCore(); /*! \brief Create default schedule rules for Hexagon */ TVM_DLL static Array DefaultHexagon(); + /*! \brief Create default schedule rules for Micro */ + TVM_DLL static Array DefaultMicro(); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ScheduleRule, ObjectRef, ScheduleRuleNode); }; diff --git a/python/tvm/contrib/micro/meta_schedule/local_builder_micro.py b/python/tvm/contrib/micro/meta_schedule/local_builder_micro.py new file mode 100644 index 000000000000..20e0c45836ee --- /dev/null +++ b/python/tvm/contrib/micro/meta_schedule/local_builder_micro.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Local builder for microTVM projects that compile on the local host""" + +import os +import tempfile +from typing import Optional, Dict +from tvm.ir import IRModule +from tvm.runtime import NDArray +from tvm.target import Target +from tvm.meta_schedule.builder import LocalBuilder +from tvm.driver.build_module import OperatorModule +from tvm import micro +from tvm.contrib.tar import tar +from tvm.relay.backend import Runtime +from tvm.driver import build as tvm_build +from tvm.tir.transform import RemoveWeightLayoutRewriteBlock + + +def get_local_builder_micro(): + """Return micro-compatible Builder for meta schedule.""" + + def _micro_build( + mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]] + ) -> OperatorModule: + """Build function for micro targets. + + Parameters + ---------- + mod : IRModule + The IRModule to be built. + target : Target + The target to be built. + _params : Optional[Dict[str, NDArray]] + The parameters to be used for the build. Must be None. + + Returns + ------- + rt_mod : OperatorModule + The built Module. + """ + + # Note: tvm_build assigns "global_symbol" to the name of generated C function + # changing it is necessary for micro targets, + # since the generated projects already include a main function. + prim_func = mod["main"].with_attr("global_symbol", "default_function") + mod = IRModule({"main": prim_func}) + runtime = Runtime("crt", {"system-lib": True}) + mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod) + rt_mod = tvm_build(mod, target=target, runtime=runtime) + return rt_mod + + def _micro_export(mod: OperatorModule) -> str: + """Export function for micro targets. 
+ + Parameters + ---------- + mod : OperatorModule + The Module to be exported. + + Returns + ------- + artifact_path : str + The path to the exported Module. + """ + artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format) + micro.export_model_library_format(mod, artifact_path) + return artifact_path + + return LocalBuilder(f_build=_micro_build, f_export=_micro_export) diff --git a/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py b/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py new file mode 100644 index 000000000000..e4c08351841d --- /dev/null +++ b/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py @@ -0,0 +1,233 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""RPC Runner Micro""" + +from contextlib import contextmanager +from typing import Callable, List, Optional +from collections import namedtuple +import signal + +from tvm import micro +from tvm import nd +from tvm.contrib.popen_pool import PopenPoolExecutor +from tvm.rpc.server import Server +from tvm.rpc.tracker import Tracker +from tvm.meta_schedule.logging import get_logger +from tvm.meta_schedule.utils import cpu_count, derived_object +from tvm.meta_schedule.runner.config import EvaluatorConfig, RPCConfig +from tvm.meta_schedule.runner import PyRunner, RunnerFuture, RunnerInput +from tvm.meta_schedule.runner.rpc_runner import RPCRunnerFuture +from tvm.meta_schedule.runner.utils import T_ARG_INFO_JSON_OBJ_LIST + +logger = get_logger(__name__) # pylint: disable=invalid-name + + +@derived_object +class RPCRunnerMicro(PyRunner): + """RPC based runner for tuning micro models.""" + + def __init__( + self, + platform: str = "crt", + project_options: Optional[dict] = None, + rpc_config: Optional[RPCConfig] = None, + evaluator_config: Optional[EvaluatorConfig] = None, + max_workers: Optional[int] = None, + initializer: Optional[Callable[[], None]] = None, + ) -> None: + """Constructor + + Parameters + ---------- + platform: str + The platform used for project generation. + project_options: dict + The options for the generated micro project. + rpc_config: RPCConfig + The rpc configuration. + evaluator_config: EvaluatorConfig + The evaluator configuration. + max_workers: Optional[int] = None + The maximum number of connections. Defaults to number of logical CPU cores. + initializer: Optional[Callable[[], None]] + The initializer function. 
+ """ + super().__init__() + self.platform = platform + if project_options is None: + project_options = {} + self.project_options = project_options + self.rpc_config = RPCConfig._normalized(rpc_config) + self.evaluator_config = EvaluatorConfig._normalized(evaluator_config) + + if max_workers is None: + max_workers = cpu_count(logical=True) + logger.info("RPCRunner: max_workers = %d", max_workers) + self.pool = PopenPoolExecutor( + max_workers=max_workers, + timeout=rpc_config.session_timeout_sec, + initializer=initializer, + ) + + def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: + results: List[RunnerFuture] = [] + + for runner_input in runner_inputs: + future = RPCRunnerFuture( + future=self.pool.submit( + _worker_func, + self.platform, + self.project_options or {}, + self.rpc_config, + self.evaluator_config, + str(runner_input.artifact_path), + str(runner_input.device_type), + tuple(arg_info.as_json() for arg_info in runner_input.args_info), + ), + timeout_sec=self.rpc_config.session_timeout_sec, + ) + results.append(future) # type: ignore + return results + + +def _worker_func( + platform: str, + project_options: dict, + rpc_config: RPCConfig, + evaluator_config: EvaluatorConfig, + artifact_path: str, + device_type: str, + args_info: T_ARG_INFO_JSON_OBJ_LIST, +) -> List[float]: + + module_loader = micro.AutoTvmModuleLoader( + template_project_dir=micro.get_microtvm_template_projects(platform), + project_options=project_options, + ) + + remote_kw = { + "device_key": rpc_config.tracker_key, + "host": rpc_config.tracker_host, + "port": rpc_config.tracker_port, + "priority": 0, + "timeout": 100, + } + build_result = namedtuple("BuildResult", ["filename"])(artifact_path) + + with module_loader(remote_kw, build_result) as (remote, mod): + dev = remote.device(device_type, 0) + f_prepare = "" + if evaluator_config.enable_cpu_cache_flush: + f_prepare = "cache_flush_cpu_non_first_arg" + time_f = mod.time_evaluator( + mod.entry_name, + dev, + number=evaluator_config.number, + repeat=evaluator_config.repeat, + min_repeat_ms=evaluator_config.min_repeat_ms, + f_preproc=f_prepare, + ) + + random_fill = remote.get_function("tvm.contrib.random.random_fill") + args = [nd.empty(x[2], x[1], dev) for x in args_info] + for arg in args: + random_fill(arg) + dev.sync() + + costs = time_f(*args).results + return costs + + +@contextmanager +def get_rpc_runner_micro( + platform, + options, + rpc_config: RPCConfig = None, + evaluator_config: EvaluatorConfig = None, + session_timeout_sec=300, +): + """Parameters + ---------- + platform: str + The platform used for project generation. + project_options: dict + The options for the generated micro project. + rpc_config: RPCConfig + The rpc configuration. + evaluator_config: EvaluatorConfig + The evaluator configuration. + session_timeout_sec: int + The session timeout. if the number of candidates sent to runner is larger + than the runner workers, increase the timeout. 
+ """ + if rpc_config is None: + tracker_host = "127.0.0.1" + tracker_port = 9000 + tracker_key = "$local$device$%d" % tracker_port + rpc_config = RPCConfig( + tracker_host=tracker_host, + tracker_port=tracker_port, + tracker_key=tracker_key, + session_priority=0, + session_timeout_sec=session_timeout_sec, + ) + tracker_port_end = rpc_config.tracker_port + 1000 + + if evaluator_config is None: + evaluator_config = EvaluatorConfig( + number=3, + repeat=1, + min_repeat_ms=100, + enable_cpu_cache_flush=False, + ) + + tracker = Tracker( + port=rpc_config.tracker_port, + port_end=tracker_port_end, + silent=True, + reuse_addr=True, + timeout=60, + ) + server = Server( + port=rpc_config.tracker_port, + port_end=tracker_port_end, + key=rpc_config.tracker_key, + silent=True, + tracker_addr=(rpc_config.tracker_host, rpc_config.tracker_port), + reuse_addr=True, + timeout=60, + ) + + def terminate(): + tracker.terminate() + server.terminate() + + def handle_SIGINT(signal, frame): + terminate() + raise KeyboardInterrupt("Received SIGINT") + + signal.signal(signal.SIGINT, handle_SIGINT) + + try: + yield RPCRunnerMicro( + platform=platform, + project_options=options, + rpc_config=rpc_config, + evaluator_config=evaluator_config, + ) + finally: + terminate() diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index 876dba106c38..fbdf68d09767 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -73,12 +73,14 @@ def _normalize_params( params: Optional[Dict[str, NDArray]], pass_config: Mapping[str, Any], executor: Optional["relay.backend.Executor"], + runtime: Optional["relay.backend.Runtime"], ) -> Tuple[ IRModule, Target, Dict[str, NDArray], Dict[str, Any], Optional["relay.backend.Executor"], + Optional["relay.backend.Runtime"], ]: from tvm import relay # pylint: disable=import-outside-toplevel @@ -97,13 +99,16 @@ def _normalize_params( if executor is None: executor = relay.backend.Executor("graph") + if runtime is None: + runtime = relay.backend.Runtime("cpp") + if mod.get_attr("executor") is None: mod = mod.with_attr("executor", executor) else: executor = mod.get_attr("executor") pass_config = dict(pass_config) - return mod, target, relay_params, pass_config, executor + return mod, target, relay_params, pass_config, executor, runtime def extract_tasks( @@ -119,6 +124,7 @@ def extract_tasks( } ), executor: Optional["relay.backend.Executor"] = None, + runtime: Optional["relay.backend.Runtime"] = None, module_equality: str = "structural", disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, ) -> List[ExtractedTask]: @@ -138,6 +144,8 @@ def extract_tasks( The pass configuration executor : Optional[relay.backend.Executor] The executor to use + runtime : Optional[relay.backend.Runtime] + The runtime to use module_equality : Optional[str] A string to specify the module equality testing and hashing method. 
It must be one of the followings: @@ -160,8 +168,13 @@ def extract_tasks( from tvm import autotvm # pylint: enable=import-outside-toplevel - mod, target, params, pass_config, _ = _normalize_params( - mod, target, params, pass_config, executor + mod, target, params, pass_config, _ex, _rt = _normalize_params( + mod, + target, + params, + pass_config, + executor, + runtime, ) if target.kind.name != "cuda" and isinstance( autotvm.DispatchContext.current, autotvm.FallbackContext @@ -355,6 +368,7 @@ def compile_relay( ), executor: Optional["relay.backend.Executor"] = None, disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, + runtime: Optional["relay.backend.Runtime"] = None, ): """Compile a relay program with a MetaSchedule database. @@ -380,6 +394,8 @@ def compile_relay( The executor to use in relay.build. It is not supported by RelayVM. disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]] The list of disabled passes + runtime : Optional[relay.backend.Runtime] + The runtime to use in relay.build. It is not supported by RelayVM. Returns ------- @@ -390,8 +406,8 @@ def compile_relay( from tvm import relay # pylint: enable=import-outside-toplevel - mod, target, params, pass_config, executor = _normalize_params( - mod, target, params, pass_config, executor + mod, target, params, pass_config, executor, runtime = _normalize_params( + mod, target, params, pass_config, executor, runtime ) pass_config.setdefault("relay.backend.use_meta_schedule_dispatch", True) with Profiler.timeit("PostTuningCompilation"): @@ -402,7 +418,9 @@ def compile_relay( disabled_pass=disabled_pass, ): if backend == "graph": - return relay.build(mod, target=target, params=params, executor=executor) + return relay.build( + mod, target=target, params=params, executor=executor, runtime=runtime + ) elif backend == "vm": return relay.vm.compile(mod, target=target, params=params) else: diff --git a/python/tvm/relay/backend/executor.py b/python/tvm/relay/backend/executor.py index ac5e5bf1f829..854473f662c0 100644 --- a/python/tvm/relay/backend/executor.py +++ b/python/tvm/relay/backend/executor.py @@ -33,15 +33,28 @@ def __init__(self, name, options=None) -> None: if options is None: options = {} self.__init_handle_by_constructor__(_backend.CreateExecutor, name, options) + self._init_wrapper() + + # Note: sometimes the _attrs field is not properly populated, + # most likely since __new__ is called instead of __init__ in tvm/_ffi/_ctypes/object.py + def _init_wrapper(self): self._attrs = _backend.GetExecutorAttrs(self) + self._init_wrapper_called = True + + def _check_init_wrapper(self): + if not (hasattr(self, "_init_wrapper_called") and self._init_wrapper_called): + self._init_wrapper() def __contains__(self, name): + self._check_init_wrapper() return name in self._attrs def __getitem__(self, name): + self._check_init_wrapper() return self._attrs[name] def __eq__(self, other): + self._check_init_wrapper() return str(other) == str(self) and dict(other._attrs) == dict(self._attrs) @staticmethod diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index a144356f2e6d..2581ebcdc9a2 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -319,6 +319,8 @@ def __init__( load_library=None, custom_addr=None, silent=False, + reuse_addr=True, + timeout=None, ): # start update @@ -332,6 +334,10 @@ def __init__( if not is_proxy: sock = socket.socket(base.get_addr_family((host, port)), socket.SOCK_STREAM) + if reuse_addr: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if 
timeout is not None: + sock.settimeout(timeout) self.port = None for my_port in range(port, port_end): try: @@ -371,6 +377,8 @@ def _popen_start_rpc_server( silent=False, no_fork=False, server_init_callback=None, + reuse_addr=True, + timeout=None, ): if no_fork: multiprocessing.set_start_method("spawn") @@ -382,7 +390,17 @@ def _popen_start_rpc_server( # Popen worker to run on a separate process. # Create and start the server in a different thread state = PopenRPCServerState( - host, port, port_end, is_proxy, tracker_addr, key, load_library, custom_addr, silent + host, + port, + port_end, + is_proxy, + tracker_addr, + key, + load_library, + custom_addr, + silent, + reuse_addr, + timeout, ) PopenRPCServerState.current = state # returns the port so that the main can get the port number. @@ -434,6 +452,12 @@ class Server(object): server_init_callback: Callable, optional Additional initialization function when starting the server. + reuse_addr: bool, optional + Allows the kernel to reuse a local socket in TIME_WAIT state. + + timeout: float, optional + set a timeout for all operations on the socket + Note ---- The RPC server only sees functions in the tvm namespace. @@ -464,6 +488,8 @@ def __init__( silent=False, no_fork=False, server_init_callback=None, + reuse_addr=True, + timeout=None, ): try: if _ffi_api.ServerLoop is None: @@ -486,6 +512,8 @@ def __init__( silent, no_fork, server_init_callback, + reuse_addr, + timeout, ], ) # receive the port diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index e65ed4a012f0..ab33d20daee1 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -387,11 +387,15 @@ class PopenTrackerServerState(object): current = None - def __init__(self, host, port=9190, port_end=9199, silent=False): + def __init__(self, host, port=9190, port_end=9199, silent=False, reuse_addr=True, timeout=None): if silent: logger.setLevel(logging.WARN) sock = socket.socket(base.get_addr_family((host, port)), socket.SOCK_STREAM) + if reuse_addr: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if timeout is not None: + sock.settimeout(timeout) self.port = None self.stop_key = base.random_key("tracker") for my_port in range(port, port_end): @@ -412,11 +416,13 @@ def __init__(self, host, port=9190, port_end=9199, silent=False): self.host = host -def _popen_start_tracker_server(host, port=9190, port_end=9199, silent=False): +def _popen_start_tracker_server( + host, port=9190, port_end=9199, silent=False, reuse_addr=True, timeout=None +): # This is a function that will be sent to the # Popen worker to run on a separate process. # Create and start the server in a different thread - state = PopenTrackerServerState(host, port, port_end, silent) + state = PopenTrackerServerState(host, port, port_end, silent, reuse_addr, timeout) PopenTrackerServerState.current = state # returns the port so that the main can get the port number. return (state.port, state.stop_key) @@ -440,9 +446,18 @@ class Tracker(object): silent: bool, optional Whether run in silent mode + + reuse_addr: bool, optional + Allows the kernel to reuse a local socket in TIME_WAIT state. 
+ + timeout: float, optional + set a timeout for all operations on the socket + """ - def __init__(self, host="0.0.0.0", port=9190, port_end=9199, silent=False): + def __init__( + self, host="0.0.0.0", port=9190, port_end=9199, silent=False, reuse_addr=True, timeout=None + ): if silent: logger.setLevel(logging.WARN) self.proc = PopenWorker() @@ -454,6 +469,8 @@ def __init__(self, host="0.0.0.0", port=9190, port_end=9199, silent=False): port, port_end, silent, + reuse_addr, + timeout, ], ) # receive the port diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc index 8f3d14b6c466..3cf43e11260e 100644 --- a/src/meta_schedule/mutator/mutator.cc +++ b/src/meta_schedule/mutator/mutator.cc @@ -78,6 +78,13 @@ Map Mutator::DefaultHexagon() { {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}}; } +Map Mutator::DefaultMicro() { + return Map{ + {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)}, + {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)}, + {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)}}; +} + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& n, ReprPrinter* p) { const auto* self = n.as(); @@ -104,6 +111,7 @@ TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultCUDA").set_body_typed(Mutator:: TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultCUDATensorCore") .set_body_typed(Mutator::DefaultCUDATensorCore); TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultHexagon").set_body_typed(Mutator::DefaultHexagon); +TVM_REGISTER_GLOBAL("meta_schedule.MutatorDefaultMicro").set_body_typed(Mutator::DefaultMicro); } // namespace meta_schedule } // namespace tvm diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc index dba523d094bf..7730e4372fa9 100644 --- a/src/meta_schedule/postproc/postproc.cc +++ b/src/meta_schedule/postproc/postproc.cc @@ -100,6 +100,14 @@ Array Postproc::DefaultHexagon() { }; } +Array Postproc::DefaultMicro() { + return Array{ + Postproc::DisallowDynamicLoop(), + Postproc::RewriteParallelVectorizeUnroll(), + Postproc::RewriteReductionBlock(), + }; +} + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& n, ReprPrinter* p) { const auto* self = n.as(); diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index e4f97c1fa673..113703272031 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -251,6 +251,31 @@ Array ScheduleRule::DefaultHexagon() { }; } +Array ScheduleRule::DefaultMicro() { + return { + ScheduleRule::ApplyCustomRule(), + ScheduleRule::InlineConstantScalars(), + ScheduleRule::AutoInline( + /*into_producer=*/false, + /*into_consumer=*/true, + /*inline_const_tensor=*/true, + /*disallow_if_then_else=*/true, + /*require_injective=*/true, + /*require_ordered=*/true, + /*disallow_op=*/Array{"tir.exp"}), + ScheduleRule::MultiLevelTiling( + /*structure=*/"SSRSRS", + /*tile_binds=*/NullOpt, + /*max_innermost_factor=*/Integer(64), + /*vector_load_lens=*/NullOpt, + /*reuse_read=*/NullOpt, + /*reuse_write=*/ + Map{{"req", String("may")}, + {"levels", Array{1, 2}}, + {"scope", String("global")}}), + }; +} + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& n, ReprPrinter* p) { const auto* self = n.as(); @@ -279,6 +304,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultCUDATensorCore") 
.set_body_typed(ScheduleRule::DefaultCUDATensorCore); TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultHexagon") .set_body_typed(ScheduleRule::DefaultHexagon); +TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleDefaultMicro") + .set_body_typed(ScheduleRule::DefaultMicro); } // namespace meta_schedule } // namespace tvm diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 2d69727384a7..926f86cc4ff9 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -57,6 +57,9 @@ String GetRuleKindFromTarget(const Target& target) { return "cuda"; } + if (target->kind->name == "c") { + return "c"; + } LOG(FATAL) << "Unsupported target: " << target; throw; } @@ -90,6 +93,10 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { default_sch_rules = ScheduleRule::DefaultVNNI(); default_postprocs = Postproc::DefaultVNNI(); default_mutator_probs = Mutator::DefaultVNNI(); + } else if (kind == "c") { + default_sch_rules = ScheduleRule::DefaultMicro(); + default_postprocs = Postproc::DefaultMicro(); + default_mutator_probs = Mutator::DefaultMicro(); } else { LOG(FATAL) << "Unsupported kind: " << kind; throw; diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index 969aa630df39..6039423844e8 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -525,6 +525,8 @@ inline ScheduleRule GetDefaultAutoInline(const std::string& target_name) { rules = ScheduleRule::DefaultLLVM(); } else if (target_name == "hexagon") { rules = ScheduleRule::DefaultHexagon(); + } else if (target_name == "c") { + rules = ScheduleRule::DefaultMicro(); } else if (IsGPUTarget(target_name)) { rules = ScheduleRule::DefaultCUDA(); } else { diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index 78eb08202dfe..e3f87b0954b0 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -54,6 +54,7 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool emit_fwd_func_d decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; decl_stream << "#include \n"; + decl_stream << "#include \n"; if (devices.find("ethos-u") != devices.end()) { decl_stream << "#include \n"; } diff --git a/tests/micro/zephyr/test_ms_tuning.py b/tests/micro/zephyr/test_ms_tuning.py new file mode 100644 index 000000000000..3ce6ff68bc32 --- /dev/null +++ b/tests/micro/zephyr/test_ms_tuning.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import numpy as np +import pytest +from types import MappingProxyType +import pathlib +import json + +import tvm +from tvm import relay +from tvm.relay.backend import Executor +from tvm.contrib import graph_executor, utils +from tvm import meta_schedule as ms +from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro +from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro + + +def create_relay_module(): + data_shape = (1, 3, 16, 16) + weight_shape = (8, 3, 5, 5) + data = relay.var("data", relay.TensorType(data_shape, "float32")) + weight = relay.var("weight", relay.TensorType(weight_shape, "float32")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + kernel_size=(5, 5), + kernel_layout="OIHW", + out_dtype="float32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + + weight_sample = np.random.rand( + weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3] + ).astype("float32") + params = {mod["main"].params[1].name_hint: weight_sample} + + model_info = { + "in_tensor": "data", + "in_shape": data_shape, + "in_dtype": "float32", + } + + return mod, params, model_info + + +@tvm.testing.requires_micro +@pytest.mark.xfail_on_fvp() +def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_number): + """Test meta-schedule tuning for microTVM Zephyr""" + + mod, params, model_info = create_relay_module() + input_name = model_info["in_tensor"] + input_shape = model_info["in_shape"] + input_dtype = model_info["in_dtype"] + data_sample = np.random.rand(*input_shape).astype(input_dtype) + + platform = "zephyr" + project_options = { + "board": board, + "verbose": microtvm_debug, + "project_type": "host_driven", + "use_fvp": bool(use_fvp), + "serial_number": serial_number, + "config_main_stack_size": 4096, + } + + boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" + with open(boards_file) as f: + boards = json.load(f) + target = tvm.target.target.micro(model=boards[project_options["board"]]["model"]) + + runtime = relay.backend.Runtime("crt", {"system-lib": True}) + executor = Executor("aot", {"link-params": True}) + # This line is necessary for link-params to take effect during + # task extraction and relay.build(...). 
+ mod = mod.with_attr("executor", executor) + + builder = get_local_builder_micro() + with ms.Profiler() as profiler: + with get_rpc_runner_micro( + platform=platform, options=project_options, session_timeout_sec=120 + ) as runner: + + db: ms.Database = ms.relay_integration.tune_relay( + mod=mod, + params=params, + target=target, + builder=builder, + runner=runner, + strategy="evolutionary", + num_trials_per_iter=2, + max_trials_per_task=10, + max_trials_global=100, + work_dir=str(workspace_dir), + module_equality="ignore-ndarray", + ) + + # Build model using meta_schedule logs + opt_mod, opt_params = relay.optimize(mod, target) + ms_mod: tvm.runtime.Module = ms.relay_integration.compile_relay( + database=db, + mod=opt_mod, + target=target, + params=opt_params, + pass_config=MappingProxyType( + { + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "default", + "tir.disable_vectorize": True, + } + ), + executor=executor, + runtime=runtime, + ) + print(profiler.table()) + + project = tvm.micro.generate_project( + str(tvm.micro.get_microtvm_template_projects(platform)), + ms_mod, + str(workspace_dir / "project"), + options=project_options, + ) + project.build() + project.flash() + with tvm.micro.Session(project.transport()) as session: + aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor()) + aot_executor.get_input(0).copyfrom(data_sample) + result = aot_executor.module.time_evaluator("run", session.device, number=3)() + output = aot_executor.get_output(0).numpy() + + # Build reference model (without tuning) + dev = tvm.cpu() + target = tvm.target.target.micro(model="host") + with tvm.transform.PassContext( + opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"] + ): + ref_mod = relay.build( + mod, + target=target, + params=params, + runtime=runtime, + ) + ref_mod.export_library(workspace_dir / "compiled_lib2.so") + mod2: tvm.runtime.Module = tvm.runtime.load_module(workspace_dir / "compiled_lib2.so") + graph_mod = graph_executor.GraphModule(mod2["default"](dev)) + graph_mod.set_input(input_name, data_sample) + graph_mod.run() + ref_output = graph_mod.get_output(0).numpy() + + assert np.allclose(output, ref_output, rtol=1e-4, atol=2e-4), "FAILED" + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/unittest/test_micro_ms_tuning.py b/tests/python/unittest/test_micro_ms_tuning.py new file mode 100644 index 000000000000..81b412fd9c88 --- /dev/null +++ b/tests/python/unittest/test_micro_ms_tuning.py @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
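The CRT-hosted variant that follows uses the same two-phase contract as the Zephyr test above: tune_relay fills a database with measured candidates, then compile_relay replays the best record per task with no further measurement. A condensed sketch of that contract, using the names defined in the tests (trial counts are illustrative):

    from tvm import meta_schedule as ms

    # Phase 1: search. Candidates are built by the local micro builder and
    # timed through the micro RPC runner; records accumulate under work_dir.
    db = ms.relay_integration.tune_relay(
        mod=mod, params=params, target=target,
        builder=builder, runner=runner,
        max_trials_global=100, work_dir=str(work_dir),
    )

    # Phase 2: deterministic rebuild. No device is needed here; the database
    # supplies the winning schedule for each extracted task.
    lib = ms.relay_integration.compile_relay(
        database=db, mod=mod, target=target, params=params,
    )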
+import numpy as np +import pytest +from types import MappingProxyType +import pathlib +import json +from tests.micro.zephyr.test_ms_tuning import create_relay_module +import tvm +from tvm import relay +from tvm.relay.backend import Executor +from tvm.contrib import graph_executor, utils +from tvm import meta_schedule as ms +from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro +from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro + + +def test_micro_tuning_with_meta_schedule(): + platform = "crt" + target = tvm.target.target.micro(model="host") + options = {} + + work_dir = utils.tempdir() + mod, params, model_info = create_relay_module() + input_name = model_info["in_tensor"] + input_shape = model_info["in_shape"] + input_dtype = model_info["in_dtype"] + data_sample = np.random.rand(*input_shape).astype(input_dtype) + + runtime = relay.backend.Runtime("crt", {"system-lib": True}) + executor = Executor("aot", {"link-params": True}) + # This line is necessary for link-params to take effect during + # task extraction and relay.build(...). + mod = mod.with_attr("executor", executor) + + builder = get_local_builder_micro() + + with ms.Profiler() as profiler: + with get_rpc_runner_micro( + platform=platform, options=options, session_timeout_sec=120 + ) as runner: + db: ms.Database = ms.relay_integration.tune_relay( + mod=mod, + params=params, + target=target, + builder=builder, + runner=runner, + strategy="evolutionary", + num_trials_per_iter=2, + max_trials_per_task=10, + max_trials_global=100, + work_dir=str(work_dir), + module_equality="ignore-ndarray", + ) + + # Build model using meta_schedule logs + ms_mod: tvm.runtime.Module = ms.relay_integration.compile_relay( + database=db, + mod=mod, + target=target, + params=params, + pass_config=MappingProxyType( + { + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "default", + "tir.disable_vectorize": True, + } + ), + executor=executor, + runtime=runtime, + ) + print(profiler.table()) + + project = tvm.micro.generate_project( + str(tvm.micro.get_microtvm_template_projects(platform)), + ms_mod, + str(work_dir / "project"), + options=options, + ) + project.build() + project.flash() + with tvm.micro.Session(project.transport()) as session: + aot_executor = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor()) + aot_executor.get_input(0).copyfrom(data_sample) + result = aot_executor.module.time_evaluator("run", session.device, number=3)() + output = aot_executor.get_output(0).numpy() + + # Build reference model (without tuning) + dev = tvm.cpu() + target = tvm.target.target.micro(model="host") + with tvm.transform.PassContext( + opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"] + ): + ref_mod = relay.build( + mod, + target=target, + params=params, + runtime=runtime, + ) + ref_mod.export_library(work_dir / "compiled_lib2.so") + mod2: tvm.runtime.Module = tvm.runtime.load_module(work_dir / "compiled_lib2.so") + graph_mod = graph_executor.GraphModule(mod2["default"](dev)) + graph_mod.set_input(input_name, data_sample) + graph_mod.run() + ref_output = graph_mod.get_output(0).numpy() + + assert np.allclose(output, ref_output, rtol=1e-4, atol=2e-4), "FAILED" + work_dir.remove() + + +if __name__ == "__main__": + tvm.testing.main() From 9edabfec76152448d85d7540cbad5ceeaaf07a5b Mon Sep 17 00:00:00 2001 From: Qianshui Date: Wed, 11 Jan 2023 12:28:22 +0800 Subject: [PATCH 155/286] [Tensorize][TOPI] Add AMX Tensorizing 
for int8 batch matmul (#13745) * amx int8 tensorized x86 bmm * remove the unused amx schedule * fix lint * fix lint * remove unused import * fix Instr. assert in testcase. --- python/tvm/relay/op/strategy/x86.py | 10 ++--- python/tvm/topi/x86/batch_matmul.py | 53 ++++++++++++++++++++------ python/tvm/topi/x86/dense.py | 21 ++-------- tests/python/relay/test_op_level10.py | 55 +++++++++++++++++++++++++++ 4 files changed, 104 insertions(+), 35 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 4585809f63e1..d0ad377203c9 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -23,9 +23,7 @@ from tvm.auto_scheduler import is_auto_scheduler_enabled from tvm.meta_schedule import is_meta_schedule_enabled from tvm.relay.ty import is_dynamic -from tvm.target import Target from tvm.te import SpecializedCondition -from tvm.topi.x86.utils import target_has_vnni from .. import op as _op from .generic import * @@ -618,7 +616,6 @@ def dense_pack_strategy_cpu(attrs, inputs, out_type, target): def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() - mcpu = Target.current().mcpu need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() @@ -626,16 +623,15 @@ def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): if ( not attrs.transpose_a and attrs.transpose_b - and target_has_vnni(mcpu) and inputs[0].dtype == "uint8" and inputs[1].dtype == "int8" and inputs[1].shape[-2] % 16 == 0 and inputs[1].shape[-1] % 4 == 0 ): strategy.add_implementation( - wrap_compute_batch_matmul(topi.x86.batch_matmul_vnni_compute, need_out_dtype=True), - wrap_topi_schedule(topi.x86.schedule_batch_matmul_vnni), - name="batch_matmul_vnni.x86", + wrap_compute_batch_matmul(topi.x86.batch_matmul_int8_compute, need_out_dtype=True), + wrap_topi_schedule(topi.x86.schedule_batch_matmul_int8), + name="batch_matmul_int8.x86", plevel=10, ) elif is_dynamic(out_type) or need_auto_scheduler_layout or need_meta_schedule_layout: diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 025f41660c9c..9f3bc2951524 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,too-many-locals,unused-variable +# pylint: disable=unused-argument """x86 batch_matmul operators""" import tvm from tvm import autotvm, te @@ -24,18 +25,24 @@ from .. 
import generic, nn
 from ..transform import layout_transform
 from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline
-from .dense import dense_vnni_schedule
+from .dense import dense_vnni_schedule, dense_amx_int8_schedule
 from .injective import schedule_injective_from_existing
+from .utils import target_has_vnni, target_has_amx
 
 
 @autotvm.register_topi_compute("batch_matmul_vnni.x86")
-def batch_matmul_vnni_compute(cfg, x, y, *_):
+def batch_matmul_int8_compute(cfg, x, y, *_):
     """Compute for uint8 x int8 -> int32 batch_matmul"""
     batch, m, k = x.shape
     packed_y_layout = "BNK16n4k"
     packed_y = layout_transform(y, "BNK", packed_y_layout)
     _, n_o, _, n_i, _ = packed_y.shape
     ak = te.reduce_axis((0, k), name="k")
+    mcpu = tvm.target.Target.current().mcpu
+    if target_has_vnni(mcpu):
+        attrs_info = {"schedule_rule": "batch_matmul_vnni"}
+    else:
+        attrs_info = None
 
     z = te.compute(
         (batch, m, n_o * n_i),
@@ -46,14 +53,10 @@ def batch_matmul_vnni_compute(cfg, x, y, *_):
             ),
             axis=ak,
         ),
-        tag="batch_matmul_vnni",
-        attrs={"schedule_rule": "batch_matmul_vnni"},
+        tag="batch_matmul_int8",
+        attrs=attrs_info,
     )
 
-    _, a_y, _ = z.op.axis
-    cfg.define_split("tile_y", a_y, num_outputs=2)
-    cfg.define_knob("layout_trans_compute_root", [0, 1])
-
     return z
 
 
@@ -67,6 +70,7 @@ def batch_matmul_vnni_schedule(cfg, s, C, O, layout_trans):
     # Parallelize over batch
     fused = s[O].fuse(O.op.axis[0], fused_inner)
     s[O].parallel(fused)
+    cfg.define_knob("layout_trans_compute_root", [0, 1])
 
     if cfg["layout_trans_compute_root"].val:
         s[layout_trans].compute_root()
@@ -80,6 +84,29 @@ def batch_matmul_vnni_schedule(cfg, s, C, O, layout_trans):
     return s
 
 
+def batch_matmul_amx_schedule(cfg, s, C, O, layout_trans):
+    """Schedule batch_matmul compute using AMX tdpbusd instruction"""
+    # C: The output of batched GEMM
+    # O: The output of the fused op
+
+    # Schedule the GEMM part
+    s, fused_inner = dense_amx_int8_schedule(cfg, s, C, O, do_parallel=False)
+    # Parallelize over the outer loop
+    fused = s[O].fuse(O.op.axis[0], fused_inner)
+    s[O].parallel(fused)
+    cfg.define_knob("layout_trans_compute_root", [0, 1])
+
+    if cfg["layout_trans_compute_root"].val:
+        s[layout_trans].compute_root()
+        schedule_injective_from_existing(s, layout_trans)
+    else:
+        _, _, _, ni, ki = s[layout_trans].op.axis
+        s[layout_trans].vectorize(ki)
+        s[layout_trans].unroll(ni)
+
+    return s
+
+
 @autotvm.register_topi_compute("batch_matmul.x86")
 def batch_matmul(
     cfg, tensor_a, tensor_b, out_shape=None, out_dtype=None, transpose_a=False, transpose_b=True
@@ -202,14 +229,18 @@ def _callback(op):
 
 
 @autotvm.register_topi_schedule("batch_matmul_vnni.x86")
-def schedule_batch_matmul_vnni(cfg, outs):
+def schedule_batch_matmul_int8(cfg, outs):
     """Schedule for batch_matmul_vnni"""
     s = te.create_schedule([x.op for x in outs])
+    mcpu = tvm.target.Target.current().mcpu
 
     def _callback(op):
-        if "batch_matmul_vnni" in op.tag:
+        if "batch_matmul_int8" in op.tag:
             layout_trans = op.input_tensors[1]
-            batch_matmul_vnni_schedule(cfg, s, op.output(0), outs[0], layout_trans)
+            if target_has_amx(mcpu):
+                batch_matmul_amx_schedule(cfg, s, op.output(0), outs[0], layout_trans)
+            elif target_has_vnni(mcpu):
+                batch_matmul_vnni_schedule(cfg, s, op.output(0), outs[0], layout_trans)
 
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
index ada19d598cdf..bb99a632811b 100644
--- a/python/tvm/topi/x86/dense.py
+++ b/python/tvm/topi/x86/dense.py
@@ -436,7 +436,7 @@ def split_k(out, rd_axis):
         cfg.define_split("tile_k",
            rd_axis, num_outputs=5, filter=lambda y: y.size[-1] == 128)
         return cfg["tile_k"].apply(s, out, rd_axis)
 
-    a_x, a_y = C.op.axis
+    a_x, a_y = C.op.axis[-2:]
     (a_k,) = C.op.reduce_axis
     CF = s.cache_write(C, "amx.tmm")
@@ -447,7 +447,7 @@ def split_k(out, rd_axis):
     s[CF].compute_at(s[C], a_yo)
 
     (a_k_f,) = CF.op.reduce_axis
-    a_x_f, a_y_f = CF.op.axis
+    a_x_f, a_y_f = CF.op.axis[-2:]
 
     a_xo_f, a_xi_f = s[CF].split(a_x_f, factor=32)
 
@@ -455,8 +455,8 @@ def split_k(out, rd_axis):
     a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_ki_f = split_k(CF, a_k_f)
     s[CF].reorder(a_k3_f, a_k2_f, a_k1_f, a_ko_f, a_xo_f, a_yo_f, a_ki_f, a_xi_f, a_yi_f)
 
-    (m, k) = CF.op.input_tensors[0].shape
-    (n, c, n_i, c_i) = CF.op.input_tensors[1].shape
+    (m, k) = CF.op.input_tensors[0].shape[-2:]
+    (n, c, n_i, c_i) = CF.op.input_tensors[1].shape[-4:]
     n = n * n_i
 
     s[CF].tensorize(a_ki_f, dot_32x128x32_u8s8s32_sapphirerapids(LDA=int(k)))
@@ -479,19 +479,6 @@ def split_k(out, rd_axis):
     return s, fused
 
 
-@autotvm.register_topi_schedule("dense_amx_int8.x86")
-def schedule_dense_amx_int8(cfg, outs):
-    """Create a schedule for dense_amx_int8"""
-    s = te.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if "dense_amx_int8" in op.tag:
-            dense_amx_int8_schedule(cfg, s, op.output(0), outs[0])
-
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
 def matmul_blas_common(cfg, tensor_a, tensor_b, bias, out_dtype, transpose_a, transpose_b, lib):
     """Compute matmul/dense using a BLAS library"""
     M, K = get_const_tuple(tensor_a.shape)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 619a0b5a9333..cdf4e734842b 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -520,6 +520,61 @@ def test_batch_matmul_vnni(b, m, n, k):
     np.testing.assert_equal(out, ref)
 
 
+@pytest.mark.skip("skip due to AMX feature not available yet")
+@pytest.mark.parametrize(
+    "b,m,n,k",
+    [
+        (16, 32, 32, 128),
+        (16, 32, 32, 127),
+        (16, 32, 31, 128),
+    ],
+)
+def test_batch_matmul_amx(b, m, n, k):
+    amx_init = tvm.get_global_func("runtime.amx_init")
+    amx_tileconfig = tvm.get_global_func("runtime.amx_tileconfig")
+    assert amx_init()
+    assert amx_tileconfig(16, 64)  # config tile size to 16 rows by 64 columns.
+
+    x_shape = (b, m, k)
+    y_shape = (b, n, k)
+    z_shape = (b, m, n)
+
+    for lhs_dtype in ["uint8", "int8"]:
+        x = relay.var("x", shape=x_shape, dtype=lhs_dtype)
+        y = relay.var("y", shape=y_shape, dtype="int8")
+        z = relay.var("z", shape=z_shape, dtype="int32")
+        bmm = relay.nn.batch_matmul(x, y, out_dtype="int32")
+        out = bmm + z
+        mod = tvm.IRModule.from_expr(out)
+
+        target = "llvm -mcpu=sapphirerapids"
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(mod, target=target)
+
+        asm = lib.lib.get_source("asm")
+        assert "tilezero" in asm
+        assert "tileloaddt1" in asm
+        assert "tdpbusd" in asm
+        assert "tilestored" in asm
+
+        dev = tvm.device(target, 0)
+        runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
+
+        x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype)
+        y_np = np.random.uniform(1, 10, size=y_shape).astype("int8")
+        z_np = np.random.uniform(1, 10, size=z_shape).astype("int32")
+
+        runtime.set_input("x", x_np)
+        runtime.set_input("y", y_np)
+        runtime.set_input("z", z_np)
+        runtime.run()
+
+        out = runtime.get_output(0).numpy()
+        ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + z_np
+
+        np.testing.assert_equal(out, ref)
+
+
 @pytest.mark.skip("Requires GFX10 AMDGPU")
 def test_batch_matmul_rocm_sdot4():
     x_shape = (16, 32, 96)

From 4c5264c1e2418e247307f17b1a41a6abf7bf5e0c Mon Sep 17 00:00:00 2001
From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com>
Date: Wed, 11 Jan 2023 11:03:26 +0300
Subject: [PATCH 156/286] [Hexagon][QNN] Improve performance wo QNN
 canonicalization (#13734)

This commit improves the performance of models tuned with MetaSchedule for the
Hexagon target when QNN canonicalization is disabled.

Benchmarking of several models, tuned with MS, on a Snapdragon 8gen1:

model            | QNN canon enabled, ms | QNN canon disabled, ms | speedup     |
-----------------|-----------------------|------------------------|-------------|
ResNet, int8     | 50                    | 48                     | +4.2%       |
Inception, int8  | 103                   | 106                    | -2.8%       |
SRGAN, int8      | 348                   | 431                    | -19.3%      |
--------------------------------------------------------------------------------|

What was done:
1) Added 2 new passes: QnnLegalize and QnnCanonicalize. These are just wrappers
   for Legalize("FTVMQnnLegalize") and Legalize("FTVMQnnCanonicalize").
2) Added the ability to disable inlining of specific blocks in the MetaSchedule
   AutoInline rule. For example, this can be done through
   T.block_attr({"meta_schedule.inline_rule": "disable"}).
3) Implemented compute, alter op and legalization functions for the qnn.conv2d
   operation (for the Hexagon target).
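For intuition on where the gains come from: when both requantize scales are compile-time constants, the Hexagon computes below (see qnn_requantize in python/tvm/topi/hexagon/qnn/nn.py) fold scale_input/scale_output into an int16 fixed-point multiplier instead of a float multiply. A self-contained sketch of the identity, with an illustrative multiplier/shift pair standing in for what get_fixed_point_value would return:

    # Illustrative values only: scale ~ 0.736, approximated as m / 2**rsh.
    m, rsh = 24117, 15  # 24117 / 32768 ~= 0.73596

    def requantize_fixed_point(q, zp_in, zp_out):
        # Integer form of round((q - zp_in) * scale) + zp_out; adding
        # 1 << (rsh - 1) before the shift implements round-to-nearest.
        return (((q - zp_in) * m + (1 << (rsh - 1))) >> rsh) + zp_out

    assert requantize_fixed_point(100, 2, 1) == round(98 * 0.736) + 1  # both 73

This trades a little accuracy for speed, since the multiplier is held in int16 rather than the int32 used by full QNN lowering.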
--- include/tvm/relay/transform.h | 4 + include/tvm/tir/stmt.h | 3 + python/tvm/relay/qnn/op/_qnn.py | 10 +- python/tvm/relay/qnn/op/legalizations.py | 70 ++++++ python/tvm/relay/qnn/strategy/hexagon.py | 13 ++ python/tvm/topi/hexagon/qnn/__init__.py | 1 + .../tvm/topi/hexagon/qnn/conv2d_alter_op.py | 53 +++++ python/tvm/topi/hexagon/qnn/nn.py | 208 +++++++++++++++--- python/tvm/topi/nn/qnn.py | 19 ++ .../schedule_rule/auto_inline.cc | 5 + src/relay/qnn/op/convolution.cc | 3 +- src/relay/qnn/op/requantize.cc | 9 +- src/relay/qnn/pass/legalize.cc | 22 +- .../test_wo_qnn_canonicalization.py | 121 +++++++++- 14 files changed, 499 insertions(+), 42 deletions(-) create mode 100644 python/tvm/topi/hexagon/qnn/conv2d_alter_op.py diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index cdea8e8e3c23..3227f7979d87 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -710,6 +710,10 @@ TVM_DLL Function UnCPS(const Function& f); */ TVM_DLL Expr DeDup(const Expr& e); +namespace legalize { +TVM_DLL Expr Legalize(const Expr& expr, const std::string& legalize_map_attr_name); +} // namespace legalize + } // namespace relay } // namespace tvm diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index dc257b1e8a21..96e03477a141 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -1613,6 +1613,9 @@ constexpr const char* meta_schedule_auto_tensorize_init = "meta_schedule.auto_te */ constexpr const char* warp_execution = "warp_execution"; +/*! \brief Mark that a block is disallowed in auto inline. */ +constexpr const char* meta_schedule_inline_rule = "meta_schedule.inline_rule"; + /*! * \brief Check if attr_key is a pragma key extension * \param attr_key The attr key to be compared diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py index 64ef1ee92a1c..c9c4c86e8b47 100644 --- a/python/tvm/relay/qnn/op/_qnn.py +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -22,7 +22,7 @@ from .. import strategy from ...op.op import register_compute from ...op.op import register_injective_schedule -from ...op.op import register_strategy, register_pattern, OpPattern +from ...op.op import register_strategy, register_pattern, register_alter_op_layout, OpPattern @register_compute("qnn.simulated_quantize") @@ -83,7 +83,13 @@ def simulated_dequantize_compute(attrs, inputs, output_type): # qnn.conv2d register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy) -register_pattern("qnn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) + + +@register_alter_op_layout("qnn.conv2d") +def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type): + """Alternate the layout of qnn.conv2d""" + return topi.nn.qnn_conv2d_alter_layout(attrs, inputs, tinfos, out_type) + # qnn.dense register_strategy("qnn.dense", strategy.qnn_dense_strategy) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index ad016bc20089..9baabf36a9d8 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -405,6 +405,11 @@ def is_fast_int8_on_intel(): return target_has_sse42(target.mcpu) +# Helper function to align up given value. +def helper_align_up(value, aligner): + return ((value + aligner) // aligner) * aligner + + ######################## # ARM CPU legalizations. ######################## @@ -483,3 +488,68 @@ def _qnn_dense_legalize_cuda(attrs, inputs, types): # CUDA prefers both datatypes to be the int8. 
return helper_change_dtypes_to_int8(attrs, inputs, types, relay.qnn.op.dense) return None + + +######################## +# Hexagon legalizations. +######################## + +IN_CHANNEL_VECTOR_LENGTH = 4 +OUT_CHANNEL_VECTOR_LENGTH = 32 + + +@qnn_conv2d_legalize.register("hexagon") +def _qnn_conv2d_legalize_hexagon(attrs, inputs, types): + """Legalize qnn.conv2d op for vrmpy tensorization. + + If the inputs are signed or unsigned int8 and data/kernel layouts are NCHW/OIHW, then the input + and output channels are padded to be a multiple of 4 and 32 respectively. + """ + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + + if data_layout != "NCHW" or kernel_layout != "OIHW": + return None + + data_tensor, kernel_tensor = types[0], types[1] + + if "int8" in data_tensor.dtype and "int8" in kernel_tensor.dtype: + in_channel = data_tensor.shape[1].value + out_channel = kernel_tensor.shape[0].value + ic_modified = False + oc_modified = False + data, kernel, input_zp, output_zp, input_scale, output_scale = inputs + + if in_channel % IN_CHANNEL_VECTOR_LENGTH != 0: + new_in_channel = helper_align_up(in_channel, IN_CHANNEL_VECTOR_LENGTH) + diff = new_in_channel - in_channel + pad_width = ((0, 0), (0, diff), (0, 0), (0, 0)) + data = relay.nn.pad(data, pad_width=pad_width) + kernel = relay.nn.pad(kernel, pad_width=pad_width) + ic_modified = True + + new_out_channel = out_channel + if out_channel % OUT_CHANNEL_VECTOR_LENGTH != 0: + new_out_channel = helper_align_up(out_channel, OUT_CHANNEL_VECTOR_LENGTH) + diff = new_out_channel - out_channel + kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0))) + oc_modified = True + + if ic_modified is True or oc_modified is True: + new_attrs = dict(attrs) + if oc_modified: + new_attrs["channels"] = new_out_channel + out = relay.qnn.op.conv2d( + data, kernel, input_zp, output_zp, input_scale, output_scale, **new_attrs + ) + output_tensor = types[6] + original_out_shape = list(output_tensor.shape) + out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape) + else: + out = relay.qnn.op.conv2d( + data, kernel, input_zp, output_zp, input_scale, output_scale, **new_attrs + ) + + return out + + return None diff --git a/python/tvm/relay/qnn/strategy/hexagon.py b/python/tvm/relay/qnn/strategy/hexagon.py index d17812e3fbcc..c25c96f8edb4 100644 --- a/python/tvm/relay/qnn/strategy/hexagon.py +++ b/python/tvm/relay/qnn/strategy/hexagon.py @@ -17,12 +17,18 @@ """Definition of Hexagon operator strategy.""" # pylint: disable=unused-argument,wildcard-import,unused-wildcard-import +import re + from tvm import topi from .generic import * from ... 
import op as _op from ...op.strategy.generic import is_depthwise_conv2d +NCHWC_MATCHER = re.compile("^NCHW[0-9]+c$") +OIHWIOI_MATCHER = re.compile("^OIHW[0-9]+i[0-9]+o[0-9]+i$") + + @qnn_quantize_strategy.register("hexagon") def qnn_quantize_strategy_hexagon(attrs, inputs, out_type, target): """qnn.quantize strategy for Hexagon""" @@ -135,6 +141,13 @@ def qnn_conv2d_strategy_hexagon(attrs, inputs, out_type, target): wrap_topi_schedule(topi.hexagon.schedule_qnn_conv2d), name="qnn_conv2d.hexagon", ) + elif NCHWC_MATCHER.match(data_layout) and OIHWIOI_MATCHER.match(kernel_layout): + if data.dtype == "uint8" and kernel.dtype == "int8": + strategy.add_implementation( + wrap_topi_qnn_conv2d(topi.hexagon.qnn_conv2d_NCHWc_int8), + wrap_topi_schedule(topi.hexagon.schedule_qnn_conv2d_NCHWc_int8), + name="qnn_conv2d_NCHWc_int8.hexagon", + ) elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups): if data_layout == "NCHW" and kernel_layout == "OIHW": strategy.add_implementation( diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py index d41d8854d7d1..b8cdc7a26d96 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -29,3 +29,4 @@ from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule from .adaptive_avg_pool1d import * from .global_avg_pool2d import * +from .conv2d_alter_op import * diff --git a/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py b/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py new file mode 100644 index 000000000000..867a477956e5 --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/conv2d_alter_op.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
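Both the legalization above and the alter-op hook in this new file revolve around the same blocking constraints: input channels in multiples of 4 and output channels in multiples of 32, so the vrmpy path can tensorize. A worked sketch of the padding arithmetic (note the helper assumes its callers' guard, i.e. it is only applied to values that are not already aligned):

    def helper_align_up(value, aligner):
        # Mirrors the helper defined earlier; valid because callers check
        # value % aligner != 0 first.
        return ((value + aligner) // aligner) * aligner

    assert helper_align_up(3, 4) == 4     # pad the 'I' dim: 3 -> 4
    assert helper_align_up(20, 32) == 32  # pad the 'O' dim: 20 -> 32
    # Extra output channels are trimmed back with strided_slice, so the
    # legalized graph stays numerically identical to the original.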
+"""QNN Conv2d alter op functions for Hexagon""" + +from tvm import relay +from ...nn import qnn_conv2d_alter_layout +from ...utils import get_const_tuple + + +@qnn_conv2d_alter_layout.register("hexagon") +def _alter_qnn_conv2d_layout(attrs, inputs, tinfos, _out_type): + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor, _, _, _, _ = tinfos + + if ( + "int8" in data_tensor.dtype + and "int8" in kernel_tensor.dtype + and data_layout == "NCHW" + and kernel_layout == "OIHW" + ): + out_channel, in_channel, _, _ = get_const_tuple(kernel_tensor.shape) + + if out_channel % 32 != 0 or in_channel % 4 != 0: + return None + + n_elems = 4 + oc_bn = 32 + ic_bn = min(in_channel, 32) + + new_attrs = dict(attrs) + new_attrs["channels"] = out_channel + new_attrs["data_layout"] = "NCHW%dc" % ic_bn + new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems) + new_attrs["out_layout"] = "NCHW%dc" % oc_bn + + return relay.qnn.op.conv2d(*inputs, **new_attrs) + + return None diff --git a/python/tvm/topi/hexagon/qnn/nn.py b/python/tvm/topi/hexagon/qnn/nn.py index 49220d0fd013..aabdf2a63b8b 100644 --- a/python/tvm/topi/hexagon/qnn/nn.py +++ b/python/tvm/topi/hexagon/qnn/nn.py @@ -17,14 +17,17 @@ """Hexagon QNN operators""" # pylint: disable=invalid-name +import numpy as np + import tvm from tvm import te, topi -from ..utils import saturate +from ..utils import saturate, get_fixed_point_value from ...utils import get_const_tuple from ...nn.utils import get_pad_tuple from ...nn.pad import pad from ... import tag, nn -from ...x86.concat import concatenate +from ..conv2d import conv2d_NCHWc_int8 +from ...transform import concatenate def clip_cast(val, dtype): @@ -36,7 +39,9 @@ def clip_cast(val, dtype): # Return True if given Tensor is scalar constant value. def is_constant(tensor: te.Tensor): - return tensor.ndim == 0 + return tensor.ndim == 0 and ( + isinstance(tensor.op.body[0], (tvm.tir.expr.FloatImm, tvm.tir.expr.IntImm)) + ) def get_qnn_param(param, indices, axis): @@ -65,6 +70,11 @@ def default_schedule(outs): outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs s = tvm.te.create_schedule([x.op for x in outs]) tvm.te.schedule.AutoInlineInjective(s) + for x in outs: + fused = s[x].fuse(*x.op.axis) + outer, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize) + s[x].vectorize(inner) + s[x].parallel(outer) return s @@ -140,30 +150,58 @@ def schedule_qnn_dequantize(outs): return default_schedule(outs) -def qnn_requantize(data, input_scale, input_zp, output_scale, output_zp, axis=-1, out_dtype="int8"): +def qnn_requantize( + data: te.Tensor, + input_scale: te.Tensor, + input_zp: te.Tensor, + output_scale: te.Tensor, + output_zp: te.Tensor, + axis=-1, + out_dtype="int8", +): """Compute for qnn.requantize - Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input)) + If both input and output scales are constant scalars then we convert scale to fixed point value + and use integer arithmetic only for performance optimization purpose. + But this is a tradeoff between performance and accuracy, since we use int16 data type to + represent fixed point values (against QNN lowering approach where we use int32 for that). + + if input and/or output scales are not constant scalars then we use the following formula: + Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input)) TODO: support 'rounding' and 'compute_dtype' arguments. 
""" - def _compute(*indices): - value = data(*indices) + if is_constant(input_scale) and is_constant(output_scale): + iscale = input_scale.op.body[0].value + oscale = output_scale.op.body[0].value + scale = iscale / oscale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + + def _compute(*indices): + value = data(*indices) + # Subtract input zero point: + sub = te.subtract(value, input_zp) + # Fixed point multiply + roundup delta: + mul = (sub * scale_fixed_point + (1 << (rsh - 1))) >> rsh + # Add output zero point + clip + cast: + return saturate(te.add(mul, output_zp), out_dtype).astype(out_dtype) - iscale = get_qnn_param(input_scale, indices, axis) - oscale = get_qnn_param(output_scale, indices, axis) + return te.compute(data.shape, _compute) + + else: - sub = te.subtract(value, input_zp) - mul = te.div(iscale, oscale) - val = te.add(te.round(te.multiply(mul, sub)), output_zp) + def _compute(*indices): + value = data(*indices) + iscale = get_qnn_param(input_scale, indices, axis) + oscale = get_qnn_param(output_scale, indices, axis) - # clip + cast: - const_min = tvm.tir.min_value(out_dtype) - const_max = tvm.tir.max_value(out_dtype) - return te.max(tvm.te.min(val, const_max), const_min).astype(out_dtype) + sub = te.subtract(value, input_zp) + mul = te.div(iscale, oscale) + val = te.add(te.round(te.multiply(mul, sub)), output_zp) + return saturate(val, out_dtype).astype(out_dtype) - return te.compute(data.shape, _compute) + return te.compute(data.shape, _compute) def schedule_qnn_requantize(outs): @@ -188,9 +226,15 @@ def compute_qnn_binary_op( ): """Compute for QNN binary operation - Q_output = output_zp + round((lhs_scale)/(output_scale) * (lhs_input - lhs_zp)) - _OP_ round((rhs_scale)/(output_scale) * (rhs_input - rhs_zp)) - where _OP_ is add/subtract + If rhs/lhs/output scales are constant scalars then we convert scale to fixed point value + and use integer arithmetic only for performance optimization purpose. + But this is a tradeoff between performance and accuracy, since we use int16 data type to + represent fixed point values (against QNN lowering approach where we use int32 for that). + + if rhs/lhs/output scales are not constant scalars then we use the following formula: + Q_output = output_zp + round((lhs_scale)/(output_scale) * (lhs_input - lhs_zp)) + _OP_ round((rhs_scale)/(output_scale) * (rhs_input - rhs_zp)) + where _OP_ is add/subtract """ assert lhs.dtype == rhs.dtype dtype = lhs.dtype @@ -200,13 +244,24 @@ def _compute_const(x: te.Tensor, iscale, input_zp): "int32" ) - def _compute_tensor(x: te.Tensor, iscale, input_zp): - return te.compute( - x.shape, - lambda *i: te.round( - te.multiply(te.div(iscale, output_scale), te.subtract(x(*i), input_zp)) - ).astype("int32"), - ) + def _compute_tensor(x: te.Tensor, input_scale, input_zp): + if is_constant(input_scale) and is_constant(output_scale): + iscale = input_scale.op.body[0].value + oscale = output_scale.op.body[0].value + scale = iscale / oscale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + return te.compute( + x.shape, + lambda *i: (te.subtract(x(*i), input_zp) * scale_fixed_point + (1 << (rsh - 1))) + >> rsh, + ) + else: + return te.compute( + x.shape, + lambda *i: te.round( + te.multiply(te.div(input_scale, output_scale), te.subtract(x(*i), input_zp)) + ).astype("int32"), + ) if is_constant(lhs): lhs_tensor = _compute_const(lhs, lhs_scale, lhs_zp) @@ -391,7 +446,7 @@ def qnn_concatenate(data, axis, out_dtype): # Requantize tensors and add them to the list. 
args.append(qnn_requantize(tensor, i_scale, i_zp, o_scale, o_zp, out_dtype=out_dtype)) - # Call x86 implementation of concatenate. + # Call generic implementation of concatenate. return concatenate(args, axis) @@ -454,6 +509,15 @@ def qnn_conv2d( # Conv2d inputs get_const_tuple(padding), (dilated_kernel_h, dilated_kernel_w) ) + # Subtract zero point from weights. axis=0 in get_qnn_param means 'O' dimension in "OIHW" + # weights layout. + weight = te.compute( + weight.shape, + lambda *indices: te.subtract( + weight(*indices), get_qnn_param(kernel_zero_point, indices, axis=0) + ), + ) + # Subtract zero point from input and then do padding with 0 value data = te.compute(data.shape, lambda *indices: te.subtract(data(*indices), input_zero_point)) @@ -469,7 +533,6 @@ def qnn_conv2d( # Conv2d inputs kh = te.reduce_axis((0, kernel_height), name="kh") kw = te.reduce_axis((0, kernel_width), name="kw") - # axis=0 in get_qnn_param means 'O' dimension in "OIHW" weights layout. out = te.compute( oshape, lambda n, oc, oh, ow: te.sum( @@ -479,9 +542,7 @@ def qnn_conv2d( # Conv2d inputs oh * height_stride + kh * dilation_h, ow * width_stride + kw * dilation_w, ].astype("int32") - * te.subtract( - weight[oc, ic, kh, kw], get_qnn_param(kernel_zero_point, (oc, ic, kh, kw), axis=0) - ).astype("int32"), + * weight[oc, ic, kh, kw].astype("int32"), axis=[ic, kh, kw], ), ) @@ -532,6 +593,89 @@ def schedule_qnn_conv2d(outs): return default_schedule(outs) +def qnn_conv2d_NCHWc_int8( # Conv2d inputs + data, + weight, + # Conv2d quantization params: + input_zero_point, + kernel_zero_point, + _input_scale, + _kernel_scale, + # bias + bias, + # Requantization params: + rq_input_scale, + rq_input_zero_point, + rq_output_scale, + rq_output_zero_point, + # Conv2d attributes: + strides, + padding, + dilation, + _oshape, + odtype, +): + """Compute for qnn.conv2d with NCHWc layout.""" + # Subtract zero point from weights. Need to disable inline of this block + # (meta_schedule.inline_rule = disable). Otherwise, inline prevents from tensorization. + weight = te.compute( + weight.shape, + lambda *i: te.subtract(weight(*i), kernel_zero_point).astype(weight.dtype), + name="weight_zp", + attrs={"meta_schedule.inline_rule": "disable"}, + ) + + # Subtract zero point from input. Again need to disable inline of this block + # (meta_schedule.inline_rule = disable). Otherwise, inline prevents from tensorization. 
+ data = te.compute( + data.shape, + lambda *i: te.subtract(data(*i), input_zero_point).astype(data.dtype), + name="data_zp", + attrs={"meta_schedule.inline_rule": "disable"}, + ) + + strides = get_const_tuple(strides) + padding = get_const_tuple(padding) + dilation = get_const_tuple(dilation) + out = conv2d_NCHWc_int8(data, weight, strides, padding, dilation, "NCHW32c", "NCHW32c") + + # Add bias + if bias is not None: + assert len(out.shape) == len(bias.shape) + assert bias.shape[2] == 1 and bias.shape[3] == 1 + out = te.compute( + out.shape, lambda n, c, h, w, ci: out[n, c, h, w, ci] + bias[n, c, 0, 0, ci] + ) + + # Requantize output of convolution + # Q_output = zp_output + round((scale_input)/(scale_output) * (Q_input - zp_input)) + if rq_input_scale is not None and rq_output_scale is not None: + # Now supported only scalar and 1D quantization parameters + assert len(rq_input_scale.shape) == 0 or len(rq_input_scale.shape) == 1 + assert len(rq_output_scale.shape) == 0 or len(rq_output_scale.shape) == 1 + axis = -1 + if len(rq_input_scale.shape) == 1 or len(rq_output_scale.shape) == 1: + axis = 1 # Axis param should correspond to 'C' dimension. + + return qnn_requantize( + out, + rq_input_scale, + rq_input_zero_point, + rq_output_scale, + rq_output_zero_point, + axis, + odtype, + ) + + return out + + +def schedule_qnn_conv2d_NCHWc_int8(outs): + """Schedule for qnn.conv2d with NCHWc layout.""" + + return default_schedule(outs) + + def qnn_depthwise_conv2d( # Conv2d inputs data, weight, diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py index 222f7a7c223e..7a29266b087c 100644 --- a/python/tvm/topi/nn/qnn.py +++ b/python/tvm/topi/nn/qnn.py @@ -236,3 +236,22 @@ def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type): Unlike other TOPI functions, this function operates on both graph level and operator level. """ return None + + +@tvm.target.generic_func +def qnn_conv2d_alter_layout(_attrs, _inputs, _tinfos, _out_type): + """Change qnn.conv2D layout. + Not to change by default + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + """ + return None diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc index d2d48b9008ce..22e839692525 100644 --- a/src/meta_schedule/schedule_rule/auto_inline.cc +++ b/src/meta_schedule/schedule_rule/auto_inline.cc @@ -139,6 +139,11 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch, } } } + // Cond 6. 
The block is disallowed for auto inline
+  if (Optional<String> ann =
+          tir::GetAnn<String>(block_sref, tir::attr::meta_schedule_inline_rule)) {
+    if (ann.value() == "disable") return InlineType::kNoInline;
+  }
   // Last cond: Check inline into the consumers or the spatial producer
   tir::StmtSRef scope_block = tir::GetScopeRoot(sch->state(), block_sref,
                                                 /*require_stage_pipeline=*/false);
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 2170ba76e060..f5ac6af1dfd1 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -860,7 +860,8 @@ operator to understand how to scale back the int32 output to (u)int8 or (u)int16
     .add_type_rel("QnnConv2D", QnnConv2DRel)
     .set_attr<TNonComputational>("TNonComputational", true)
     .set_attr<FTVMLegalize>("FTVMQnnCanonicalize", QnnConv2DCanonicalize)
-    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnConvInferCorrectLayout);
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", QnnConvInferCorrectLayout)
+    .set_attr<TOpPattern>("TOpPattern", kOutEWiseFusable);
 
 TVM_REGISTER_GLOBAL("relay.qnn.op._make.conv2d").set_body_typed(MakeQnnConv2D);
 
diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc
index 91df4a287ca7..e1d27ee53603 100644
--- a/src/relay/qnn/op/requantize.cc
+++ b/src/relay/qnn/op/requantize.cc
@@ -76,10 +76,17 @@ InferCorrectLayoutOutput RequantizeInferCorrectLayout(const Attrs& attrs,
       if (old_dim == layout_dim) {
         new_axis = tvm::Integer(axis_index);
       }
-      // Collect only the primal axis.
+
       if (layout_axis.IsPrimal()) {
         new_layout_string += layout_dim;
         axis_index++;
+      } else {
+        // Propagate layout if input_zero_point and input_scale are scalar values.
+        ICHECK_GE(old_in_types.size(), 3);
+        if (IsScalarType(old_in_types[1]) && IsScalarType(old_in_types[2])) {
+          new_layout_string += std::to_string(new_in_layouts[0].FactorOf(layout_axis)) + layout_dim;
+          axis_index++;
+        }
       }
     }
 
diff --git a/src/relay/qnn/pass/legalize.cc b/src/relay/qnn/pass/legalize.cc
index a5906cf5e694..fd88c4df8c06 100644
--- a/src/relay/qnn/pass/legalize.cc
+++ b/src/relay/qnn/pass/legalize.cc
@@ -30,10 +30,28 @@ namespace qnn {
 namespace transform {
 
+// QnnLegalize pass is a wrapper for the relay::legalize::Legalize pass.
+Pass QnnLegalize() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, relay::transform::PassContext)> pass_func =
+      [=](Function f, IRModule m, relay::transform::PassContext pc) {
+        return Downcast<Function>(relay::legalize::Legalize(f, "FTVMQnnLegalize"));
+      };
+  return relay::transform::CreateFunctionPass(pass_func, 1, "QnnLegalize", {"InferType"});
+}
+
+// QnnCanonicalize pass is a wrapper for the relay::legalize::Legalize pass.
+Pass QnnCanonicalize() {
+  runtime::TypedPackedFunc<Function(Function, IRModule, relay::transform::PassContext)> pass_func =
+      [=](Function f, IRModule m, relay::transform::PassContext pc) {
+        return Downcast<Function>(relay::legalize::Legalize(f, "FTVMQnnCanonicalize"));
+      };
+  return relay::transform::CreateFunctionPass(pass_func, 1, "QnnCanonicalize", {"InferType"});
+}
+
 Pass Legalize() {
   Array<Pass> pass_seqs;
-  pass_seqs.push_back(relay::transform::Legalize("FTVMQnnLegalize"));
-  pass_seqs.push_back(relay::transform::Legalize("FTVMQnnCanonicalize"));
+  pass_seqs.push_back(QnnLegalize());
+  pass_seqs.push_back(QnnCanonicalize());
   relay::transform::Pass seq = relay::transform::Sequential(pass_seqs, "qnn.Legalize");
   return seq;
 }
diff --git a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
index 06e738d9b70e..e583b1b5eac8 100644
--- a/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
+++ b/tests/python/contrib/test_hexagon/test_wo_qnn_canonicalization.py
@@ -59,7 +59,7 @@ def execute(mod_executor, inputs: dict):
 
 
 def build_hexagon_module(mod):
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["qnn.Legalize"]):
+    with tvm.transform.PassContext(opt_level=3, disabled_pass=["QnnCanonicalize"]):
         hexagon_lowered = tvm.relay.build(
             mod,
             tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, host=HEXAGON_AOT_LLVM_TARGET),
@@ -87,7 +87,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
     weight_shape = [16, 8, 3, 3]
     data = relay.var("data", shape=data_shape, dtype="float32")
     weight = relay.var("weight", shape=weight_shape, dtype="float32")
-    op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="int8")
+    op0 = relay.qnn.op.quantize(data, relay.const(0.078), relay.const(0), out_dtype="uint8")
     op1 = relay.qnn.op.quantize(weight, relay.const(0.07), relay.const(0), out_dtype="int8")
     op2 = relay.qnn.op.conv2d(
         op0,
@@ -116,7 +116,7 @@ def test_qnn_conv2d_rq(hexagon_session: Session):
     # Reference compilation
     llvm_lowered = build_ref_module(relay_mod)
 
-    data_np = np.random.rand(*data_shape) - 0.5
+    data_np = np.random.rand(*data_shape)
     weight_np = np.random.rand(*weight_shape) - 0.5
     inputs = {"data": data_np, "weight": weight_np}
 
@@ -181,7 +181,8 @@ def test_qnn_dense_bias_rq(hexagon_session: Session):
     llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev))
     llvm_out = execute(llvm_m, inputs)
 
-    np.testing.assert_equal(hexagon_output, llvm_out)
+    # Diff by 1 is Ok.
+ tvm.testing.assert_allclose(hexagon_output, llvm_out, atol=1) class TestQnnBinaryOp: @@ -278,5 +279,117 @@ def test_qnn_binary_op_scalar(self, hexagon_session: Session, operation): tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) +class TestQnnOp: + """QNN op test class""" + + @tvm.testing.requires_hexagon + def test_qnn_requantize(self, hexagon_session: Session): + """qnn.requantize test without QNN canonicalization.""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="int32") + + op = relay.qnn.op.requantize( + data, + input_scale=relay.const(0.156), + input_zero_point=relay.const(2), + output_scale=relay.const(0.212), + output_zero_point=relay.const(1), + out_dtype="int8", + ) + mod = tvm.IRModule.from_expr(op) + + # Compile for Hexagon + hexagon_lowered = build_hexagon_module(mod) + + # Reference compilation + llvm_lowered = build_ref_module(mod) + + data_np = np.arange(-256, 256, 2, dtype="int32") + inputs = {"data": data_np} + + hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = execute(hx_m, inputs) + + dev = tvm.cpu(0) + llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) + llvm_output = execute(llvm_m, inputs) + + np.testing.assert_equal(hexagon_output, llvm_output) + + @tvm.testing.requires_hexagon + def test_qnn_concatenate(self, hexagon_session: Session): + """qnn.concatenate op test without QNN canonicalization.""" + x_shape = [1, 64] + y_shape = [2, 64] + z_shape = [3, 64] + input_x = relay.var("x", shape=x_shape, dtype="uint8") + input_y = relay.var("y", shape=y_shape, dtype="uint8") + input_z = relay.var("z", shape=z_shape, dtype="uint8") + + op = relay.qnn.op.concatenate( + (input_x, input_y, input_z), + input_scales=(relay.const(0.3), relay.const(0.7), relay.const(1.3)), + input_zero_points=(relay.const(0), relay.const(1), relay.const(2)), + output_scale=relay.const(0.8), + output_zero_point=relay.const(5), + axis=0, + ) + mod = tvm.IRModule.from_expr(op) + + # Compile for Hexagon + hexagon_lowered = build_hexagon_module(mod) + + # Reference compilation + llvm_lowered = build_ref_module(mod) + + x_np = np.arange(0, 64, 1, dtype="uint8").reshape(x_shape) + y_np = np.arange(0, 128, 1, dtype="uint8").reshape(y_shape) + z_np = np.arange(0, 192, 1, dtype="uint8").reshape(z_shape) + inputs = {"x": x_np, "y": y_np, "z": z_np} + + hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = execute(hx_m, inputs) + + dev = tvm.cpu(0) + llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) + llvm_output = execute(llvm_m, inputs) + + # Diff by 1 is Ok. 
+ tvm.testing.assert_allclose(hexagon_output, llvm_output, atol=1) + + @tvm.testing.requires_hexagon + def test_qnn_tanh(self, hexagon_session: Session): + """qnn.tanh op test without QNN canonicalization.""" + data_shape = [256] + data = relay.var("data", shape=data_shape, dtype="uint8") + + op = relay.qnn.op.tanh( + data, + scale=relay.const(0.518), + zero_point=relay.const(137), + output_scale=relay.const(0.207), + output_zero_point=relay.const(128), + ) + mod = tvm.IRModule.from_expr(op) + + # Compile for Hexagon + hexagon_lowered = build_hexagon_module(mod) + + # Reference compilation + llvm_lowered = build_ref_module(mod) + + data_np = np.arange(0, 256, 1, dtype="uint8") + inputs = {"data": data_np} + + hx_m = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = execute(hx_m, inputs) + + dev = tvm.cpu(0) + llvm_m = tvm.runtime.executor.AotModule(llvm_lowered["default"](dev)) + llvm_output = execute(llvm_m, inputs) + + np.testing.assert_equal(hexagon_output, llvm_output) + + if __name__ == "__main__": tvm.testing.main() From 7eae01f0773a3b5cb2534f03fd7cb6baa1844ea9 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 11 Jan 2023 08:51:45 -0800 Subject: [PATCH 157/286] [CI][Docker][Cortex-M]Update scripts to update ci_cortexm to Ubuntu 20.04 (#13736) - Updates nrfjprog to latest (10.18.1) to fix issues related to running microTVM tests inside docker. The main issue was that with older version commands like nrfjprog --com was not working even with docker --privileged. - Update python install script to support python 3.7 in 20.04 -> we keep using python 3.7 in ubuntu 20.04 until its EOL - Add script to install LLVM in ubuntu 20.04 --- .../base-box/base_box_setup_common.sh | 2 +- docker/Dockerfile.ci_arm | 2 +- docker/Dockerfile.ci_cortexm | 8 ++--- docker/Dockerfile.ci_cpu | 2 +- docker/Dockerfile.ci_gpu | 2 +- docker/Dockerfile.ci_hexagon | 2 +- docker/Dockerfile.ci_i386 | 2 +- docker/Dockerfile.ci_lint | 2 +- docker/Dockerfile.ci_minimal | 2 +- docker/Dockerfile.ci_riscv | 2 +- docker/Dockerfile.ci_wasm | 2 +- docker/Dockerfile.demo_android | 2 +- docker/Dockerfile.demo_rocm | 2 +- docker/Dockerfile.demo_vitis_ai | 2 +- docker/install/ubuntu2004_install_llvm.sh | 35 +++++++++++++++++++ docker/install/ubuntu_install_nrfjprog.sh | 24 +++++++++---- docker/install/ubuntu_install_python.sh | 23 ++++++++++-- 17 files changed, 89 insertions(+), 27 deletions(-) create mode 100755 docker/install/ubuntu2004_install_llvm.sh diff --git a/apps/microtvm/reference-vm/base-box/base_box_setup_common.sh b/apps/microtvm/reference-vm/base-box/base_box_setup_common.sh index f0c1d2c6bef0..c0d8c892f764 100755 --- a/apps/microtvm/reference-vm/base-box/base_box_setup_common.sh +++ b/apps/microtvm/reference-vm/base-box/base_box_setup_common.sh @@ -46,7 +46,7 @@ sudo apt-install-and-clear -y --no-install-recommends \ cmake=3.22.2-0kitware1ubuntu18.04.1 cmake-data=3.22.2-0kitware1ubuntu18.04.1 \ # Python -sudo ~/ubuntu_install_python.sh +sudo ~/ubuntu_install_python.sh 3.7 rm -f ~/ubuntu_install_python.sh # Poetry deps diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm index 9bb34589b5f9..bd2b2d8fb145 100644 --- a/docker/Dockerfile.ci_arm +++ b/docker/Dockerfile.ci_arm @@ -56,7 +56,7 @@ RUN bash /install/ubuntu_install_llvm.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash 
/install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index 29a19454a9ee..50062d9dea35 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -17,7 +17,7 @@ # CI docker CPU env # tag: v0.62 -FROM ubuntu:18.04 +FROM ubuntu:20.04 COPY utils/apt-install-and-clear.sh /usr/local/bin/apt-install-and-clear @@ -38,15 +38,15 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh -COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh -RUN bash /install/ubuntu1804_install_llvm.sh +COPY install/ubuntu2004_install_llvm.sh /install/ubuntu2004_install_llvm.sh +RUN bash /install/ubuntu2004_install_llvm.sh # Rust env (build early; takes a while) COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 9436f1758e14..74e90670ab23 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -37,7 +37,7 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 9917e4cc78a7..daf6381908d8 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -44,7 +44,7 @@ RUN bash /install/ubuntu_install_googletest.sh /googletest ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon index 2be11f034515..bf4e33fc7c18 100644 --- a/docker/Dockerfile.ci_hexagon +++ b/docker/Dockerfile.ci_hexagon @@ -40,7 +40,7 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.8 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.8 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386 index 9a2e08eaab76..4dbf94a22e6d 100644 --- a/docker/Dockerfile.ci_i386 +++ b/docker/Dockerfile.ci_i386 @@ -52,7 +52,7 @@ ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 860a43fa2194..f7d5838877ba 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -29,7 +29,7 @@ RUN apt-install-and-clear -y wget git sudo make parallel ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal index 8ebcc6c9b9f2..ec6aa78b2fc0 100644 --- a/docker/Dockerfile.ci_minimal +++ b/docker/Dockerfile.ci_minimal @@ -37,7 +37,7 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index 3dd1943d27e1..a640e996c7be 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -38,7 +38,7 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm index 3e794c312c66..e69f0774008a 100644 --- a/docker/Dockerfile.ci_wasm +++ b/docker/Dockerfile.ci_wasm @@ -35,7 +35,7 @@ RUN bash /install/ubuntu_install_googletest.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android index cb5a9e0015ab..ebfacabb4e67 100644 --- a/docker/Dockerfile.demo_android +++ b/docker/Dockerfile.demo_android @@ -31,7 +31,7 @@ RUN bash /install/ubuntu_install_core.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh +RUN bash /install/ubuntu1804_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
diff --git a/docker/Dockerfile.demo_rocm b/docker/Dockerfile.demo_rocm index b4cb83f7f68c..79bd2cb9be11 100644 --- a/docker/Dockerfile.demo_rocm +++ b/docker/Dockerfile.demo_rocm @@ -29,7 +29,7 @@ RUN bash /install/ubuntu_install_core.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai index c90091e7ecd2..df56c6326d49 100644 --- a/docker/Dockerfile.demo_vitis_ai +++ b/docker/Dockerfile.demo_vitis_ai @@ -35,7 +35,7 @@ RUN bash /install/ubuntu_install_vitis_ai_core.sh ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh -RUN bash /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh 3.7 ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. diff --git a/docker/install/ubuntu2004_install_llvm.sh b/docker/install/ubuntu2004_install_llvm.sh new file mode 100755 index 000000000000..a0c4c8e73183 --- /dev/null +++ b/docker/install/ubuntu2004_install_llvm.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -euxo pipefail + +echo deb http://apt.llvm.org/focal/ llvm-toolchain-focal main\ + >> /etc/apt/sources.list.d/llvm.list + +echo deb http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main\ + >> /etc/apt/sources.list.d/llvm.list + +apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 15CF4D18AF4F7421 + +apt-get update && apt-install-and-clear -y \ + llvm-9 llvm-10 llvm-11 llvm-12 llvm-13 \ + clang-9 libclang-9-dev \ + clang-10 libclang-10-dev \ + clang-11 libclang-11-dev \ + clang-12 libclang-12-dev \ + clang-13 libclang-13-dev diff --git a/docker/install/ubuntu_install_nrfjprog.sh b/docker/install/ubuntu_install_nrfjprog.sh index 1a82f057a950..7baedf54fb3b 100755 --- a/docker/install/ubuntu_install_nrfjprog.sh +++ b/docker/install/ubuntu_install_nrfjprog.sh @@ -21,19 +21,29 @@ set -u set -o pipefail set -x +TEMP_PATH_NAME="nrfjprog_tmp_path" + +# Install JLink NRF_COMMANDLINE_TOOLS_FILE=nRFCommandLineToolsLinuxamd64.tar.gz -NRF_COMMANDLINE_TOOLS_URL=https://nsscprodmedia.blob.core.windows.net/prod/software-and-other-downloads/desktop-software/nrf-command-line-tools/sw/versions-10-x-x/10-12-1/nrfcommandlinetools10121linuxamd64.tar.gz -NRF_COMMANDLINE_TOOLS_INSTALLER=nRF-Command-Line-Tools_10_12_1_Linux-amd64.deb -JLINK_LINUX_INSTALLER=JLink_Linux_V688a_x86_64.deb +NRF_COMMANDLINE_TOOLS_URL=https://nsscprodmedia.blob.core.windows.net/prod/software-and-other-downloads/desktop-software/nrf-command-line-tools/sw/versions-10-x-x/10-18-1/nrf-command-line-tools-10.18.1_linux-amd64.tar.gz +JLINK_LINUX_INSTALLER=JLink_Linux_V780c_x86_64.deb +NRF_COMMANDLINE_TOOLS_SHA=5611536ca3377d64131ccd51232f9e33cde6d289b03ea33db0581a1288be8b0b10f995e2d60fdd4a3ce5a5c7b12bc85ddc672b282c9af8c5808707ab41543a7d cd ~ -mkdir -p nrfjprog +mkdir -p ${TEMP_PATH_NAME} wget --no-verbose -O $NRF_COMMANDLINE_TOOLS_FILE $NRF_COMMANDLINE_TOOLS_URL +echo "$NRF_COMMANDLINE_TOOLS_SHA $NRF_COMMANDLINE_TOOLS_FILE" | sha512sum --check -cd nrfjprog +cd ${TEMP_PATH_NAME} tar -xzvf "../${NRF_COMMANDLINE_TOOLS_FILE}" apt-install-and-clear -y "./${JLINK_LINUX_INSTALLER}" -apt-install-and-clear -y "./${NRF_COMMANDLINE_TOOLS_INSTALLER}" + +# Install nrfjprog +NRF_DEB_FILE=nrf-command-line-tools_amd64.deb +NRF_DEB_FILE_SHA=1f0339e16d50345ddde9757c2a4211361bcc78ff7371aac09decfffa809d86329001f5bc135f33dd154000a8f0da8bee4a0e80d3865ceff229f63ff9ace5ea95 +wget --no-verbose -O $NRF_DEB_FILE https://nsscprodmedia.blob.core.windows.net/prod/software-and-other-downloads/desktop-software/nrf-command-line-tools/sw/versions-10-x-x/10-18-1/nrf-command-line-tools_10.18.1_amd64.deb +echo "$NRF_DEB_FILE_SHA $NRF_DEB_FILE" | sha512sum --check +apt-install-and-clear -y ./$NRF_DEB_FILE cd .. -rm -rf nrfjprog "${NRF_COMMANDLINE_TOOLS_FILE}" +rm -rf ${TEMP_PATH_NAME} "${NRF_COMMANDLINE_TOOLS_FILE}" diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh index eb6dcec45c5d..85a8cc48a95a 100755 --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -27,17 +27,35 @@ if [ -z "${TVM_VENV+x}" ]; then exit 2 fi +if [ "$#" -lt 1 ]; then + echo "Usage: docker/install/ubuntu_install_python.sh <python_version>" + exit -1 +fi +PYTHON_VERSION=$1 + +if [ "${PYTHON_VERSION}" != "3.7" ] && [ "${PYTHON_VERSION}" != "3.8" ]; then + echo "Only 3.7 and 3.8 versions are supported in this script." + exit -1 +fi + apt-get update # Ensure lsb-release is installed.
apt-install-and-clear -y \ lsb-core +apt-install-and-clear -y software-properties-common + release=$(lsb_release -sc) if [ "${release}" == "bionic" ]; then - PYTHON_VERSION=3.7 + if [ "${PYTHON_VERSION}" == "3.8" ]; then + echo "Only 3.7 is supported for bionic in this script." + exit -1 + fi elif [ "${release}" == "focal" ]; then - PYTHON_VERSION=3.8 + if [ "${PYTHON_VERSION}" == "3.7" ]; then + add-apt-repository -y ppa:deadsnakes/ppa + fi else echo "Don't know which version of python to install for lsb-release ${release}" exit 2 @@ -45,7 +63,6 @@ fi # Install python and pip. Don't modify this to add Python package dependencies, # instead modify install_python_package.sh -apt-install-and-clear -y software-properties-common apt-install-and-clear -y \ acl \ python${PYTHON_VERSION} \ From 3f8926acd8631604620cebca25de8425598f1ddc Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Wed, 11 Jan 2023 12:47:24 -0800 Subject: [PATCH 158/286] [Fix,TOPI] Consolidate generic and x86 scatter nd (#13755) The generic scatter nd was almost identical to the x86 one and was not tested. They now are one and the same. --- python/tvm/relay/op/strategy/x86.py | 2 +- python/tvm/topi/scatter.py | 55 ++++---- python/tvm/topi/x86/__init__.py | 1 - python/tvm/topi/x86/scatter.py | 119 ------------------ tests/python/topi/python/test_topi_scatter.py | 4 - 5 files changed, 26 insertions(+), 155 deletions(-) delete mode 100644 python/tvm/topi/x86/scatter.py diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index d0ad377203c9..fa002737a7b0 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -768,7 +768,7 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type, target): """scatter_nd x86 strategy""" strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_scatter_nd(topi.x86.scatter_nd), + wrap_compute_scatter_nd(topi.scatter_nd), wrap_topi_schedule(topi.generic.schedule_extern), name="scatter_nd.x86", plevel=10, diff --git a/python/tvm/topi/scatter.py b/python/tvm/topi/scatter.py index afb0d6633a2b..e0578aab41b9 100644 --- a/python/tvm/topi/scatter.py +++ b/python/tvm/topi/scatter.py @@ -16,8 +16,8 @@ # under the License. # pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks """Scatter operator""" -from ..tir import decl_buffer, ir_builder, AssertStmt, StringImm, Evaluate, expr from ..te import extern, hybrid +from ..tir import decl_buffer, expr, ir_builder @hybrid.script @@ -268,6 +268,7 @@ def scatter_nd(data, indices, updates, mode): _verify_scatter_nd_inputs(data, indices, updates) def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): + # pylint: disable=invalid-name ib = ir_builder.create() data = ib.buffer_ptr(data_ptr) @@ -275,56 +276,50 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): updates = ib.buffer_ptr(updates_ptr) out = ib.buffer_ptr(out_ptr) - fused_shape = 1 - for i in data.shape: - fused_shape *= i - with ib.for_range(0, fused_shape) as i: - out[i] = data[i] - # We combine all the indices dimensions but the first one into a single # dimension so we can iterate it in single loop instead of an arbitrary - # number of loops. We do the same thing for all the data dimensions. + # number of loops. We do the same thing for all the update dimensions. 
fused_indices_dimension = 1 for i in indices_ptr.shape[1:]: fused_indices_dimension *= i - fused_data_dimension = 1 - for i in data_ptr.shape[len(indices_ptr.shape) - 1 :]: - fused_data_dimension *= i + fused_updates_dimension = 1 + for i in updates_ptr.shape[len(indices_ptr.shape) - 1 :]: + fused_updates_dimension *= i + + fused_shape = 1 + for i in data_ptr.shape: + fused_shape *= i + + with ib.for_range(0, fused_shape) as i: + out[i] = data[i] - with ib.for_range(0, fused_indices_dimension, name="i") as i: - with ib.for_range(0, fused_data_dimension, name="j") as j: - offset = fused_data_dimension + with ib.for_range(0, fused_indices_dimension) as i: + with ib.for_range(0, fused_updates_dimension, kind="parallel") as j: + offset = fused_updates_dimension index = j # This is x_M, .. x_{N-1} part of the index into out. # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part # of the index into out. for l in reversed(range(indices_ptr.shape[0].value)): # indices[i * l * fused_indices_dimension] = indices[l, y_0, ... y_{k-1}] index += offset * indices[i + l * fused_indices_dimension] - ib.emit( - AssertStmt( - indices[i + l * fused_indices_dimension] < shape[l], - StringImm("index out of bounds"), - Evaluate(0), - ) - ) - offset *= shape[l] - if mode == "add": - out[index] += updates[i * fused_data_dimension + j] - elif mode == "update": - out[index] = updates[i * fused_data_dimension + j] + offset *= data_ptr.shape[l] + if mode == "update": + out[index] = updates[i * fused_updates_dimension + j] + elif mode == "add": + out[index] += updates[i * fused_updates_dimension + j] else: raise NotImplementedError("scatter_nd mode not in [update, add]:", mode) return ib.get() - out_buf = decl_buffer(shape, data.dtype, "out_buf") + out_buf = decl_buffer(data.shape, data.dtype, "out_buf") return extern( - [shape], + [data.shape], [data, indices, updates], lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], - name="scatter_nd_generic", - tag="scatter_nd_generic", + name="scatter_nd.generic", + tag="scatter_nd.generic", ) diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index d075090f01ea..a54b156380d0 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -40,7 +40,6 @@ from .sparse import * from .conv2d_alter_op import * from .dense_alter_op import * -from .scatter import * from .group_conv2d import * from .math_alter_op import * from .concat import * diff --git a/python/tvm/topi/x86/scatter.py b/python/tvm/topi/x86/scatter.py deleted file mode 100644 index 5eb5e6e99b6c..000000000000 --- a/python/tvm/topi/x86/scatter.py +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Scatter operators for x86""" -import tvm -from tvm import te -from ..scatter import _verify_scatter_nd_inputs - - -def scatter_nd(data, indices, updates, mode): - """Scatter elements from a n-dimension array. - - Given updates with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}), indices with shape - (M, Y_0, ..., Y_{K-1}), and output copied from data with shape (X_0, X_1, ..., X_{N-1}), - scatter_nd computes - - .. code-block:: - - output[indices[0, y_0, ..., y_{K-1}], - ..., - indices[M-1, y_0, ..., y_{K-1}], - x_M, - ..., - x_{N-1} - ] = f(output[...], updates[y_0, ..., y_{K-1}, x_M, ..., x_{N-1}]) - - where the update function f is determinted by the mode. - - Parameters - ---------- - data : tvm.te.Tensor - The source array. - - indices : tvm.te.Tensor - The indices of the values to extract. - - updates : tvm.te.Tensor - The updates to apply at the Indices - - mode : string - The update mode for the algorithm, either "update" or "add" - If update, the update values will replace the input data - If add, the update values will be added to the input data - - Returns - ------- - ret : tvm.te.Tensor - """ - _verify_scatter_nd_inputs(data, indices, updates) - - def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): - # pylint: disable=invalid-name - ib = tvm.tir.ir_builder.create() - - data = ib.buffer_ptr(data_ptr) - indices = ib.buffer_ptr(indices_ptr) - updates = ib.buffer_ptr(updates_ptr) - out = ib.buffer_ptr(out_ptr) - - # We combine all the indices dimensions but the first one into a single - # dimension so we can iterate it in single loop instead of an arbitrary - # number of loops. We do the same thing for all the update dimensions. - fused_indices_dimension = 1 - for i in indices_ptr.shape[1:]: - fused_indices_dimension *= i - - fused_updates_dimension = 1 - for i in updates_ptr.shape[len(indices_ptr.shape) - 1 :]: - fused_updates_dimension *= i - - fused_shape = 1 - for i in data_ptr.shape: - fused_shape *= i - - with ib.for_range(0, fused_shape) as i: - out[i] = data[i] - - with ib.for_range(0, fused_indices_dimension) as i: - with ib.for_range(0, fused_updates_dimension, kind="parallel") as j: - offset = fused_updates_dimension - index = j # This is x_M, .. x_{N-1} part of the index into out. - # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part - # of the index into out. - for l in reversed(range(indices_ptr.shape[0].value)): - # indices[i * l * fused_indices_dimension] = indices[l, y_0, ... 
y_{k-1}] - index += offset * indices[i + l * fused_indices_dimension] - offset *= data_ptr.shape[l] - if mode == "update": - out[index] = updates[i * fused_updates_dimension + j] - elif mode == "add": - out[index] += updates[i * fused_updates_dimension + j] - else: - raise NotImplementedError("scatter_nd mode not in [update, add]:", mode) - - return ib.get() - - out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf") - return te.extern( - [data.shape], - [data, indices, updates], - lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0]), - dtype=data.dtype, - out_buffers=[out_buf], - name="scatter_nd_x86", - tag="scatter_nd_x86", - ) diff --git a/tests/python/topi/python/test_topi_scatter.py b/tests/python/topi/python/test_topi_scatter.py index 648ef62a04ee..025e44889d63 100644 --- a/tests/python/topi/python/test_topi_scatter.py +++ b/tests/python/topi/python/test_topi_scatter.py @@ -33,10 +33,6 @@ def check_scatter_nd(data, indices, updates, out, mode="add"): lambda x, y, z: topi.cuda.scatter_nd(x, y, z, mode), topi.generic.schedule_extern, ), - "cpu": ( - lambda x, y, z: topi.x86.scatter_nd(x, y, z, mode), - topi.generic.schedule_extern, - ), } fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) tvm.topi.testing.compare_numpy_tvm( From b249b9a538ea1788113a5dcba7dd6aee762c21d8 Mon Sep 17 00:00:00 2001 From: Koke_Cacao Date: Thu, 12 Jan 2023 05:02:00 +0800 Subject: [PATCH 159/286] [Docs] Add `typing-extensions` dependency guide (#13730) Although TVM itself does not depend on `typing-extensions`, `tvmc` does, and this dependency is not mentioned in the documentation. --- docs/install/from_source.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 7a6b93705759..37ca72d80f36 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -331,6 +331,12 @@ like ``virtualenv``. pip3 install --user numpy decorator attrs + * If you want to use ``tvmc``: the TVM command line driver. + + .. code:: bash + + pip3 install --user typing-extensions + * If you want to use RPC Tracker .. code:: bash From 292d088a3989fd791e5057e4d2c20e4f84c8819e Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Wed, 11 Jan 2023 13:50:27 -0800 Subject: [PATCH 160/286] [MetaSchedule] Add pass instrument to MetaSchedule api (#13688) * [MetaSchedule] Add pass instrument to MetaSchedule api Add the `instruments` parameter from the `PassContext` API to the MetaSchedule tuning API.
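As a concrete illustration of the new hook, here is a minimal usage sketch. It assumes an existing Relay module `mod` with matching `params`, and the `PrintBeforePass` instrument below is an illustrative placeholder, not code from this patch:

# Hedged sketch: `mod` and `params` are assumed inputs, and this instrument
# is a made-up example of the PassInstrument interface being threaded through.
from tvm.ir.instrument import pass_instrument
from tvm.meta_schedule import relay_integration

@pass_instrument
class PrintBeforePass:
    """Log every pass that runs while tuning tasks are extracted."""

    def run_before_pass(self, mod, info):
        print("running pass:", info.name)

# The new keyword forwards the instruments into the PassContext that
# extract_tasks / tune_relay / compile_relay construct internally.
extracted = relay_integration.extract_tasks(
    mod, target="llvm", params=params, instruments=[PrintBeforePass()]
)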
* lint --- python/tvm/meta_schedule/relay_integration.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/python/tvm/meta_schedule/relay_integration.py b/python/tvm/meta_schedule/relay_integration.py index fbdf68d09767..41d3f9d12ebc 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -17,16 +17,18 @@ """MetaSchedule-Relay integration""" from contextlib import contextmanager from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, Set +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Union # isort: off from typing_extensions import Literal # isort: on import numpy as np # type: ignore + from tvm import nd from tvm._ffi import get_global_func from tvm.ir import IRModule, transform +from tvm.ir.instrument import PassInstrument from tvm.runtime import NDArray from tvm.target import Target @@ -127,6 +129,7 @@ def extract_tasks( runtime: Optional["relay.backend.Runtime"] = None, module_equality: str = "structural", disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, + instruments: Optional[Sequence[PassInstrument]] = None, ) -> List[ExtractedTask]: """Extract tuning tasks from a relay program. @@ -158,6 +161,8 @@ def extract_tasks( For the definition of the anchor block, see tir/analysis/analysis.py. disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]] The list of disabled passes + instruments : Optional[Sequence[PassInstrument]] + The list of pass instrument implementations. Returns ------- @@ -188,6 +193,7 @@ def extract_tasks( opt_level=opt_level, config=pass_config, disabled_pass=disabled_pass, + instruments=instruments, ): return list(_extract_task(mod, target, params, module_equality)) @@ -268,6 +274,7 @@ def tune_relay( module_equality: str = "structural", num_tuning_cores: Union[Literal["physical", "logical"], int] = "physical", disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, + instruments: Optional[Sequence[PassInstrument]] = None, ) -> Database: """Tune a Relay program. @@ -319,6 +326,8 @@ def tune_relay( The number of CPU cores to use during tuning. disabled_pass : Optional[Union[List[str], Set[str], Tuple[str]]] The list of disabled passes during tasks extraction + instruments : Optional[Sequence[PassInstrument]] + The list of pass instrument implementations. Returns ------- @@ -327,7 +336,12 @@ def tune_relay( """ tasks, task_weights = extracted_tasks_to_tune_contexts( extracted_tasks=extract_tasks( - mod, target, params, module_equality=module_equality, disabled_pass=disabled_pass + mod, + target, + params, + module_equality=module_equality, + disabled_pass=disabled_pass, + instruments=instruments, ), work_dir=work_dir, space=space, @@ -369,6 +383,7 @@ def compile_relay( executor: Optional["relay.backend.Executor"] = None, disabled_pass: Optional[Union[List[str], Set[str], Tuple[str]]] = None, runtime: Optional["relay.backend.Runtime"] = None, + instruments: Optional[Sequence[PassInstrument]] = None, ): """Compile a relay program with a MetaSchedule database. @@ -396,6 +411,8 @@ def compile_relay( The list of disabled passes runtime : Optional[relay.backend.Runtime] The runtime to use in relay.build. It is not supported by RelayVM. + instruments : Optional[Sequence[PassInstrument]] + The list of pass instrument implementations. 
Returns ------- @@ -416,6 +433,7 @@ def compile_relay( opt_level=opt_level, config=pass_config, disabled_pass=disabled_pass, + instruments=instruments, ): if backend == "graph": return relay.build( From 6fe6cd7221a53311110930c8c1591f2d7ab6bd57 Mon Sep 17 00:00:00 2001 From: Chun-I Tsai Date: Thu, 12 Jan 2023 12:28:07 +0800 Subject: [PATCH 161/286] [Relay][Frontend] Span Filling ONNX (#13767) - Set node name as the source name of span during the conversion of ONNX model. - Assign node name to a node based on op type when it is empty. - To get the reference of renamed nodes. Add a function to export the ONNX model after conversion. - Add structural_equal comparisons with and without set_span to the existing test cases. - Add span test cases for frequent conversions. - Add span test case for exporting model parameter. Co-authored-by: Joey Tsai --- python/tvm/relay/frontend/onnx.py | 133 +++++++- tests/python/frontend/onnx/test_forward.py | 376 ++++++++++++++++++++- 2 files changed, 487 insertions(+), 22 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 328b5d7bd8d7..3e4c9db2b0ff 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -57,6 +57,7 @@ shape_of, try_resolve_var_to_const, unbind, + set_span, ) __all__ = ["from_onnx"] @@ -556,6 +557,37 @@ def layer_norm(x, eps, gamma, beta): return output +def get_source_name(node, type_dict): + """A helper function to get source information of onnx nodes.""" + if node.name: + return node.name + else: + op_idx = 0 + if node.op_type in type_dict: + op_idx = type_dict[node.op_type] + 1 + type_dict[node.op_type] = op_idx + # rewrite name property in case any revisiting occurs to current node + node.name = "{}_{}".format(node.op_type, str(op_idx)) + return node.name + + +def get_source_name_from_parameter(expr, name_sep="."): + """A helper function to get source information of graph node from parameter.""" + if expr.span: + source_name = expr.span.source_name.name + # discard variable/parameter name to get span of op node + # e.g. conv2d.w -> conv2d + if isinstance(expr, _expr.Var): + postfix = f"{name_sep}{expr.name_hint}" + source_name = source_name[: -len(postfix)] + return source_name + return None + + +def make_parameter_span(source_name_list, name_sep="."): + return name_sep.join(source_name_list) + + class OnnxOpConverter(object): """A helper class for holding onnx op converters.""" @@ -2712,10 +2744,13 @@ def _impl_v9(cls, inputs, attr, params): else: dtype = get_type(dtype) - in_shape = _op.shape_of(inputs[0]) + node_source_name = get_source_name_from_parameter(inputs[0]) + # since there exists multi-comsumer for the same expression + # invoke set_span here to prevent expr-rewritten in span-filling stage + in_shape = set_span(_op.shape_of(inputs[0]), node_source_name) zeros = _op.zeros(in_shape, dtype) - dim = _op.take(in_shape, _op.const(0)) + dim = set_span(_op.take(in_shape, _op.const(0)), node_source_name) indices = _op.arange(_op.const(0), dim, dtype="int32") ones = _op.full(_op.const(1), _op.reshape(dim, (1,)), dtype=dtype) @@ -4128,7 +4163,10 @@ def cond_fn(*loop_inputs): # Get the current graph proto and create a clone for the subgraph graph_scope = GraphProto.current subgraph_scope = GraphProto( - graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params + graph_scope._shape, + graph_scope._dtype, + graph_scope._freeze_params, + graph_scope._op_type_dict, ) # Load nodes from outer graph into inner graph. 
subgraph_scope._nodes = graph_scope._nodes.copy() @@ -4159,6 +4197,11 @@ def get_var(name, val, scan=False): ] loop_vars += [get_var(body.input[i + 2].name, v) for i, v in enumerate(loop_deps)] loop_var_names = [v.name_hint for v in loop_vars] + # get span information of loop body + body_source_name = get_source_name(body, subgraph_scope._op_type_dict) + # set span to inputs of loop body + for i, v in enumerate(loop_vars): + loop_vars[i] = set_span(v, make_parameter_span([v.name_hint, body_source_name])) num_scan_outputs = len(body.output) - (1 + num_deps) @@ -4287,9 +4330,19 @@ def _impl_v1(cls, inputs, attr, params): # Create graph converters for both branches. graph_scope = GraphProto.current - then_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params) + then_graph = GraphProto( + graph_scope._shape, + graph_scope._dtype, + graph_scope._freeze_params, + graph_scope._op_type_dict, + ) then_graph._nodes = graph_scope._nodes.copy() - else_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params) + else_graph = GraphProto( + graph_scope._shape, + graph_scope._dtype, + graph_scope._freeze_params, + graph_scope._op_type_dict, + ) else_graph._nodes = graph_scope._nodes.copy() # Convert each branch to a relay expression. @@ -4386,7 +4439,10 @@ def cond_fn(*loop_inputs): # Get the current graph proto and create a clone for the subgraph graph_scope = GraphProto.current subgraph_scope = GraphProto( - graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params + graph_scope._shape, + graph_scope._dtype, + graph_scope._freeze_params, + graph_scope._op_type_dict, ) # Load nodes from outer graph into inner graph. subgraph_scope._nodes = graph_scope._nodes.copy() @@ -4440,6 +4496,12 @@ def get_var(name, val, scan=False): loop_vars += [ get_var(body.input[i].name, v) for i, v in enumerate(inputs) if i < num_state_inputs ] + # get span information of scan body + body_source_name = get_source_name(body, subgraph_scope._op_type_dict) + # set span to inputs of scan body + for i, v in enumerate(loop_vars): + loop_vars[i] = set_span(v, make_parameter_span([v.name_hint, body_source_name])) + loop_vars += scan_output_vars body_input_var_names = ["iter"] + [body.input[i].name for i in range(len(body.input))] @@ -6197,11 +6259,16 @@ class GraphProto: at compile time and helps in making models static if certain inputs represent attributes relay would traditionally consider compile-time constants. + op_type_dict: Dict[str, int] + Dictionary for span filling usage. If the name property of op was not set + op_type_dict will provide an alternative by combining literal op type with + its presenting order + """ current = None - def __init__(self, shape, dtype, freeze_params=False): + def __init__(self, shape, dtype, freeze_params=False, op_type_dict=None): self._nodes = {} self._params = {} self._inputs = {} @@ -6213,6 +6280,7 @@ def __init__(self, shape, dtype, freeze_params=False): self._dtype = dtype self.opset = None self._freeze_params = freeze_params + self._op_type_dict = op_type_dict def __enter__(self): self._old_manager = GraphProto.current @@ -6365,6 +6433,9 @@ def _construct_nodes(self, graph): for node in graph.node: op_name = node.op_type attr = self._parse_attr(node.attribute) + # Fill in span of inputs + node_source_name = get_source_name(node, self._op_type_dict) + self._set_parameter_span(node, node_source_name) # Create and populate input list. 
inputs = onnx_input() for i in node.input: @@ -6389,6 +6460,8 @@ def _construct_nodes(self, graph): else: op = _expr.TupleWrapper(fold_constant(op.astuple()), len(op)) + op = set_span(op, node_source_name) + if outputs_num > 1: # ONNX supports optional outputs for some nodes. # This block searches for missing outputs in the ONNX graph @@ -6427,6 +6500,19 @@ def _construct_nodes(self, graph): for k, i in zip(list(node_output), range(len(node_output))): self._nodes[k] = op[i] + def _set_parameter_span(self, node, node_source_name): + for i in node.input: + if i != "": + name = self._renames.get(i, i) + expr = self._nodes.get(name) + # relay.Var -> inputs / params + # relay.Constant -> freezed params / built-in constants + if isinstance(expr, (relay.Var, relay.Constant)): + expr_with_span = set_span(expr, make_parameter_span([node_source_name, name])) + self._nodes[name] = expr_with_span + if name in self._inputs: + self._inputs[name] = expr_with_span + def _parse_value_proto(self, value_proto): """Parse ValueProto or raw str.""" try: @@ -6506,8 +6592,28 @@ def _fix_outputs(self, op_name, outputs): return outputs +def export_model(location, graph): + """Convert the graph to an onnx model and export it to the location.""" + import datetime + import os + + from onnx import save, helper + + if not os.path.exists(location): + os.makedirs(location) + time_stamp = datetime.datetime.now().strftime("%m_%d_%Y_%H_%M_%S") + model = helper.make_model(graph) + save(model, os.path.join(location, "tvm_exported_model_{}.onnx".format(time_stamp))) + + def from_onnx( - model, shape=None, dtype="float32", opset=None, freeze_params=True, convert_config=None + model, + shape=None, + dtype="float32", + opset=None, + freeze_params=True, + convert_config=None, + export_node_renamed_model_path=None, ): """Convert a ONNX model into an equivalent Relay Function. @@ -6553,6 +6659,12 @@ def from_onnx( True to convert qualified onnx `matmul` to `nn.batch_matmul` strict to NT format (transpose_a=False, transpose_b=True). + export_node_renamed_model_path : str, optional + Export the node renamed onnx model to the path. + Some models do not contain names in their nodes. During the conversion, if names of nodes + are empty, new names will be assigned based on their op types. The exported model can be the + reference to spans. 
+ Returns ------- mod : tvm.IRModule @@ -6577,7 +6689,7 @@ def from_onnx( warnings.warn(str(e)) except ImportError: pass - g = GraphProto(shape, dtype, freeze_params) + g = GraphProto(shape, dtype, freeze_params, op_type_dict={}) graph = model.graph try: @@ -6607,6 +6719,9 @@ def from_onnx( with g: mod, params = g.from_onnx(graph, opset) + if export_node_renamed_model_path: + export_model(export_node_renamed_model_path, graph) + if freeze_params: mod = relay.transform.DynamicToStatic()(mod) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 09206b341dd9..c016078f8f11 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -34,7 +34,10 @@ import tvm.testing import tvm.topi.testing from tvm import relay -from tvm.contrib import graph_executor +from tvm.contrib import graph_executor, utils +from tvm.relay.frontend.common import infer_type +from tvm.relay.build_module import bind_params_by_name +from relay.utils.tag_span import _create_span, _set_span, _verify_structural_equal_with_span import onnx import onnxruntime.backend @@ -81,18 +84,31 @@ def get_tvm_output_with_vm( opset=None, freeze_params=False, convert_config=None, + validate_structural_equal=True, ): """Generic function to execute and get tvm output with vm executor""" if not isinstance(input_data, list): input_data = [input_data] _, shape_dict = get_input_data_shape_dict(graph_def, input_data) - mod, params = relay.frontend.from_onnx( - graph_def, - shape_dict, - opset=opset, - freeze_params=freeze_params, - convert_config=convert_config, - ) + + with tvm.testing.disable_span_filling(): + mod, params = relay.frontend.from_onnx( + graph_def, + shape_dict, + opset=opset, + freeze_params=freeze_params, + convert_config=convert_config, + ) + if validate_structural_equal: + with tvm.testing.enable_span_filling(): + mod_with_span, _ = relay.frontend.from_onnx( + graph_def, + shape_dict, + opset=opset, + freeze_params=freeze_params, + convert_config=convert_config, + ) + assert tvm.ir.structural_equal(mod, mod_with_span) result = relay.create_executor("vm", mod=mod, device=dev, target=target).evaluate()( *input_data, **params @@ -6667,7 +6683,13 @@ def get_random_uniform(shape, dtype="float32", high=1.0, low=0.0, seed=None): outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, shape)], ) model = helper.make_model(graph, producer_name="random_uniform_test") - return get_tvm_output_with_vm(model, [], target=target, dev=dev) + return get_tvm_output_with_vm( + model, + [], + target=target, + dev=dev, + validate_structural_equal=(seed is not None), + ) # Check that function runs and produces proper shape. vals = get_random_uniform([10], dtype="float32") @@ -6733,7 +6755,13 @@ def get_random_uniform_like(input_, shape, dtype=None, high=1.0, low=0.0, seed=N outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, shape)], ) model = helper.make_model(graph, producer_name="random_uniform_like_test") - return get_tvm_output_with_vm(model, [input_], target=target, dev=dev) + return get_tvm_output_with_vm( + model, + [input_], + target=target, + dev=dev, + validate_structural_equal=(seed is not None), + ) # Check that function runs and produces proper shape and dtype. 
shape = [10] @@ -6797,7 +6825,13 @@ def get_random_normal(shape, dtype="float32", scale=1.0, mean=0.0, seed=None): outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, shape)], ) model = helper.make_model(graph, producer_name="random_normal_test") - return get_tvm_output_with_vm(model, [], target=target, dev=dev) + return get_tvm_output_with_vm( + model, + [], + target=target, + dev=dev, + validate_structural_equal=(seed is not None), + ) # Test N-D tensor generation. vals = get_random_normal([1, 3, 100, 100], dtype="float32") @@ -6837,7 +6871,13 @@ def get_random_normal_like(input_, shape, dtype="float32", scale=1.0, mean=0.0, outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, shape)], ) model = helper.make_model(graph, producer_name="random_normal_like_test") - return get_tvm_output_with_vm(model, [input_], target=target, dev=dev) + return get_tvm_output_with_vm( + model, + [input_], + target=target, + dev=dev, + validate_structural_equal=(seed is not None), + ) # Test N-D tensor generation. shape = [1, 3, 100, 100] @@ -6875,7 +6915,13 @@ def get_multinomial(input, shape, sample_size, seed=None): outputs=[helper.make_tensor_value_info("out", OUT_DTYPE, shape)], ) model = helper.make_model(graph, producer_name="multinomial_test") - return get_tvm_output_with_vm(model, [input], target=target, dev=dev) + return get_tvm_output_with_vm( + model, + [input], + target=target, + dev=dev, + validate_structural_equal=(seed is not None), + ) # Test N-D tensor generation. shape = [3] @@ -7348,5 +7394,309 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= verify_sequence_ops((3, 3, 3, 3), 4, axis=2, new_axis=1) +def test_exporting_node_renamed_model(): + """test exproting model when export_node_renamed_model is set""" + + a_name, a_shape = "a", (4, 3) + b_name, b_shape = "b", (3, 4) + out_name, out_shape = "out", [a_shape[0], b_shape[1]] + temp_dir = utils.tempdir().path + + # model definition + mul_node = helper.make_node("MatMul", [a_name, b_name], [out_name]) + graph = helper.make_graph( + [mul_node], + "matmul_test", + inputs=[ + helper.make_tensor_value_info(a_name, TensorProto.FLOAT, a_shape), + helper.make_tensor_value_info(b_name, TensorProto.FLOAT, b_shape), + ], + outputs=[helper.make_tensor_value_info(out_name, TensorProto.FLOAT, out_shape)], + ) + model = helper.make_model(graph, producer_name="matmul_test") + + # get frontend model + shape_dict = {a_name: a_shape, b_name: b_shape} + _, _ = relay.frontend.from_onnx(model, shape_dict, export_node_renamed_model_path=temp_dir) + + exported_model_name = os.listdir(temp_dir)[0] + assert "tvm_exported_model_" in exported_model_name + + exported_model = onnx.load(os.path.join(temp_dir, exported_model_name)) + assert exported_model.graph.node[0].name == "MatMul_0" + + +class TestSetSpan: + """test structural equal between translated / hand-crafted relay IR with span tagged.""" + + def _verify(self, res_fptr, golden_fptr): + with tvm.testing.enable_span_filling(): + with_span = res_fptr() + with tvm.testing.disable_span_filling(): + without_span = res_fptr() + assert tvm.ir.structural_equal(with_span, without_span) + _verify_structural_equal_with_span(with_span, golden_fptr()) + + def test_conv2d_bias_add_span(self): + padding = [0, 0, 0, 0] + k_shape = [7, 7] + y_shape, y_name = [1, 6, 10, 10], "y" + x_shape, x_name = [1, 3, 10, 10], "x" + b_shape, b_name = [6], "b" + b_val = np.random.random(b_shape).astype(np.float32) + w_shape, w_name = [6, 3, 7, 7], "w" + w_val = 
np.random.random(w_shape).astype(np.float32) + group, strides, dilations = 1, [1, 1], [1, 1] + conv_name = "conv2d" + + def _res(): + # model definition + node = helper.make_node( + "Conv", + inputs=[x_name, w_name, b_name], + outputs=[y_name], + kernel_shape=k_shape, + strides=strides, + dilations=dilations, + group=group, + pads=padding, + name=conv_name, + ) + graph = helper.make_graph( + [node], + "conv_test", + inputs=[helper.make_tensor_value_info(x_name, TensorProto.FLOAT, x_shape)], + outputs=[helper.make_tensor_value_info(y_name, TensorProto.FLOAT, y_shape)], + initializer=[ + helper.make_tensor( + w_name, + TensorProto.FLOAT, + dims=w_shape, + vals=w_val.flatten(), + ), + helper.make_tensor( + b_name, + TensorProto.FLOAT, + dims=b_shape, + vals=b_val.flatten(), + ), + ], + ) + model = helper.make_model(graph, producer_name="conv_test") + + # get frontend model + shape_dict = {x_name: x_shape} + mod, _ = relay.frontend.from_onnx(model, shape_dict) + return mod["main"] + + def _golden(): + conv_si = conv_name + x = relay.var( + x_name, + shape=tuple(x_shape), + span=_create_span(f"{conv_si}.{x_name}"), + ) + conv_weight = relay.const( + w_val, + span=_create_span(f"{conv_si}.{w_name}"), + ) + conv_bias = relay.const( + b_val, + span=_create_span(f"{conv_si}.{b_name}"), + ) + conv_out = _set_span( + relay.nn.conv2d( + x, + conv_weight, + padding=[0] * 4, + channels=y_shape[1], + kernel_size=k_shape, + ), + conv_si, + ) + bias_out = _set_span(relay.nn.bias_add(conv_out, conv_bias), conv_si) + return infer_type(relay.Function([x], bias_out)) + + self._verify(_res, _golden) + + def test_batchnorm_span(self): + input_name, in_shape = "x", [1, 16, 10, 10] + bn_name = "bn" + output_name = "y" + scale_name = "scale" + bias_name = "b" + mean_name = "mean" + var_name = "var" + + def _res(): + # model definition + batchnorm = onnx.helper.make_node( + "BatchNormalization", + inputs=[input_name, scale_name, bias_name, mean_name, var_name], + outputs=[output_name], + name=bn_name, + ) + graph = helper.make_graph( + [batchnorm], + "batchnorm_test", + inputs=[ + helper.make_tensor_value_info(input_name, TensorProto.FLOAT, in_shape), + helper.make_tensor_value_info(scale_name, TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info(bias_name, TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info(mean_name, TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info(var_name, TensorProto.FLOAT, [in_shape[1]]), + ], + outputs=[helper.make_tensor_value_info(output_name, TensorProto.FLOAT, in_shape)], + ) + model = helper.make_model(graph, producer_name="batchnorm_test") + + # get frontend model + shape_dict = {input_name: in_shape} + mod, _ = relay.frontend.from_onnx(model, shape_dict) + return mod["main"] + + def _golden(): + bn_si = bn_name + x = relay.var( + input_name, + shape=tuple(in_shape), + span=_create_span(f"{bn_si}.{input_name}"), + ) + bn_scale = relay.var( + scale_name, + shape=(in_shape[1],), + span=_create_span(f"{bn_si}.{scale_name}"), + ) + bn_bias = relay.var( + bias_name, + shape=(in_shape[1],), + span=_create_span(f"{bn_si}.{bias_name}"), + ) + bn_rm = relay.var( + mean_name, + shape=(in_shape[1],), + span=_create_span(f"{bn_si}.{mean_name}"), + ) + bn_rv = relay.var( + var_name, + shape=(in_shape[1],), + span=_create_span(f"{bn_si}.{var_name}"), + ) + bn_out = _set_span( + relay.nn.batch_norm(x, bn_scale, bn_bias, bn_rm, bn_rv), + bn_si, + ) + bn_tuple_get_item = _set_span(relay.TupleGetItem(bn_out.tuple_value, 0), bn_si) + return infer_type( + 
relay.Function([x, bn_scale, bn_bias, bn_rm, bn_rv], bn_tuple_get_item) + ) + + self._verify(_res, _golden) + + def test_reshape_span(self): + input_shape = [2, 1, 10, 1, 10] + new_shape = [2, 1, 10, 10] + input_name = "in" + output_name = "out" + ref_name = "ref_in" + const_name = "const" + reshape_name = "reshape" + + def _res(): + # model definition + ref_array = np.array(new_shape) + ref_node = helper.make_node( + "Constant", + inputs=[], + outputs=[ref_name], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.INT32, + dims=ref_array.shape, + vals=ref_array.flatten().astype(int), + ), + name=const_name, + ) + reshape_node = helper.make_node( + "Reshape", + [input_name, ref_name], + [output_name], + name=reshape_name, + ) + graph = helper.make_graph( + [ref_node, reshape_node], + "reshape_test", + inputs=[helper.make_tensor_value_info(input_name, TensorProto.FLOAT, input_shape)], + outputs=[helper.make_tensor_value_info(output_name, TensorProto.FLOAT, new_shape)], + ) + model = helper.make_model(graph, producer_name="reshape_test") + + # get frontend model + shape_dict = {input_name: input_shape} + mod, _ = relay.frontend.from_onnx(model, shape_dict) + return mod["main"] + + def _golden(): + reshape_si = reshape_name + x = relay.var( + input_name, + shape=tuple(input_shape), + span=_create_span(f"{reshape_si}.{input_name}"), + ) + reshape_out = _set_span( + relay.reshape(x, newshape=new_shape), + reshape_si, + ) + return infer_type(relay.Function([x], reshape_out)) + + self._verify(_res, _golden) + + def test_matmul_span(self): + a_name, a_shape = "a", (4, 3) + b_name, b_shape = "b", (3, 4) + out_name, out_shape = "out", [a_shape[0], b_shape[1]] + matmul_name = "matmul" + + def _res(): + # model definition + mul_node = helper.make_node("MatMul", [a_name, b_name], [out_name], name=matmul_name) + graph = helper.make_graph( + [mul_node], + "matmul_test", + inputs=[ + helper.make_tensor_value_info(a_name, TensorProto.FLOAT, a_shape), + helper.make_tensor_value_info(b_name, TensorProto.FLOAT, b_shape), + ], + outputs=[helper.make_tensor_value_info(out_name, TensorProto.FLOAT, out_shape)], + ) + model = helper.make_model(graph, producer_name="matmul_test") + + # get frontend model + shape_dict = {a_name: a_shape, b_name: b_shape} + mod, _ = relay.frontend.from_onnx(model, shape_dict) + return mod["main"] + + def _golden(): + matmul_si = matmul_name + a = relay.var( + a_name, + shape=tuple(a_shape), + span=_create_span(f"{matmul_si}.{a_name}"), + ) + b = relay.var( + b_name, + shape=tuple(b_shape), + span=_create_span(f"{matmul_si}.{b_name}"), + ) + b_t = _set_span(relay.transpose(b, axes=[1, 0]), matmul_si) + matmul_out = _set_span( + relay.nn.dense(a, b_t, out_dtype="float32"), + matmul_si, + ) + return infer_type(relay.Function([a, b], matmul_out)) + + self._verify(_res, _golden) + + if __name__ == "__main__": tvm.testing.main() From 4557d6ba7a17161ca28061eb008654d0c024ed94 Mon Sep 17 00:00:00 2001 From: LiangW <114222082+liangW-intellif@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:40:52 +0800 Subject: [PATCH 162/286] [TOPI][OP] Support grouped conv2d_NCHWc (#13733) * [TOPI][OP] Support grouped conv2d_NCHWc * Fix CI tests --- python/tvm/relay/op/strategy/x86.py | 3 ++ python/tvm/topi/nn/conv2d.py | 36 ++++++++++++++----- .../topi/python/test_topi_conv2d_NCHWc.py | 19 ++++++++-- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index fa002737a7b0..bcc9ca4e206b 100644 --- 
a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -254,6 +254,9 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), name="group_conv2d_nhwc.generic", ) + elif _NCHWc_matcher.match(layout): # check if layout is NCHWxc + assert _OIHWio_matcher.match(kernel_layout) # check if kernel is OIHWio + return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 92b5a90e5b11..0485a17e98f5 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -388,9 +388,11 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn target = tvm.target.Target.current(allow_none=False) - oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) + oc_chunk, ic_chunk_group, kernel_height, kernel_width, kernel_ic_bn, oc_bn = get_const_tuple( + kernel.shape + ) num_filter = oc_chunk * oc_bn - groups = ic_chunk // ic_chunk_group + groups = in_channel // (ic_chunk_group * kernel_ic_bn) dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 @@ -415,26 +417,44 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou else: data_pad = data - ic = te.reduce_axis((0, in_channel), name="ic") kh = te.reduce_axis((0, kernel_height), name="kh") kw = te.reduce_axis((0, kernel_width), name="kw") idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod + if groups == 1: + ic = te.reduce_axis((0, in_channel), name="ic") + return te.compute( + oshape, + lambda n, oc_chunk, oh, ow, oc_block: te.sum( + data_pad[ + n, + idxdiv(ic, ic_bn), + oh * HSTR + kh * dilation_h, + ow * WSTR + kw * dilation_w, + idxmod(ic, ic_bn), + ].astype(out_dtype) + * kernel[oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype( + out_dtype + ), + axis=[ic, kh, kw], + ), + name="conv2d_NCHWc", + tag="conv2d_NCHWc", + ) + ic = te.reduce_axis((0, in_channel // groups), name="ic") return te.compute( oshape, - lambda n, oc_chunk, oh, ow, oc_block: te.sum( + lambda n, occ, oh, ow, oc_block: te.sum( data_pad[ n, - idxdiv(ic, ic_bn), + (occ // (oc_chunk // groups)) * (ic_chunk // groups) + idxdiv(ic, ic_bn), oh * HSTR + kh * dilation_h, ow * WSTR + kw * dilation_w, idxmod(ic, ic_bn), ].astype(out_dtype) - * kernel[oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype( - out_dtype - ), + * kernel[occ, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), axis=[ic, kh, kw], ), name="conv2d_NCHWc", diff --git a/tests/python/topi/python/test_topi_conv2d_NCHWc.py b/tests/python/topi/python/test_topi_conv2d_NCHWc.py index 2298816d373a..007f2a5c6a16 100644 --- a/tests/python/topi/python/test_topi_conv2d_NCHWc.py +++ b/tests/python/topi/python/test_topi_conv2d_NCHWc.py @@ -63,6 +63,7 @@ def verify_conv2d_NCHWc( dilation=1, add_bias=False, add_relu=False, + groups=1, dtype="float32", ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) @@ -90,7 +91,14 @@ def verify_conv2d_NCHWc( A = te.placeholder((batch, in_channel // ic_block, in_height, in_width, ic_block), name="A") W = te.placeholder( - (num_filter // oc_block, in_channel // ic_block, kernel, kernel, ic_block, 
oc_block), + ( + num_filter // oc_block, + in_channel // ic_block // groups, + kernel, + kernel, + ic_block, + oc_block, + ), name="W", ) bias = te.placeholder((num_filter // oc_block, 1, 1, oc_block), name="bias") @@ -98,10 +106,12 @@ def verify_conv2d_NCHWc( @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc") def get_ref_data(): a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) - w_np = np.random.uniform(size=(num_filter, in_channel, kernel, kernel)).astype(dtype) + w_np = np.random.uniform(size=(num_filter, in_channel // groups, kernel, kernel)).astype( + dtype + ) b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) - c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) + c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups) if add_bias: c_np += b_np if add_relu: @@ -195,6 +205,9 @@ def test_conv2d_NCHWc(): verify_conv2d_NCHWc(4, 64, 56, 64, 3, 1, 1) verify_conv2d_NCHWc(9, 64, 56, 64, 3, 1, 1) + # groups + verify_conv2d_NCHWc(1, 2048, 10, 2048, 3, 1, 1, groups=128) + # weird workloads verify_conv2d_NCHWc(2, 2, 2, 2, 2, 2, 2) verify_conv2d_NCHWc(3, 3, 3, 3, 3, 3, 3) From 412135acc8d84c8e9a801dbc4b9e12c4421484e2 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 12 Jan 2023 03:04:15 -0800 Subject: [PATCH 163/286] [CI][microTVM]Update ci_cortexm image (#13764) Update cortexm image to ci-cortexm:20230111-165944-a9c6f137d after CMSIS-NN moved to a new github location and Cortex-M image was supported for Ubuntu 20.04. --- ci/jenkins/docker-images.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index 76f8a5cf3e38..b4be9fdd109c 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -18,7 +18,7 @@ # This data file is read during when Jenkins runs job to determine docker images. [jenkins] ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea -ci_cortexm: tlcpack/ci-cortexm:20221013-060115-61c9742ea +ci_cortexm: tlcpack/ci-cortexm:20230111-165944-a9c6f137d ci_cpu: tlcpack/ci-cpu:20230110-070003-d00168ffb ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea From 39c0ef2457f6be2376e3f8d98647d212446c1fcf Mon Sep 17 00:00:00 2001 From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com> Date: Thu, 12 Jan 2023 15:06:51 +0400 Subject: [PATCH 164/286] [microNPU] Add relu6 relu_n1_to_1 test cases for Ethos-U (#13645) Tests are extended with cases with activations relu6 and relu_n1_to_1. Test cases contain conv2d operation + activation because separate activation is not offloaded to NPU. --- .../contrib/test_ethosu/test_codegen.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 13b54b988963..dc54ef071d19 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -1132,6 +1132,65 @@ def leaky_relu_func(x): ) +# conv2d + relu_n1_to_1 is used because separate activation is not offloaded to NPU. 
+def test_tflite_relu_n1_to_1(): + np.random.seed(0) + accel_type = "ethos-u55-256" + ifm_shape = (1, 55, 34, 3) + kernel_shape = (3, 2) + strides = (1, 1) + + @tf.function + def conv2d_relu_n1_to_1(x): + tf_strides = [1, strides[0], strides[1], 1] + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 3] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + op = tf.nn.conv2d( + x, + weight, + strides=tf_strides, + padding="VALID", + ) + # This specific pattern will be converted to RELU_N1_TO_1 by TFLite. + return tf.math.maximum(-1.0, tf.math.minimum(op, 1.0)) + + infra.compare_tvm_with_tflite( + conv2d_relu_n1_to_1, + [ifm_shape], + accel_type, + enable_cascader=True, + ) + + +# conv2d + relu6 is used because separate activation is not offloaded to NPU. +def test_tflite_relu6(): + np.random.seed(0) + accel_type = "ethos-u55-256" + ifm_shape = (1, 55, 34, 3) + kernel_shape = (3, 2) + strides = (1, 1) + + @tf.function + def conv2d_relu6(x): + tf_strides = [1, strides[0], strides[1], 1] + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 3] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + op = tf.nn.conv2d( + x, + weight, + strides=tf_strides, + padding="VALID", + ) + return tf.nn.relu6(op) + + infra.compare_tvm_with_tflite( + conv2d_relu6, + [ifm_shape], + accel_type, + enable_cascader=True, + ) + + @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @pytest.mark.parametrize("ifm_shape", [(1, 14), (1, 151)]) @pytest.mark.parametrize("ofm_channels", [32, 64]) From 0da41e28230ebab6ee174b2c6fc41d896cc5a360 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 12 Jan 2023 10:04:11 -0800 Subject: [PATCH 165/286] [docs] Remove empty code blocks (#13689) This removes the linter that checks that each sphinx gallery tutorial begins with the proper boilerplate, extracting that boilerplate out to sphinx gallery reset functions. The reset functions run automatically, so neither the boilerplate in the tutorials nor the associated linter is necessary.
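For background on the mechanism the diff below relies on: sphinx-gallery calls every callable listed under `reset_modules` with the signature `(gallery_conf, fname)` before (and after) each tutorial it executes, which is what lets per-tutorial setup move into `conf.py`. A minimal illustrative sketch, with a placeholder hook name rather than the patch's own `install_request_hook`:

# Hedged sketch of a sphinx-gallery reset function; `my_setup_hook` is
# illustrative. sphinx-gallery invokes it around every executed tutorial.
def my_setup_hook(gallery_conf, fname):
    print("about to execute gallery script:", fname)

sphinx_gallery_conf = {
    "reset_modules": (my_setup_hook,),
}

One detail worth knowing when reading the conf.py hunk: a single-element tuple needs a trailing comma, and a Python dict keeps only the last occurrence of a duplicated key, so a repeated "reset_modules" entry overrides any earlier one.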
--- docs/conf.py | 6 + gallery/how_to/compile_models/from_coreml.py | 5 - gallery/how_to/compile_models/from_darknet.py | 6 - gallery/how_to/compile_models/from_keras.py | 3 - gallery/how_to/compile_models/from_mxnet.py | 5 +- gallery/how_to/compile_models/from_oneflow.py | 3 - gallery/how_to/compile_models/from_onnx.py | 5 - gallery/how_to/compile_models/from_paddle.py | 5 - gallery/how_to/compile_models/from_pytorch.py | 6 - .../how_to/compile_models/from_tensorflow.py | 6 - gallery/how_to/compile_models/from_tflite.py | 6 - .../deploy_models/deploy_model_on_adreno.py | 6 - .../deploy_models/deploy_model_on_android.py | 5 - .../deploy_models/deploy_model_on_nano.py | 4 - .../deploy_models/deploy_model_on_rasp.py | 6 - .../deploy_object_detection_pytorch.py | 6 - .../deploy_models/deploy_prequantized.py | 5 - .../deploy_prequantized_tflite.py | 5 - .../how_to/deploy_models/deploy_quantized.py | 5 - gallery/how_to/deploy_models/deploy_sparse.py | 5 - .../deploy_models/deploy_ssd_gluoncv.py | 5 - .../extend_tvm/bring_your_own_datatypes.py | 5 - .../extend_tvm/low_level_custom_pass.py | 5 - gallery/how_to/extend_tvm/use_pass_infra.py | 5 - .../how_to/extend_tvm/use_pass_instrument.py | 5 - .../optimize_operators/opt_conv_cuda.py | 10 +- .../optimize_operators/opt_conv_tensorcore.py | 4 - gallery/how_to/optimize_operators/opt_gemm.py | 5 - .../tune_conv2d_layer_cuda.py | 4 - .../tune_network_arm.py | 5 - .../tune_network_cuda.py | 5 - .../tune_network_mali.py | 5 - .../tune_network_x86.py | 5 - .../tune_sparse_x86.py | 5 - .../tune_with_autotvm/tune_conv2d_cuda.py | 4 - .../tune_with_autotvm/tune_relay_arm.py | 5 - .../tune_with_autotvm/tune_relay_cuda.py | 4 - .../tune_relay_mobile_gpu.py | 5 - .../tune_with_autotvm/tune_relay_x86.py | 5 - .../how_to/work_with_microtvm/micro_aot.py | 5 - .../work_with_microtvm/micro_autotune.py | 5 - .../how_to/work_with_microtvm/micro_ethosu.py | 5 - .../work_with_microtvm/micro_pytorch.py | 5 - .../work_with_microtvm/micro_reference_vm.py | 6 - .../how_to/work_with_microtvm/micro_tflite.py | 5 - .../work_with_pytorch/using_as_torch.py | 12 +- .../using_optimized_torch.py | 18 +- gallery/how_to/work_with_relay/build_gcn.py | 5 - .../work_with_relay/using_external_lib.py | 5 - .../using_pipeline_executor.py | 10 -- .../how_to/work_with_relay/using_relay_viz.py | 5 - .../how_to/work_with_schedules/extern_op.py | 5 - .../how_to/work_with_schedules/intrin_math.py | 7 +- .../how_to/work_with_schedules/reduction.py | 3 - gallery/how_to/work_with_schedules/scan.py | 3 - .../schedule_primitives.py | 5 - gallery/how_to/work_with_schedules/tedd.py | 5 - .../how_to/work_with_schedules/tensorize.py | 5 - .../work_with_schedules/tuple_inputs.py | 5 - gallery/tutorial/auto_scheduler_matmul_x86.py | 5 - gallery/tutorial/autotvm_matmul_x86.py | 5 - gallery/tutorial/autotvm_relay_x86.py | 5 - gallery/tutorial/cross_compilation_and_rpc.py | 5 - gallery/tutorial/install.py | 6 - gallery/tutorial/intro_topi.py | 4 - gallery/tutorial/introduction.py | 5 - gallery/tutorial/relay_quick_start.py | 10 +- gallery/tutorial/tensor_expr_get_started.py | 5 - gallery/tutorial/tensor_ir_blitz_course.py | 4 - gallery/tutorial/tvmc_command_line_driver.py | 5 - gallery/tutorial/tvmc_python.py | 5 - gallery/tutorial/uma.py | 6 - tests/lint/check_request_hook.py | 161 ------------------ tests/scripts/task_lint.sh | 3 - 74 files changed, 24 insertions(+), 533 deletions(-) delete mode 100644 tests/lint/check_request_hook.py diff --git a/docs/conf.py b/docs/conf.py index 
18c634c05d05..08fbedb8ffca 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -223,6 +223,10 @@ def rewrite_generic_admonition(match): return text +def install_request_hook(gallery_conf, fname): + testing.utils.install_request_hook(depth=3) + + INSTALL_TVM_DEV = f"""\ %%shell # Installs the latest dev build of TVM from PyPI. If you wish to build @@ -431,6 +435,7 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): "topic/vta/tutorials", ] + subsection_order = ExplicitOrder( str(p) for p in [ @@ -563,6 +568,7 @@ def force_gc(gallery_conf, fname): "expected_failing_examples": [], "reset_modules": ("matplotlib", "seaborn", force_gc), "promote_jupyter_magic": True, + "reset_modules": (install_request_hook), } autodoc_default_options = { diff --git a/gallery/how_to/compile_models/from_coreml.py b/gallery/how_to/compile_models/from_coreml.py index 4d0eea2d8d52..b54329920b8d 100644 --- a/gallery/how_to/compile_models/from_coreml.py +++ b/gallery/how_to/compile_models/from_coreml.py @@ -34,11 +34,6 @@ https://github.com/apple/coremltools """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/compile_models/from_darknet.py b/gallery/how_to/compile_models/from_darknet.py index 8397efa63b97..ef0a8583777f 100644 --- a/gallery/how_to/compile_models/from_darknet.py +++ b/gallery/how_to/compile_models/from_darknet.py @@ -32,12 +32,6 @@ """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - # numpy and matplotlib import numpy as np import matplotlib.pyplot as plt diff --git a/gallery/how_to/compile_models/from_keras.py b/gallery/how_to/compile_models/from_keras.py index ac961ca16ad0..3da674c25086 100644 --- a/gallery/how_to/compile_models/from_keras.py +++ b/gallery/how_to/compile_models/from_keras.py @@ -37,9 +37,6 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore import tvm from tvm import te diff --git a/gallery/how_to/compile_models/from_mxnet.py b/gallery/how_to/compile_models/from_mxnet.py index cfd66ecdb74c..0694d2aed081 100644 --- a/gallery/how_to/compile_models/from_mxnet.py +++ b/gallery/how_to/compile_models/from_mxnet.py @@ -33,13 +33,10 @@ https://mxnet.apache.org/versions/master/install/index.html """ +# some standard imports # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore -# some standard imports import mxnet as mx import tvm import tvm.relay as relay diff --git a/gallery/how_to/compile_models/from_oneflow.py b/gallery/how_to/compile_models/from_oneflow.py index 0925c9fe81ce..64f659316bc4 100644 --- a/gallery/how_to/compile_models/from_oneflow.py +++ b/gallery/how_to/compile_models/from_oneflow.py @@ -39,9 +39,6 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore import os, math from matplotlib import pyplot as plt diff --git a/gallery/how_to/compile_models/from_onnx.py b/gallery/how_to/compile_models/from_onnx.py index 980091d391bd..c1f9be72c54c 100644 --- a/gallery/how_to/compile_models/from_onnx.py +++ b/gallery/how_to/compile_models/from_onnx.py @@ -32,11 +32,6 @@ 
https://github.com/onnx/onnx """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import onnx import numpy as np import tvm diff --git a/gallery/how_to/compile_models/from_paddle.py b/gallery/how_to/compile_models/from_paddle.py index 199547b814a4..5e78c8c3b06c 100644 --- a/gallery/how_to/compile_models/from_paddle.py +++ b/gallery/how_to/compile_models/from_paddle.py @@ -31,11 +31,6 @@ https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tarfile import paddle import numpy as np diff --git a/gallery/how_to/compile_models/from_pytorch.py b/gallery/how_to/compile_models/from_pytorch.py index 064ed70e4645..14c264b9f4ac 100644 --- a/gallery/how_to/compile_models/from_pytorch.py +++ b/gallery/how_to/compile_models/from_pytorch.py @@ -41,12 +41,6 @@ be unstable. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - import tvm from tvm import relay diff --git a/gallery/how_to/compile_models/from_tensorflow.py b/gallery/how_to/compile_models/from_tensorflow.py index b85b9e669a20..741d98109450 100644 --- a/gallery/how_to/compile_models/from_tensorflow.py +++ b/gallery/how_to/compile_models/from_tensorflow.py @@ -29,12 +29,6 @@ Please refer to https://www.tensorflow.org/install """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - # tvm, relay import tvm from tvm import te diff --git a/gallery/how_to/compile_models/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py index a248346c2971..226e67c82e89 100644 --- a/gallery/how_to/compile_models/from_tflite.py +++ b/gallery/how_to/compile_models/from_tflite.py @@ -56,12 +56,6 @@ # Utils for downloading and extracting zip files # ---------------------------------------------- -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - import os diff --git a/gallery/how_to/deploy_models/deploy_model_on_adreno.py b/gallery/how_to/deploy_models/deploy_model_on_adreno.py index 8d25e50b56b1..c120c5339b62 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_adreno.py +++ b/gallery/how_to/deploy_models/deploy_model_on_adreno.py @@ -120,12 +120,6 @@ # ----------------- # As an example we would use classical cat image from ImageNet -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - from PIL import Image from tvm.contrib.download import download_testdata from matplotlib import pyplot as plt diff --git a/gallery/how_to/deploy_models/deploy_model_on_android.py b/gallery/how_to/deploy_models/deploy_model_on_android.py index 4bf86e2981a1..2e5d916cd6f2 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_android.py +++ b/gallery/how_to/deploy_models/deploy_model_on_android.py @@ -25,11 +25,6 @@ This is an example of using Relay to compile a keras model and deploy it on Android device. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os import numpy as np diff --git a/gallery/how_to/deploy_models/deploy_model_on_nano.py b/gallery/how_to/deploy_models/deploy_model_on_nano.py index 3d8a4a796f8c..abd0b3fab61d 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_nano.py +++ b/gallery/how_to/deploy_models/deploy_model_on_nano.py @@ -27,11 +27,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/deploy_models/deploy_model_on_rasp.py b/gallery/how_to/deploy_models/deploy_model_on_rasp.py index ab5374d93dbf..de4ed9aff074 100644 --- a/gallery/how_to/deploy_models/deploy_model_on_rasp.py +++ b/gallery/how_to/deploy_models/deploy_model_on_rasp.py @@ -26,12 +26,6 @@ it on Raspberry Pi. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - import tvm from tvm import te import tvm.relay as relay diff --git a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py index ffde042e2b88..8400e82b4215 100644 --- a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py +++ b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py @@ -40,12 +40,6 @@ be unstable. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - import tvm from tvm import relay from tvm import relay diff --git a/gallery/how_to/deploy_models/deploy_prequantized.py b/gallery/how_to/deploy_models/deploy_prequantized.py index fdb4de289d91..b93ed5e4dacb 100644 --- a/gallery/how_to/deploy_models/deploy_prequantized.py +++ b/gallery/how_to/deploy_models/deploy_prequantized.py @@ -28,11 +28,6 @@ Once loaded, we can run compiled, quantized models on any hardware TVM supports. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################# # First, necessary imports diff --git a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py index 494b4a9e219b..2d0e225dce39 100644 --- a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py +++ b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py @@ -42,11 +42,6 @@ """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ############################################################################### # Necessary imports diff --git a/gallery/how_to/deploy_models/deploy_quantized.py b/gallery/how_to/deploy_models/deploy_quantized.py index 24c7ce3331f5..f1b45dd7c158 100644 --- a/gallery/how_to/deploy_models/deploy_quantized.py +++ b/gallery/how_to/deploy_models/deploy_quantized.py @@ -27,11 +27,6 @@ Relay, quantize the Relay model and then perform the inference. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te diff --git a/gallery/how_to/deploy_models/deploy_sparse.py b/gallery/how_to/deploy_models/deploy_sparse.py index b9a26e0d3053..c90a3b566e7a 100644 --- a/gallery/how_to/deploy_models/deploy_sparse.py +++ b/gallery/how_to/deploy_models/deploy_sparse.py @@ -70,11 +70,6 @@ sparse speed using fake weights to see the benefit of structured sparsity. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ############################################################################### # Load Required Modules diff --git a/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py b/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py index f39244a2eb03..af15a9337c25 100644 --- a/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py +++ b/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py @@ -24,11 +24,6 @@ We will use GluonCV pre-trained SSD model and convert it to Relay IR """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te diff --git a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py index bbd207dbac8b..f5ff89717c1a 100644 --- a/gallery/how_to/extend_tvm/bring_your_own_datatypes.py +++ b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py @@ -52,11 +52,6 @@ ctypes.CDLL('my-datatype-lib.so', ctypes.RTLD_GLOBAL) """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ###################### # A Simple TVM Program diff --git a/gallery/how_to/extend_tvm/low_level_custom_pass.py b/gallery/how_to/extend_tvm/low_level_custom_pass.py index 0f99c72cee9c..50634116ce8e 100644 --- a/gallery/how_to/extend_tvm/low_level_custom_pass.py +++ b/gallery/how_to/extend_tvm/low_level_custom_pass.py @@ -41,11 +41,6 @@ """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/extend_tvm/use_pass_infra.py b/gallery/how_to/extend_tvm/use_pass_infra.py index a41a26fc0b1e..f82cf40029d4 100644 --- a/gallery/how_to/extend_tvm/use_pass_infra.py +++ b/gallery/how_to/extend_tvm/use_pass_infra.py @@ -40,11 +40,6 @@ The same approach can be used for tir as well. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np import tvm diff --git a/gallery/how_to/extend_tvm/use_pass_instrument.py b/gallery/how_to/extend_tvm/use_pass_instrument.py index 3079e2f0e763..fd965cdf973a 100644 --- a/gallery/how_to/extend_tvm/use_pass_instrument.py +++ b/gallery/how_to/extend_tvm/use_pass_instrument.py @@ -34,11 +34,6 @@ passes. Please also refer to the :ref:`pass-infra`. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm import tvm.relay as relay from tvm.relay.testing import resnet diff --git a/gallery/how_to/optimize_operators/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py index 33e5d9855361..1ab38450f5c4 100644 --- a/gallery/how_to/optimize_operators/opt_conv_cuda.py +++ b/gallery/how_to/optimize_operators/opt_conv_cuda.py @@ -30,13 +30,6 @@ """ -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - ################################################################ # Preparation and Algorithm # ------------------------- @@ -47,6 +40,9 @@ # convolution. The following code defines the convolution algorithm in TVM. # +# sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True +# sphinx_gallery_end_ignore import numpy as np import tvm from tvm import te diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py index 5734f064f0dc..b43fac913956 100644 --- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py +++ b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py @@ -53,11 +53,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/optimize_operators/opt_gemm.py b/gallery/how_to/optimize_operators/opt_gemm.py index 249a4e26e918..7ca423281570 100644 --- a/gallery/how_to/optimize_operators/opt_gemm.py +++ b/gallery/how_to/optimize_operators/opt_gemm.py @@ -48,11 +48,6 @@ Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all the x86 CPUs. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################################ # Preparation and Baseline diff --git a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py index 7964694e68c0..ea03869c8c72 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py @@ -39,11 +39,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import os import numpy as np diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py index 09a1d0cea520..a109acba0695 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py @@ -46,11 +46,6 @@ __name__ == "__main__":` block. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np import os diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py index a430411fd9ee..670996410359 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py @@ -44,11 +44,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py index 8ac0b235d72e..e72e261e4bc2 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py @@ -44,11 +44,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py index f8caba075de3..6eb1b79bfe0a 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py @@ -45,11 +45,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np diff --git a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py index 0a2ddbd1bd81..3d810b25feb2 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py @@ -35,11 +35,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py index a73b97525f12..d7047a0afbcc 100644 --- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py @@ -50,11 +50,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import logging import sys import numpy as np diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py index a8f66d9b08a1..0cb02c036fd7 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py @@ -62,11 +62,6 @@ # # Now return to python code. Import packages. 
-# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py index 7cb6cb8dd3f9..ee0a83ab8eb8 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py @@ -61,11 +61,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import os import numpy as np diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py index d73e46448b7d..dd0a3a9837ac 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py @@ -60,11 +60,6 @@ # # Now return to python code. Import packages. -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py index 2ba597d1da19..a44c30bb89f9 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py @@ -29,11 +29,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os import numpy as np diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py index 8646b6d7ecfa..81109b2965ef 100644 --- a/gallery/how_to/work_with_microtvm/micro_aot.py +++ b/gallery/how_to/work_with_microtvm/micro_aot.py @@ -35,11 +35,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index 3dd4cab6c9af..9edb9ae75e7f 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -32,11 +32,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore # You can skip the following two sections (installing Zephyr and CMSIS-NN) if the following flag is False. # Installing Zephyr takes ~20 min. diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py index e80860dc0ce6..f257507bb5a5 100644 --- a/gallery/how_to/work_with_microtvm/micro_ethosu.py +++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py @@ -37,11 +37,6 @@ TVM to offload operators to the Ethos(TM)-U55 where possible. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # Obtaining TVM diff --git a/gallery/how_to/work_with_microtvm/micro_pytorch.py b/gallery/how_to/work_with_microtvm/micro_pytorch.py index f7f0c9209a87..370e4d7e804b 100644 --- a/gallery/how_to/work_with_microtvm/micro_pytorch.py +++ b/gallery/how_to/work_with_microtvm/micro_pytorch.py @@ -34,11 +34,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import pathlib import torch diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py index 80ab0edf8fae..3121bca353a5 100644 --- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py +++ b/gallery/how_to/work_with_microtvm/micro_reference_vm.py @@ -157,9 +157,3 @@ """ - -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index cbdf6cd6f4ca..86e5d6b4b1ae 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -30,11 +30,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import os diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py index e2351a0d7c65..59c7f88845d9 100644 --- a/gallery/how_to/work_with_pytorch/using_as_torch.py +++ b/gallery/how_to/work_with_pytorch/using_as_torch.py @@ -16,7 +16,7 @@ # under the License. """ Wrap Your TVMScript as PyTorch Module -====================== +===================================== **Author**: `Yaoda Zhou `_ @@ -32,12 +32,6 @@ """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - # Import PyTorch, as well as necessary libraries import torch import torch.nn.functional as F @@ -49,7 +43,7 @@ ###################################################################### # Write your own PyTorch operator by TVMScript -# ------------------------------- +# -------------------------------------------- # PyTorch is a very popular machine learning framework which contains # optimized implementations of most commonly used operators. # Nevertheless, sometimes you might want to write your own operators in PyTorch. 
@@ -130,7 +124,7 @@ def tvm_depthwise( ###################################################################### # Benchmark -# ------------------------------- +# --------- results = [] for i in range(5): diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py index baf80541b964..0feafad7c3c3 100644 --- a/gallery/how_to/work_with_pytorch/using_optimized_torch.py +++ b/gallery/how_to/work_with_pytorch/using_optimized_torch.py @@ -31,14 +31,10 @@ """ +# Import PyTorch # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - -# Import PyTorch import torch import torch.nn as nn import torch.nn.functional as F @@ -53,7 +49,7 @@ ###################################################################### # Define a simple module written by PyTorch -# ------------------------------ +# ----------------------------------------- class SimpleModel(nn.Module): @@ -69,7 +65,7 @@ def forward(self, x): ###################################################################### # Optimize SimpleModel by TVM MetaSchedule -# ------------------------------ +# ---------------------------------------- # We provide the `optimize_torch` function, which has the similar usage as `torch.jit.trace`. # The PyTorch model to optimize, along with its example input, are provided by users. # The PyTorch module will be tuned by TVM for the target hardware. @@ -81,7 +77,7 @@ def forward(self, x): ###################################################################### # Save/Load module -# ------------------------------ +# ---------------- # We can save and load our tuned module like the standard `nn.Module`. # Let us run our tuned module. @@ -104,7 +100,7 @@ def forward(self, x): ###################################################################### # Optimize resnet18 -# ------------------------------ +# ----------------- # In the following, we will show that our approach is able to # accelerate common models, such as resnet18. @@ -123,8 +119,8 @@ def forward(self, x): ###################################################################### -# Compare the performance between two approaches. -# ------------------------------ +# Compare the performance between two approaches +# ---------------------------------------------- results = [] for i in range(5): diff --git a/gallery/how_to/work_with_relay/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py index e6106dd95b84..16a87fb0f15f 100644 --- a/gallery/how_to/work_with_relay/build_gcn.py +++ b/gallery/how_to/work_with_relay/build_gcn.py @@ -125,11 +125,6 @@ def evaluate(data, logits): dimension of model output (Number of classes) """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore dataset = "cora" g, data = load_dataset(dataset) diff --git a/gallery/how_to/work_with_relay/using_external_lib.py b/gallery/how_to/work_with_relay/using_external_lib.py index c018ee13c724..38f5b2d460ba 100644 --- a/gallery/how_to/work_with_relay/using_external_lib.py +++ b/gallery/how_to/work_with_relay/using_external_lib.py @@ -32,11 +32,6 @@ To begin with, we import Relay and TVM. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py index 87516d656d70..8f6136865607 100755 --- a/gallery/how_to/work_with_relay/using_pipeline_executor.py +++ b/gallery/how_to/work_with_relay/using_pipeline_executor.py @@ -107,11 +107,6 @@ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ######################################### # Build the subgraph with cutlass target. @@ -188,11 +183,6 @@ def cutlass_build(mod, target, params=None, target_host=None, mod_name="default" |mod0.output(0)-> mod1.data_n_0 """ -# sphinx_gallery_start_ignore -from tvm import testing - -# testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ############################## # Build the pipeline executor. # ---------------------------- diff --git a/gallery/how_to/work_with_relay/using_relay_viz.py b/gallery/how_to/work_with_relay/using_relay_viz.py index ae22fe20e1f2..ce874ca48508 100644 --- a/gallery/how_to/work_with_relay/using_relay_viz.py +++ b/gallery/how_to/work_with_relay/using_relay_viz.py @@ -43,11 +43,6 @@ For more details, please refer to :py:mod:`tvm.contrib.relay_viz`. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore from typing import ( Dict, Union, diff --git a/gallery/how_to/work_with_schedules/extern_op.py b/gallery/how_to/work_with_schedules/extern_op.py index ad741a08d54c..9026eb016c56 100644 --- a/gallery/how_to/work_with_schedules/extern_op.py +++ b/gallery/how_to/work_with_schedules/extern_op.py @@ -32,11 +32,6 @@ from __future__ import absolute_import, print_function -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_schedules/intrin_math.py b/gallery/how_to/work_with_schedules/intrin_math.py index 5a8732abd776..5a35ae1cbd8e 100644 --- a/gallery/how_to/work_with_schedules/intrin_math.py +++ b/gallery/how_to/work_with_schedules/intrin_math.py @@ -30,12 +30,7 @@ """ from __future__ import absolute_import, print_function - -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignoreimport numpy as np +import numpy as np import tvm from tvm import te diff --git a/gallery/how_to/work_with_schedules/reduction.py b/gallery/how_to/work_with_schedules/reduction.py index c084c45d3839..72c8d691a9e0 100644 --- a/gallery/how_to/work_with_schedules/reduction.py +++ b/gallery/how_to/work_with_schedules/reduction.py @@ -30,9 +30,6 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore import tvm import tvm.testing diff --git a/gallery/how_to/work_with_schedules/scan.py b/gallery/how_to/work_with_schedules/scan.py index d523d5b9959d..4c5ce94e0121 100644 --- a/gallery/how_to/work_with_schedules/scan.py +++ b/gallery/how_to/work_with_schedules/scan.py @@ -27,9 +27,6 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True 
-from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore import tvm import tvm.testing diff --git a/gallery/how_to/work_with_schedules/schedule_primitives.py b/gallery/how_to/work_with_schedules/schedule_primitives.py index af67ed1527a0..a5c542df548b 100644 --- a/gallery/how_to/work_with_schedules/schedule_primitives.py +++ b/gallery/how_to/work_with_schedules/schedule_primitives.py @@ -29,11 +29,6 @@ from __future__ import absolute_import, print_function -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/how_to/work_with_schedules/tedd.py b/gallery/how_to/work_with_schedules/tedd.py index 7cb24f433587..7d7f8f149002 100644 --- a/gallery/how_to/work_with_schedules/tedd.py +++ b/gallery/how_to/work_with_schedules/tedd.py @@ -38,11 +38,6 @@ """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te from tvm import topi diff --git a/gallery/how_to/work_with_schedules/tensorize.py b/gallery/how_to/work_with_schedules/tensorize.py index 45eaf349f37b..63ba8299033c 100644 --- a/gallery/how_to/work_with_schedules/tensorize.py +++ b/gallery/how_to/work_with_schedules/tensorize.py @@ -35,11 +35,6 @@ from __future__ import absolute_import, print_function -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import tvm.testing diff --git a/gallery/how_to/work_with_schedules/tuple_inputs.py b/gallery/how_to/work_with_schedules/tuple_inputs.py index 86ec8b2d196b..edf82ddca75b 100644 --- a/gallery/how_to/work_with_schedules/tuple_inputs.py +++ b/gallery/how_to/work_with_schedules/tuple_inputs.py @@ -28,11 +28,6 @@ from __future__ import absolute_import, print_function -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import tvm from tvm import te import numpy as np diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py index 98fd95c33878..14f8040bf851 100644 --- a/gallery/tutorial/auto_scheduler_matmul_x86.py +++ b/gallery/tutorial/auto_scheduler_matmul_x86.py @@ -38,11 +38,6 @@ __name__ == "__main__":` block. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np import tvm diff --git a/gallery/tutorial/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py index f074c454bde4..a2e355c8ca8f 100644 --- a/gallery/tutorial/autotvm_matmul_x86.py +++ b/gallery/tutorial/autotvm_matmul_x86.py @@ -64,11 +64,6 @@ # # Now return to python code. Begin by importing the required packages. -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import logging import sys diff --git a/gallery/tutorial/autotvm_relay_x86.py b/gallery/tutorial/autotvm_relay_x86.py index b7dfbe28f462..b7e9cebb5d6a 100644 --- a/gallery/tutorial/autotvm_relay_x86.py +++ b/gallery/tutorial/autotvm_relay_x86.py @@ -42,11 +42,6 @@ how to use them through the Python API. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # TVM is a deep learning compiler framework, with a number of different modules diff --git a/gallery/tutorial/cross_compilation_and_rpc.py b/gallery/tutorial/cross_compilation_and_rpc.py index feab28fa11c1..c7e302693de7 100644 --- a/gallery/tutorial/cross_compilation_and_rpc.py +++ b/gallery/tutorial/cross_compilation_and_rpc.py @@ -93,11 +93,6 @@ # # Here we will declare a simple kernel on the local machine: -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore import numpy as np diff --git a/gallery/tutorial/install.py b/gallery/tutorial/install.py index b864dbfa85f4..0eb3ccc94c06 100644 --- a/gallery/tutorial/install.py +++ b/gallery/tutorial/install.py @@ -48,9 +48,3 @@ # Check out `TLCPack `_ to learn more. Note that the # third party binary packages could contain additional licensing terms for # the hardware drivers that are bundled with it. - -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore diff --git a/gallery/tutorial/intro_topi.py b/gallery/tutorial/intro_topi.py index f2a4db608646..cfebc36b8128 100644 --- a/gallery/tutorial/intro_topi.py +++ b/gallery/tutorial/intro_topi.py @@ -28,11 +28,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import tvm import tvm.testing from tvm import te diff --git a/gallery/tutorial/introduction.py b/gallery/tutorial/introduction.py index 908a8e52c751..8d1f0e2699b2 100644 --- a/gallery/tutorial/introduction.py +++ b/gallery/tutorial/introduction.py @@ -45,11 +45,6 @@ #. :doc:`Compiling Deep Learning Models for GPUs ` """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # An Overview of TVM and Model Optimization diff --git a/gallery/tutorial/relay_quick_start.py b/gallery/tutorial/relay_quick_start.py index e59f0107f943..0cbe35b3e075 100644 --- a/gallery/tutorial/relay_quick_start.py +++ b/gallery/tutorial/relay_quick_start.py @@ -26,13 +26,6 @@ Notice that you need to build TVM with cuda and llvm enabled. """ -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - ###################################################################### # Overview for Supported Hardware Backend of TVM # ---------------------------------------------- @@ -44,6 +37,9 @@ # In this tutorial, we'll choose cuda and llvm as target backends. # To begin with, let's import Relay and TVM. +# sphinx_gallery_start_ignore +# sphinx_gallery_requires_cuda = True +# sphinx_gallery_end_ignore import numpy as np from tvm import relay diff --git a/gallery/tutorial/tensor_expr_get_started.py b/gallery/tutorial/tensor_expr_get_started.py index 11186d2f1458..ba7e0c027023 100644 --- a/gallery/tutorial/tensor_expr_get_started.py +++ b/gallery/tutorial/tensor_expr_get_started.py @@ -39,11 +39,6 @@ features of TVM. 
""" -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # Example 1: Writing and Scheduling Vector Addition in TE for CPU diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py index dc75a3fb9452..346dc6154f9b 100644 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ b/gallery/tutorial/tensor_ir_blitz_course.py @@ -31,11 +31,7 @@ # sphinx_gallery_start_ignore # sphinx_gallery_requires_cuda = True -from tvm import testing - -testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore - import tvm from tvm.ir.module import IRModule from tvm.script import tir as T diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py index 3f4413e848ce..27302b721bc1 100644 --- a/gallery/tutorial/tvmc_command_line_driver.py +++ b/gallery/tutorial/tvmc_command_line_driver.py @@ -41,11 +41,6 @@ capabilities, and set the stage for understanding how TVM works. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # Using TVMC diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py index 417f8ad88747..a92c3af626f0 100644 --- a/gallery/tutorial/tvmc_python.py +++ b/gallery/tutorial/tvmc_python.py @@ -36,11 +36,6 @@ Let's start editing the python file in your favorite text editor. """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore ################################################################################ # Step 0: Imports diff --git a/gallery/tutorial/uma.py b/gallery/tutorial/uma.py index ea38813a7ace..5380aa116fcb 100644 --- a/gallery/tutorial/uma.py +++ b/gallery/tutorial/uma.py @@ -41,12 +41,6 @@ # integrated into TVM using UMA. # -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3) -# sphinx_gallery_end_ignore - ###################################################################### # Vanilla diff --git a/tests/lint/check_request_hook.py b/tests/lint/check_request_hook.py deleted file mode 100644 index 925af5597c12..000000000000 --- a/tests/lint/check_request_hook.py +++ /dev/null @@ -1,161 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import argparse -import fnmatch -import re -from pathlib import Path -from typing import List, Optional - - -REPO_ROOT = Path(__file__).resolve().parent.parent.parent - -EXPECTED_HOOK = """ -# sphinx_gallery_start_ignore -from tvm import testing - -testing.utils.install_request_hook(depth=3)\ -# sphinx_gallery_end_ignore -""" - -# Extra sphinx-gallery config options may be passed inside the ignore block before the hook. This -# is a workaround that can be removed once sphinx-gallery #1059 merges and the version is updated. -EXPECTED_REGEX = re.compile( - r""" -\# sphinx_gallery_start_ignore -(?:.*\n)*from tvm import testing - -testing\.utils\.install_request_hook\(depth=3\)\ -\# sphinx_gallery_end_ignore -""".rstrip(), - re.MULTILINE, -) -IGNORE_PATTERNS = ["*/micro_tvmc.py", "*/micro_train.py"] -APACHE_HEADER_LINES = 16 - - -def find_code_block_line(lines: List[str]) -> Optional[int]: - """ - This returns the index in 'lines' of the first line of code in the tutorial - or none if there are no code blocks. - """ - in_multiline_string = False - in_sphinx_directive = False - - i = 0 - lines = lines[APACHE_HEADER_LINES:] - while i < len(lines): - line = lines[i].strip() - if '"""' in line: - in_multiline_string = not in_multiline_string - elif "# sphinx_gallery_" in line: - in_sphinx_directive = not in_sphinx_directive - elif line.startswith("#") or in_sphinx_directive or in_multiline_string or line == "": - pass - else: - return i - i += 1 - - return None - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Check that all tutorials/docs override urllib.request.Request" - ) - parser.add_argument( - "--fix", action="store_true", help="Insert expected code into erroring files" - ) - args = parser.parse_args() - - gallery_files = (REPO_ROOT / "gallery").glob("**/*.py") - # gallery_files = [x for x in gallery_files if "cross_compi" in str(x)] - - errors = [] - for file in gallery_files: - skip = False - for ignored_file in IGNORE_PATTERNS: - if fnmatch.fnmatch(str(file), ignored_file): - skip = True - break - if skip: - continue - - with open(file) as f: - content = f.read() - - regex_match = EXPECTED_REGEX.search(content) - if not regex_match: - errors.append((file, None)) - continue - - line = content.count("\n", 0, regex_match.end()) + 2 - expected = find_code_block_line(content.split("\n")) - if expected is not None and line < expected: - errors.append((file, (line, expected))) - - if args.fix: - for error, line_info in errors: - with open(error) as f: - content = f.read() - - # Note: There must be a little bit of care taken here since inserting - # the block between a comment and multiline string will lead to an - # empty code block in the HTML output - if "from __future__" in content: - # Place after the last __future__ import - new_content = re.sub( - r"((?:from __future__.*?\n)+)", r"\1\n" + EXPECTED_HOOK, content, flags=re.M - ) - else: - # Place in the first codeblock - lines = content.split("\n") - position = find_code_block_line(lines) - if position is None: - new_content = "\n".join(lines) + EXPECTED_HOOK + "\n" - else: - print(position) - new_content = ( - "\n".join(lines[:position]) - + EXPECTED_HOOK - + "\n\n" - + "\n".join(lines[position:]) - ) - - with open(error, "w") as f: - f.write(new_content) - else: - # Don't fix, just check and print an error message - if len(errors) > 0: - print( - f"These {len(errors)} file(s) did not contain the expected text to " - "override urllib.request.Request, it was at the wrong position, or " - "the whitespace 
is incorrect.\n"
-                "You can run 'python3 tests/lint/check_request_hook.py --fix' to "
-                "automatically fix these errors:\n"
-                f"{EXPECTED_HOOK}\n\nFiles:"
-            )
-            for file, line_info in errors:
-                if line_info is None:
-                    print(f"{file} (missing hook)")
-                else:
-                    actual, expected = line_info
-                    print(f"{file} (misplaced hook at {actual}, expected at {expected})")
-            exit(1)
-        else:
-            print("All files successfully override urllib.request.Request")
-            exit(0)
diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh
index f71cb0f60243..83ea86ecccb8 100755
--- a/tests/scripts/task_lint.sh
+++ b/tests/scripts/task_lint.sh
@@ -40,9 +40,6 @@ function shard1 {
   echo "Checking CMake <-> LibInfo options mirroring"
   python3 tests/lint/check_cmake_options.py
 
-  echo "Checking that all sphinx-gallery docs override urllib.request.Request"
-  python3 tests/lint/check_request_hook.py
-
   echo "black check..."
   tests/lint/git-black.sh
 

From 31e3ca6485c831dc4f2cdefb58e531799f623ef1 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Thu, 12 Jan 2023 10:04:30 -0800
Subject: [PATCH 166/286] [ci][docker] Make branch names valid before using
 them as tags (#13738)

This fixes the issue encountered in #13722.

Tested in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-docker/detail/PR-13738/1/pipeline
---
 ci/jenkins/generated/docker_jenkinsfile.groovy    | 4 ++--
 ci/jenkins/templates/docker_jenkinsfile.groovy.j2 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy
index 9e1946c194e6..6735bd232152 100644
--- a/ci/jenkins/generated/docker_jenkinsfile.groovy
+++ b/ci/jenkins/generated/docker_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 //   'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-12-09T15:39:24.508775
+// Generated at 2023-01-09T12:36:17.743091
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -589,7 +589,7 @@ def build_image(image_name) {
     returnStdout: true,
     script: 'git log -1 --format=\'%h\''
   ).trim()
-  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
+  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}".replace('/', '_')
   sh(
     script: "${docker_build} ${image_name} --spec ${full_name}",
     label: 'Build docker image'
diff --git a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2 b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2
index 07ae49811337..beb9b478bafb 100644
--- a/ci/jenkins/templates/docker_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/docker_jenkinsfile.groovy.j2
@@ -66,7 +66,7 @@ def build_image(image_name) {
     returnStdout: true,
     script: 'git log -1 --format=\'%h\''
   ).trim()
-  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}"
+  def full_name = "${image_name}:${env.BRANCH_NAME}-${hash}-${env.BUILD_NUMBER}".replace('/', '_')
   sh(
     script: "${docker_build} ${image_name} --spec ${full_name}",
     label: 'Build docker image'

From a6538c9d3c1f77db4b06d15a21930ec0c302a709 Mon Sep 17 00:00:00 2001
From: Eye <380614540@qq.com>
Date: Fri, 13 Jan 2023 03:39:20 +0800
Subject: [PATCH 167/286] [ONNX,FIX] ONNX Pad operator: use the default value
 0 when `constant_value` is omitted (#13758)

* fix: ONNX Pad operator: use the default value 0 when `constant_value` is omitted

* fix format problem

* add onnx Pad unit test

* 
fix python format Co-authored-by: rqg --- python/tvm/relay/frontend/onnx.py | 2 +- tests/python/frontend/onnx/test_forward.py | 30 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 3e4c9db2b0ff..c4eb7774d756 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1974,7 +1974,7 @@ def _impl_v2(cls, inputs, attr, params): @classmethod def _impl_v11(cls, inputs, attr, params): pads = inputs[1] - if len(inputs) == 3: + if len(inputs) == 3 and inputs[2] is not None: value = fold_constant(_op.take(inputs[2], _op.const(0))) else: value = 0.0 diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index c016078f8f11..a84de82f3bab 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -7698,5 +7698,35 @@ def _golden(): self._verify(_res, _golden) +@tvm.testing.parametrize_targets +def test_pad_constant_value(target, dev): + """test_pad_constant_value""" + + def verify_pad_constant_value(constant_value): + tensor_shape = [1, 2, 257, 126] + tensor_values = [np.random.uniform(size=tensor_shape).astype("float32")] + graph_inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, tensor_shape)] + graph_outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, None)] + pads = helper.make_tensor("pads", TensorProto.INT64, [8], [0, 0, 0, 2, 0, 0, 0, 0]) + pad_node = helper.make_node( + "Pad", ["input", "pads", constant_value], ["output"], mode="constant" + ) + graph_nodes = [pad_node] + graph = helper.make_graph( + graph_nodes, + "test_pad_constant_value", + inputs=graph_inputs, + outputs=graph_outputs, + initializer=[pads], + ) + model = helper.make_model( + graph, + producer_name="test_pad_constant_value", + ) + verify_with_ort_with_inputs(model, tensor_values, target=target, dev=dev) + + verify_pad_constant_value("") + + if __name__ == "__main__": tvm.testing.main() From 18ea96fff5dc961bb6f8208bcdf4582a78131232 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Thu, 12 Jan 2023 23:36:05 +0000 Subject: [PATCH 168/286] Remove tutorials CMSIS dependency when not needed (#13762) * Remove tutorials CMSIS dependencies when not needed Change-Id: Ic844422c74855f774b97ea57c6cb6eac2236e29b * Address comments Change-Id: Idbaada0753a9d46cfd94f71fcb30c7f7a8a1d17d --- gallery/how_to/work_with_microtvm/install_cmsis.rst | 4 ++++ gallery/how_to/work_with_microtvm/micro_aot.py | 8 +------- gallery/how_to/work_with_microtvm/micro_autotune.py | 6 +----- gallery/how_to/work_with_microtvm/micro_tflite.py | 8 +------- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/gallery/how_to/work_with_microtvm/install_cmsis.rst b/gallery/how_to/work_with_microtvm/install_cmsis.rst index 2f1d2fb1189a..13286b1b54f6 100644 --- a/gallery/how_to/work_with_microtvm/install_cmsis.rst +++ b/gallery/how_to/work_with_microtvm/install_cmsis.rst @@ -33,3 +33,7 @@ Install CMSIS-NN wget ${CMSIS_URL} -O "${DOWNLOAD_PATH}" tar -xf "${DOWNLOAD_PATH}" -C ${CMSIS_PATH} --strip-components=1 rm ${DOWNLOAD_PATH} + + CMSIS_NN_TAG="v4.0.0" + CMSIS_NN_URL="https://github.com/ARM-software/CMSIS-NN.git" + git clone ${CMSIS_NN_URL} --branch ${CMSIS_NN_TAG} --single-branch ${CMSIS_PATH}/CMSIS-NN diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py index 81109b2965ef..c1b29ba5c582 100644 --- 
a/gallery/how_to/work_with_microtvm/micro_aot.py +++ b/gallery/how_to/work_with_microtvm/micro_aot.py @@ -41,7 +41,7 @@ # By default, this tutorial runs on x86 CPU using TVM's C runtime. If you would like # to run on real Zephyr hardware, you must export the `TVM_MICRO_USE_HW` environment # variable. Otherwise (if you are using the C runtime), you can skip installing -# Zephyr and CMSIS-NN. It takes ~20 minutes to install both of them. +# Zephyr. It takes ~20 minutes to install Zephyr. use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) ###################################################################### @@ -49,11 +49,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst # -###################################################################### -# -# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst -# - ###################################################################### # Import Python dependencies # ------------------------------- @@ -159,7 +154,6 @@ "board": BOARD, "serial_number": SERIAL, "config_main_stack_size": 4096, - "cmsis_path": os.getenv("CMSIS_PATH", default="/content/cmsis"), "zephyr_base": os.getenv("ZEPHYR_BASE", default="/content/zephyrproject/zephyr"), } diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index 9edb9ae75e7f..9be257a57ac5 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -33,7 +33,7 @@ # -# You can skip the following two sections (installing Zephyr and CMSIS-NN) if the following flag is False. +# You can skip the following section (installing Zephyr) if the following flag is False. # Installing Zephyr takes ~20 min. import os @@ -44,10 +44,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst # -###################################################################### -# -# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst -# ###################################################################### # Import Python dependencies diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index 86e5d6b4b1ae..0770d472c9b8 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -36,7 +36,7 @@ # By default, this tutorial runs on x86 CPU using TVM's C runtime. If you would like # to run on real Zephyr hardware, you must export the `TVM_MICRO_USE_HW` environment # variable. Otherwise (if you are using the C runtime), you can skip installing -# Zephyr and CMSIS-NN. It takes ~20 minutes to install both of them. +# Zephyr. It takes ~20 minutes to install Zephyr. use_physical_hw = bool(os.getenv("TVM_MICRO_USE_HW")) ###################################################################### @@ -44,11 +44,6 @@ # .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst # -###################################################################### -# -# .. 
include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst -# - ###################################################################### # Import Python dependencies # ------------------------------- @@ -219,7 +214,6 @@ "board": BOARD, "serial_number": SERIAL, "config_main_stack_size": 4096, - "cmsis_path": os.getenv("CMSIS_PATH", default="/content/cmsis"), "zephyr_base": os.getenv("ZEPHYR_BASE", default="/content/zephyrproject/zephyr"), } From 254e8f5aab57cf4c9bd9c4793f30794bba8cd5b3 Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Thu, 12 Jan 2023 16:12:56 -0800 Subject: [PATCH 169/286] [Target] Make `key=arm_cpu` --> `key=arm_cpu,cpu` on AArch64 (#13775) * arm cpu is cpu * init commit * fix test --- src/target/parsers/aprofile.cc | 10 ++++++---- tests/cpp/target/parsers/aprofile_test.cc | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/target/parsers/aprofile.cc b/src/target/parsers/aprofile.cc index 2fd5fe71e617..6b0712461026 100644 --- a/src/target/parsers/aprofile.cc +++ b/src/target/parsers/aprofile.cc @@ -134,15 +134,17 @@ static TargetFeatures GetFeatures(TargetJSON target) { } static Array MergeKeys(Optional> existing_keys) { - const String kExtraKey = "arm_cpu"; + const Array kExtraKeys = {"arm_cpu", "cpu"}; if (!existing_keys) { - return {kExtraKey}; + return kExtraKeys; } Array keys = existing_keys.value(); - if (std::find(keys.begin(), keys.end(), kExtraKey) == keys.end()) { - keys.push_back(kExtraKey); + for (String key : kExtraKeys) { + if (std::find(keys.begin(), keys.end(), key) == keys.end()) { + keys.push_back(key); + } } return keys; } diff --git a/tests/cpp/target/parsers/aprofile_test.cc b/tests/cpp/target/parsers/aprofile_test.cc index 0382e7a84bd7..ffbc5fd431a7 100644 --- a/tests/cpp/target/parsers/aprofile_test.cc +++ b/tests/cpp/target/parsers/aprofile_test.cc @@ -48,8 +48,9 @@ static TargetFeatures ParseTargetWithAttrs(String mcpu, String mtriple, Array keys = Downcast>(target.at("keys")); - ASSERT_EQ(keys.size(), 1); + ASSERT_EQ(keys.size(), 2); ASSERT_EQ(keys[0], "arm_cpu"); + ASSERT_EQ(keys[1], "cpu"); } TEST(AProfileParser, ParseTargetWithExistingKeys) { From b8169d600b24f61422c3450d8034143e894420a4 Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Fri, 13 Jan 2023 04:00:34 +0000 Subject: [PATCH 170/286] [CI] Update ci_minimal docker image to cross-compile TVM to aarch64 (#13776) This PR is a prerequisite to #13714 and needs to be merged before. It contains the changes to the ci_minimal docker image to support minimal cross-compilation of TVM to aarch64. 
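
As a rough sketch of the intended use (the compiler names and the
/usr/llvm-aarch64 prefix come from the install scripts in this change;
treating them as a stable interface is an assumption), TVM could then be
configured for aarch64 inside the image along these lines:

    # hypothetical configure step using the cross toolchain and the
    # cross-built LLVM that this image installs under /usr/llvm-aarch64
    cmake .. \
        -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
        -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
        -DUSE_LLVM=/usr/llvm-aarch64/bin/llvm-config

This mirrors the usual native setup, where USE_LLVM is pointed at the
llvm-config of the desired LLVM installation.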
--- docker/Dockerfile.ci_minimal | 8 +++ ...buntu1804_install_aarch64_cross_compile.sh | 51 +++++++++++++++ ...u1804_manual_install_llvm_cross_aarch64.sh | 65 +++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 docker/install/ubuntu1804_install_aarch64_cross_compile.sh create mode 100644 docker/install/ubuntu1804_manual_install_llvm_cross_aarch64.sh diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal index ec6aa78b2fc0..5ea832454a8b 100644 --- a/docker/Dockerfile.ci_minimal +++ b/docker/Dockerfile.ci_minimal @@ -28,6 +28,10 @@ RUN bash /install/ubuntu_setup_tz.sh COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +# Install libraries for cross-compiling TVM to Aarch64 +COPY install/ubuntu1804_install_aarch64_cross_compile.sh /install/ubuntu1804_install_aarch64_cross_compile.sh +RUN bash /install/ubuntu1804_install_aarch64_cross_compile.sh + COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh @@ -47,6 +51,10 @@ RUN bash /install/ubuntu_install_python_package.sh COPY install/ubuntu1804_manual_install_llvm.sh /install/ubuntu1804_manual_install_llvm.sh RUN bash /install/ubuntu1804_manual_install_llvm.sh +# Cross build LLVM to Aarch64 +COPY install/ubuntu1804_manual_install_llvm_cross_aarch64.sh /install/ubuntu1804_manual_install_llvm_cross_aarch64.sh +RUN bash /install/ubuntu1804_manual_install_llvm_cross_aarch64.sh + # Rust env (build early; takes a while) COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh RUN bash /install/ubuntu_install_rust.sh diff --git a/docker/install/ubuntu1804_install_aarch64_cross_compile.sh b/docker/install/ubuntu1804_install_aarch64_cross_compile.sh new file mode 100644 index 000000000000..faa4aa3f23e6 --- /dev/null +++ b/docker/install/ubuntu1804_install_aarch64_cross_compile.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e +set -u +# Used for debugging RVM build +set -x +set -o pipefail + +architecture_type=$(uname -i) +if [ "$architecture_type" != "aarch64" ]; then + # Install gcc and g++ for cross-compiling c++ on ubuntu + apt-get update && apt-install-and-clear -y --no-install-recommends \ + g++-aarch64-linux-gnu \ + gcc-aarch64-linux-gnu \ + + # Add Aarch64 packages to the apt sources list + echo >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic main restricted" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic-updates main restricted" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic universe" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic-updates universe" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic multiverse" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic-updates multiverse" >> /etc/apt/sources.list.d/arm64.list + echo "deb [arch=arm64] http://ports.ubuntu.com/ bionic-backports main restricted universe multiverse" >> /etc/apt/sources.list.d/arm64.list + + # Fix apt-get update by specifying the amd64 architecture in sources.list + sed -i -e 's/deb /deb [arch=amd64] /g' /etc/apt/sources.list + + # Install the required packages for cross-compiling + dpkg --add-architecture arm64 + apt-get update && apt-install-and-clear -y --no-install-recommends \ + zlib1g-dev:arm64 \ + libtinfo-dev:arm64 + +fi diff --git a/docker/install/ubuntu1804_manual_install_llvm_cross_aarch64.sh b/docker/install/ubuntu1804_manual_install_llvm_cross_aarch64.sh new file mode 100644 index 000000000000..b1ef38a4095f --- /dev/null +++ b/docker/install/ubuntu1804_manual_install_llvm_cross_aarch64.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +architecture_type=$(uname -i) +# Cross-build LLVM for aarch64 when not building natively. 
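+# The build is two-phase: first a native build of llvm-tblgen/clang-tblgen
+# (tablegen must run on the build host), then a second CMake configure that
+# cross-compiles LLVM/clang with the aarch64 gcc while pointing
+# LLVM_TABLEGEN/CLANG_TABLEGEN at the host binaries from phase one.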
+if [ "$architecture_type" != "aarch64" ]; then + git clone --depth 1 --branch release/11.x https://github.com/llvm/llvm-project.git + pushd llvm-project + + # First build clang-tblgen and llvm-tblgen + mkdir build-host + pushd build-host + cmake \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_PROJECTS="llvm;clang;clang-tools-extra" \ + ../llvm + ninja clang-tblgen llvm-tblgen + popd + + # Then cross-compile LLVM for Aarch64 + mkdir build + pushd build + CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake \ + -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr/llvm-aarch64 \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_PROJECTS="llvm;clang" \ + -DCMAKE_SYSTEM_NAME=Linux \ + -DLLVM_TABLEGEN=/llvm-project/build-host/bin/llvm-tblgen \ + -DCLANG_TABLEGEN=/llvm-project/build-host/bin/clang-tblgen \ + -DLLVM_DEFAULT_TARGET_TRIPLE=aarch64-linux-gnu \ + -DLLVM_TARGET_ARCH=AArch64 \ + -DCMAKE_CXX_FLAGS='-march=armv8-a -mtune=cortex-a72' \ + ../llvm + ninja install + popd + popd + rm -rf llvm-project + + # This is a hack. Cross-compiling LLVM with gcc will cause the llvm-config to be an Aarch64 executable. + # We need it to be x86 to be able to call it when building TVM. We just copy and use the x86 one instead. + cp /usr/bin/llvm-config /usr/llvm-aarch64/bin/llvm-config +fi From 96a1089f61e784bda285ae45a3b2b11ab3fe9114 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 12 Jan 2023 22:55:20 -0600 Subject: [PATCH 171/286] [UnitTest] Parametrized test_arith_iter_affine_map::test_padding (#13774) Parametrization helped in the debugging of https://github.com/apache/tvm/pull/13530, but is not otherwise related to that PR. --- .../unittest/test_arith_iter_affine_map.py | 160 ++++++++---------- 1 file changed, 70 insertions(+), 90 deletions(-) diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index 7ae5c58a9507..0d24b59bb45e 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -946,103 +946,83 @@ def test_free_variables(): ) -def test_padding(): +class TestPadding: x = tvm.tir.Var("x", "int32") y = tvm.tir.Var("y", "int32") fld = tvm.tir.floordiv flm = tvm.tir.floormod - # left padding only, offset divisible - sum = 64 + y - dom_map = var_dom([(y, 192)]) - assert_iter_sum_pattern( - {fld(sum, 32): (6, 2, 1), flm(sum, 32): (32, 0, 1)}, - dom_map, - check_level="bijective", - ) - - # left padding only, offset non-divisible - sum = 80 + y - dom_map = var_dom([(y, 176)]) - assert_iter_sum_pattern( - {fld(sum, 32): (6, 2, 1)}, - dom_map, - ) - assert_iter_sum_pattern( - {flm(fld(sum, 2), 16): (16, 0, 1), flm(sum, 2): (2, 0, 1)}, - dom_map, - ) - assert_iter_sum_failure({fld(sum, 32), flm(sum, 32)}, dom_map) - assert_iter_sum_failure({fld(sum, 32), fld(sum, 4)}, dom_map) - - # right padding only, offset divisible - sum = x * 32 + y * 8 - dom_map = var_dom([(x, 5), (y, 4)]) - assert_iter_sum_pattern( - {fld(sum, 16): (10, 0, 1), flm(sum, 16): (2, 0, 8)}, - dom_map, - ) - assert_iter_sum_failure({fld(sum, 5)}, dom_map) - - # right padding only, offset non-divisible - dom_map = var_dom([(x, 26)]) - assert_iter_sum_pattern( - {fld(x, 15): (2, 0, 1)}, - dom_map, - ) - assert_iter_sum_pattern( - {flm(fld(x, 3), 5): (5, 0, 1), flm(x, 3): (3, 0, 1)}, - dom_map, - ) - - # padding constants on both side - sum = x + 71 - dom_map = var_dom([(x, 45)]) - assert_iter_sum_pattern({fld(sum, 32): (2, 2, 1)}, 
dom_map) - assert_iter_sum_pattern( - {flm(fld(x, 4), 8): (8, 0, 1), flm(x, 4): (4, 0, 1)}, - dom_map, - ) - - # padding for free iteration part - sum = x * 360 + y - dom_map = var_dom([(y, 360)]) - assert_iter_sum_pattern({fld(sum, 16): (23, fld(x * 360 - flm(x, 2) * 8, 16), 1)}, dom_map) - assert_iter_sum_pattern({flm(x * 360 + y, 16): (16, 0, 1)}, dom_map) - - # multiple split with same mark offset, could - # be surjective on missing (padded // LCM) - assert_iter_sum_pattern( - { - flm(x + 10, 3): (3, 0), - flm(fld(x + 10, 3), 4): (4, 0), - flm(fld(fld(x + 10, 3), 4), 5): (5, 0), - }, - var_dom([(x, 240)]), - ) - assert_iter_sum_failure( - { - flm(x + 10, 3), - flm(fld(x + 10, 3), 4), - flm(fld(fld(x + 10, 3), 4), 5), - fld(fld(fld(x + 10, 3), 4), 5), - }, - var_dom([(x, 240)]), - ) - - # different offsets on splits - assert_iter_sum_pattern( - { - flm(x + 1, 3): (3, 0), - flm(fld(x + 10, 3) + 2, 4): (4, 0), - flm(fld(fld(x + 10, 3), 4) + 3, 5): (5, 0), - }, - var_dom([(x, 240)]), + positive_test_case = tvm.testing.parameter( + # left padding only, offset divisible + ({y: 192}, {fld(64 + y, 32): (6, 2, 1), flm(64 + y, 32): (32, 0, 1)}, "bijective"), + # left padding only, offset non-divisible + ({y: 176}, {fld(80 + y, 32): (6, 2, 1)}), + ({y: 176}, {flm(fld(80 + y, 2), 16): (16, 0, 1), flm(80 + y, 2): (2, 0, 1)}), + # right padding only, offset divisible + ({x: 5, y: 4}, {fld(x * 32 + y * 8, 16): (10, 0, 1), flm(x * 32 + y * 8, 16): (2, 0, 8)}), + # right padding only, offset non-divisible + ({x: 26}, {fld(x, 15): (2, 0, 1)}), + ({x: 26}, {flm(fld(x, 3), 5): (5, 0, 1), flm(x, 3): (3, 0, 1)}), + # padding constants on both side + ({x: 45}, {fld(x + 71, 32): (2, 2, 1)}), + ({x: 45}, {flm(fld(x, 4), 8): (8, 0, 1), flm(x, 4): (4, 0, 1)}), + # padding for free iteration part + ({y: 360}, {fld(x * 360 + y, 16): (23, fld(x * 360 - flm(x, 2) * 8, 16), 1)}), + ({y: 360}, {flm(x * 360 + y, 16): (16, 0, 1)}), + # multiple split with same mark offset, could + # be surjective on missing (padded // LCM) + ( + {x: 240}, + { + flm(x + 10, 3): (3, 0), + flm(fld(x + 10, 3), 4): (4, 0), + flm(fld(fld(x + 10, 3), 4), 5): (5, 0), + }, + ), + # different offsets on splits + ( + {x: 240}, + { + flm(x + 1, 3): (3, 0), + flm(fld(x + 10, 3) + 2, 4): (4, 0), + flm(fld(fld(x + 10, 3), 4) + 3, 5): (5, 0), + }, + ), ) - # original extent is smaller than the divident - # it is not surjective wrt to the region [0, 16) - assert_iter_sum_failure({flm(x, 16)}, var_dom([(x, 3)])) + negative_test_case = tvm.testing.parameter( + # left padding only, offset non-divisible + ({y: 176}, {fld(80 + y, 32), flm(80 + y, 32)}), + ({y: 176}, {fld(80 + y, 32), fld(80 + y, 4)}), + # right padding only, offset divisible + ({x: 5, y: 4}, {fld(x * 32 + y * 8, 5)}), + # multiple split with same mark offset, could + # be surjective on missing (padded // LCM) + ( + {x: 240}, + { + flm(x + 10, 3), + flm(fld(x + 10, 3), 4), + flm(fld(fld(x + 10, 3), 4), 5), + fld(fld(fld(x + 10, 3), 4), 5), + }, + ), + # original extent is smaller than the divident + # it is not surjective wrt to the region [0, 16) + ({x: 3}, {flm(x, 16)}), + ) + + def test_padding(self, positive_test_case): + iter_extent, mapped_iterators, *args = positive_test_case + check_level = args[0] if args else "surjective" + dom_map = {var: tvm.ir.Range(0, ext) for var, ext in iter_extent.items()} + assert_iter_sum_pattern(mapped_iterators, dom_map, check_level=check_level) + + def test_padding_error(self, negative_test_case): + iter_extent, mapped_iterators, *args = 
negative_test_case + check_level = args[0] if args else "surjective" + dom_map = {var: tvm.ir.Range(0, ext) for var, ext in iter_extent.items()} + assert_iter_sum_failure(mapped_iterators, dom_map, check_level=check_level) def test_overlapped_fuse(): From aa9ff23f761d770bbbd33d39cad42f5ef6d08adf Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 13 Jan 2023 12:56:41 +0000 Subject: [PATCH 172/286] [ETHOSN] Remove support for NPU driver 22.08 (#13763) [ETHOSN] Remove support for NPU driver 22.08 This commit removes support for NPU driver 22.08 now that tlcpack contains the latest docker image for ci_cpu that encompasses support for NPU driver 22.11. --- python/tvm/relay/op/contrib/ethosn.py | 2 +- src/runtime/contrib/ethosn/ethosn_device.cc | 80 +------------------ src/runtime/contrib/ethosn/ethosn_device.h | 6 -- src/runtime/contrib/ethosn/ethosn_runtime.cc | 12 --- src/runtime/contrib/ethosn/ethosn_runtime.h | 11 +-- .../python/contrib/test_ethosn/test_conv2d.py | 6 +- .../contrib/test_ethosn/test_leaky_relu.py | 6 +- 7 files changed, 5 insertions(+), 118 deletions(-) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index e316c0863c6c..3e10f3d60415 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -117,7 +117,7 @@ def partition_for_ethosn(mod, params=None, **opts): ret : annotated and partitioned module. """ api_version = ethosn_api_version() - supported_api_versions = ["3.2.0", "3.1.0"] + supported_api_versions = ["3.2.0"] if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions): raise ValueError( f"Driver stack version {api_version} is unsupported. " diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index d4ebec4de311..fa44ba856de2 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -42,9 +42,7 @@ #include "ethosn_driver_library/Inference.hpp" #include "ethosn_driver_library/Network.hpp" -#ifdef _ETHOSN_API_VERSION_3_2_0 #include "ethosn_driver_library/ProcMemAllocator.hpp" -#endif namespace tvm { namespace runtime { @@ -90,7 +88,6 @@ InferenceWaitStatus WaitForInference(dl::Inference* inference, int timeout) { return InferenceWaitStatus(InferenceWaitErrorCode::kSuccess); } -#ifdef _ETHOSN_API_VERSION_3_2_0 void CreateBuffers(dl::ProcMemAllocator* proc_mem_alloc, std::vector>* fm, const std::vector& tensors, const std::vector& tensor_sizes, @@ -164,78 +161,6 @@ bool Inference(tvm::runtime::TVMArgs args, dl::ProcMemAllocator* proc_mem_alloc, return true; } -#else -void CreateBuffers(std::vector>* fm, - const std::vector& tensors, const std::vector& tensor_sizes, - bool input) { - for (size_t i = 0; i < tensors.size(); i++) { - auto* data = static_cast(tensors[i]->data); - if (input) { - (*fm)[i] = std::make_shared(data, tensor_sizes[i], dl::DataFormat::NHWC); - } else { - (*fm)[i] = std::make_shared(tensor_sizes[i], dl::DataFormat::NHWC); - } - } -} - -bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, - const std::vector& input_order, const std::vector& output_order, - const std::vector& input_sizes, - const std::vector& output_sizes) { - // Unpack parameters - size_t n_inputs = input_order.size(); - size_t n_outputs = output_order.size(); - std::vector inputs(n_inputs); - for (size_t i = 0; i < n_inputs; i++) { - inputs[i] = args[input_order[i]]; - } - std::vector outputs(n_outputs); - size_t 
output_offset = n_inputs; - for (size_t i = 0; i < n_outputs; i++) { - outputs[i] = args[output_order[i] + output_offset]; - } - - // Set up input buffers - std::vector> ifm(n_inputs); - CreateBuffers(&ifm, inputs, input_sizes, true); - - // Set up output buffers - std::vector> ofm(n_outputs); - CreateBuffers(&ofm, outputs, output_sizes, false); - - // Raw pointers for the inference - dl::Buffer* ifm_raw[n_inputs]; - for (size_t i = 0; i < n_inputs; i++) { - ifm_raw[i] = ifm[i].get(); - } - dl::Buffer* ofm_raw[n_outputs]; - for (size_t i = 0; i < n_outputs; i++) { - ofm_raw[i] = ofm[i].get(); - } - - // Execute the inference. - std::unique_ptr inference( - npu->ScheduleInference(ifm_raw, n_inputs, ofm_raw, n_outputs)); - InferenceWaitStatus result = WaitForInference(inference.get(), 60); - - if (result.GetErrorCode() != InferenceWaitErrorCode::kSuccess) { - LOG(FATAL) << "An error has occured waiting for the inference of a sub-graph on the NPU: " - << result.GetErrorDescription(); - } - - for (size_t i = 0; i < n_outputs; i++) { - DLTensor* tensor = outputs[i]; - dl::Buffer* source_buffer = ofm_raw[i]; - uint8_t* dest_buffer = static_cast(tensor->data); - size_t size = source_buffer->GetSize(); - uint8_t* source_buffer_data = source_buffer->Map(); - std::copy(source_buffer_data, source_buffer_data + size, dest_buffer); - source_buffer->Unmap(); - } - - return true; -} -#endif } // namespace ethosn } // namespace runtime } // namespace tvm @@ -270,10 +195,7 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result") }); // Allow the ethos-n support code to be tested without a device -bool Inference(tvm::runtime::TVMArgs args, -#ifdef _ETHOSN_API_VERSION_3_2_0 - dl::ProcMemAllocator* /*proc_mem_alloc*/, -#endif +bool Inference(tvm::runtime::TVMArgs args, dl::ProcMemAllocator* /*proc_mem_alloc*/, dl::Network* /* npu */, const std::vector& input_order, const std::vector& output_order, const std::vector& input_sizes, const std::vector& output_sizes) { diff --git a/src/runtime/contrib/ethosn/ethosn_device.h b/src/runtime/contrib/ethosn/ethosn_device.h index a5f3d18cf9fd..862a3762f05c 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.h +++ b/src/runtime/contrib/ethosn/ethosn_device.h @@ -38,15 +38,9 @@ namespace dl = ::ethosn::driver_library; using tvm::runtime::TVMArgs; -#ifdef _ETHOSN_API_VERSION_3_2_0 bool Inference(tvm::runtime::TVMArgs args, dl::ProcMemAllocator* proc_mem_alloc, dl::Network* npu, const std::vector& input_order, const std::vector& output_order, const std::vector& input_sizes, const std::vector& output_sizes); -#else -bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, - const std::vector& input_order, const std::vector& output_order, - const std::vector& input_sizes, const std::vector& output_sizes); -#endif } // namespace ethosn } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.cc b/src/runtime/contrib/ethosn/ethosn_runtime.cc index 0b68db1a1798..be4a1bbc1590 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.cc +++ b/src/runtime/contrib/ethosn/ethosn_runtime.cc @@ -53,11 +53,9 @@ EthosnModule::EthosnModule(std::vector* cmms) { if (it.compiled_cmm != nullptr) { network_map_[it.name].compiled_cmm = std::move(it.compiled_cmm); } -#ifdef _ETHOSN_API_VERSION_3_2_0 if (it.proc_mem_alloc != nullptr) { network_map_[it.name].proc_mem_alloc = std::move(it.proc_mem_alloc); } -#endif if (it.runtime_cmm != nullptr) { network_map_[it.name].runtime_cmm = std::move(it.runtime_cmm); } @@ -72,16 +70,10 @@ PackedFunc 
EthosnModule::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (network_map_.find(name) != network_map_.end()) { return PackedFunc([sptr_to_self, this, name](TVMArgs args, TVMRetValue* rv) { -#ifdef _ETHOSN_API_VERSION_3_2_0 *rv = Inference(args, network_map_[name].proc_mem_alloc.get(), network_map_[name].runtime_cmm.get(), network_map_[name].inputs, network_map_[name].outputs, network_map_[name].input_sizes, network_map_[name].output_sizes); -#else - *rv = Inference(args, network_map_[name].runtime_cmm.get(), network_map_[name].inputs, - network_map_[name].outputs, network_map_[name].input_sizes, - network_map_[name].output_sizes); -#endif }); } else { return PackedFunc(); @@ -126,13 +118,9 @@ Module EthosnModule::LoadFromBinary(void* strm) { #if defined ETHOSN_HW // If hardware unavaiable use the mock inference functionality. If hardware is // avaiable, deserialize the compiled graph. -#ifdef _ETHOSN_API_VERSION_3_2_0 compiled.proc_mem_alloc = std::make_unique(); compiled.runtime_cmm = std::make_unique( compiled.proc_mem_alloc->CreateNetwork(cmm.c_str(), cmm.size())); -#else - compiled.runtime_cmm = std::make_unique(cmm.c_str(), cmm.size()); -#endif #endif // Read the number of inputs stream->Read(&input_size); diff --git a/src/runtime/contrib/ethosn/ethosn_runtime.h b/src/runtime/contrib/ethosn/ethosn_runtime.h index 2f8e445d97a8..57dc464ab2af 100644 --- a/src/runtime/contrib/ethosn/ethosn_runtime.h +++ b/src/runtime/contrib/ethosn/ethosn_runtime.h @@ -34,15 +34,8 @@ #include #include "ethosn_driver_library/Network.hpp" -#include "ethosn_support_library/Support.hpp" - -#if ETHOSN_SUPPORT_LIBRARY_VERSION_MAJOR == 3 && ETHOSN_SUPPORT_LIBRARY_VERSION_MINOR == 2 && \ - ETHOSN_SUPPORT_LIBRARY_VERSION_PATCH == 0 -#define _ETHOSN_API_VERSION_3_2_0 -#endif -#ifdef _ETHOSN_API_VERSION_3_2_0 #include "ethosn_driver_library/ProcMemAllocator.hpp" -#endif +#include "ethosn_support_library/Support.hpp" namespace tvm { namespace runtime { @@ -54,9 +47,7 @@ namespace dl = ::ethosn::driver_library; struct OrderedCompiledNetwork { std::unique_ptr compiled_cmm; std::unique_ptr runtime_cmm; -#ifdef _ETHOSN_API_VERSION_3_2_0 std::unique_ptr proc_mem_alloc; -#endif std::string name; std::vector inputs; std::vector outputs; diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index 851bd031b38e..e4c8b1c8da29 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -22,7 +22,6 @@ import tvm from tvm import relay -from tvm.relay.op.contrib import ethosn_api_version from tvm.testing import requires_ethosn from . import infrastructure as tei @@ -228,10 +227,7 @@ def test_conv2d_depthwise( ) ), } - if ethosn_api_version() == "3.2.0": - input_zp = np.random.randint(0, np.iinfo(dtype).max) - else: - input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + input_zp = np.random.randint(0, np.iinfo(dtype).max) input_sc = np.random.random() * 2 if qnn_per_channel: kernel_sc = tvm.nd.array( diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py index ee5f2048dbbb..baa1d34fbcaa 100644 --- a/tests/python/contrib/test_ethosn/test_leaky_relu.py +++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py @@ -22,7 +22,6 @@ import tvm from tvm import relay -from tvm.relay.op.contrib import ethosn_api_version from tvm.testing import requires_ethosn from . 
import infrastructure as tei
@@ -56,10 +55,7 @@ def test_leaky_relu(dtype, shape, alpha):
     iinfo = np.iinfo(dtype)
     zp_min = iinfo.min
     zp_max = iinfo.max
-    if ethosn_api_version() == "3.2.0":
-        input_zp = zp_min + 128
-    else:
-        input_zp = zp_min + 120
+    input_zp = zp_min + 128
     input_sc = 0.0068132
     output_zp = zp_min + 126  # values offset more than 126 can cause saturation
     output_sc = 0.0078125

From b3ca50608fffbec4dad7349ac42c537785199ddc Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Fri, 13 Jan 2023 19:36:47 -0800
Subject: [PATCH 173/286] [TVMScript] IR Fragment Printing (#13742)

This PR introduces support for TIR fragment printing. Fragment printing
makes it possible to print TIR fragments in a text format consistent with
TVMScript PrimFunc/IRModule printing.

This PR still preserves the legacy ReprPrinter format by introducing an API
`LegacyTIRPrint` for TIR PrimExpr. This method is used in AutoScheduler and
TIR CSE for full backward compatibility.
---
 include/tvm/runtime/data_type.h               |   2 +-
 include/tvm/script/printer/ir_docsifier.h     |  12 +-
 include/tvm/script/printer/printer.h          |  17 +-
 include/tvm/tir/expr.h                        |   3 +
 python/tvm/script/ir_builder/tir/ir.py        |   8 +-
 python/tvm/script/printer/__init__.py         |   1 +
 python/tvm/script/printer/default.py          |  83 +++
 python/tvm/script/printer/printer.py          |  14 +-
 src/auto_scheduler/compute_dag.cc             |  27 +-
 src/ir/expr.cc                                |  35 -
 src/ir/type.cc                                |  28 -
 .../printer/doc_printer/python_doc_printer.cc |  18 +-
 src/script/printer/ir_docsifier.cc            |  11 +-
 src/script/printer/printer.cc                 |  22 +-
 src/script/printer/tir/block.cc               |  42 +-
 src/script/printer/tir/buffer.cc              | 144 ++--
 src/script/printer/tir/expr.cc                |  94 +--
 src/script/printer/tir/for_loop.cc            |  16 +-
 src/script/printer/tir/function.cc            |  17 +-
 src/script/printer/tir/ir.cc                  |  42 +-
 src/script/printer/tir/stmt.cc                |  53 +-
 src/script/printer/tir/utils.h                |  55 +-
 src/tir/ir/buffer.cc                          |   6 -
 src/tir/ir/expr.cc                            | 351 ----------
 src/tir/ir/function.cc                        |  15 -
 src/tir/ir/index_map.cc                       |   3 +-
 src/tir/ir/legacy_printer.cc                  | 270 ++++++++
 src/tir/ir/stmt.cc                            | 403 -----------
 src/tir/transforms/common_subexpr_elim.cc     |   4 +-
 .../transforms/common_subexpr_elim_tools.cc   |   4 +-
 tests/cpp/expr_test.cc                        |   2 +-
 tests/python/driver/tvmc/test_shape_parser.py |   5 +-
 tests/python/relay/aot/test_c_device_api.py   |  33 +-
 tests/python/relay/aot/test_crt_aot.py        |  35 +-
 ...st_tvmscript_printer_python_doc_printer.py |   3 +-
 .../unittest/test_tvmscript_printer_tir.py    | 638 ++++++++++++++++++
 .../test_tvmscript_printer_underlining.py     |  12 +-
 vta/python/vta/transform.py                   |   2 +-
 38 files changed, 1425 insertions(+), 1105 deletions(-)
 create mode 100644 python/tvm/script/printer/default.py
 create mode 100644 src/tir/ir/legacy_printer.cc
 create mode 100644 tests/python/unittest/test_tvmscript_printer_tir.py

diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
index 089147798a0e..f52e95c756bc 100644
--- a/include/tvm/runtime/data_type.h
+++ b/include/tvm/runtime/data_type.h
@@ -348,7 +348,7 @@ inline std::string DLDataType2String(DLDataType t) {
 inline DLDataType String2DLDataType(std::string s) {
   DLDataType t;
   // handle void type
-  if (s.length() == 0) {
+  if (s.length() == 0 || s == "void") {
     t = DataType::Void();
     return t;
   }
diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h
index e97ddc0234b6..e426946b56fe 100644
--- a/include/tvm/script/printer/ir_docsifier.h
+++ b/include/tvm/script/printer/ir_docsifier.h
@@ -126,10 +126,6 @@ class IRDocsifierNode : public Object {
     /*!
\brief The name of the variable */ Optional name; }; - /*! - * \brief This map connects IR dispatch token to the name of identifier. - */ - Map ir_prefix; /*! * \brief The stack of frames. * \sa FrameNode @@ -152,7 +148,6 @@ class IRDocsifierNode : public Object { std::unordered_map> common_prefix; void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("ir_prefix", &ir_prefix); v->Visit("frames", &frames); v->Visit("dispatch_tokens", &dispatch_tokens); v->Visit("mod", &mod); @@ -236,11 +231,8 @@ class IRDocsifierNode : public Object { class IRDocsifier : public ObjectRef { public: using FType = IRDocsifierFunctor; - /*! - * \brief Create a IRDocsifier. - * \param ir_prefix The ir_prefix to use for this IRDocsifier. - */ - explicit IRDocsifier(Map ir_prefix); + /*! \brief Create a IRDocsifier. */ + IRDocsifier(); /*! \brief The registration table for IRDocsifier. */ TVM_DLL static FType& vtable(); diff --git a/include/tvm/script/printer/printer.h b/include/tvm/script/printer/printer.h index 31abd7d9ec89..289e838b52a8 100644 --- a/include/tvm/script/printer/printer.h +++ b/include/tvm/script/printer/printer.h @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -31,6 +32,8 @@ namespace printer { /*! \brief Default values in the TVMScript printer */ struct Default { + /*! \brief The prefix of IR nodes */ + std::unordered_map ir_prefix = {{"ir", "I"}, {"tir", "T"}}; /*! \brief Default data type of TIR buffer */ DataType buffer_dtype = DataType::Float(32); /*! \brief Default data type of integer literals */ @@ -41,28 +44,30 @@ struct Default { * T.float32/T.float64 wrapper. */ DataType float_dtype = DataType::Void(); + /*! \brief Whether or not to verbose print expressions. */ + bool verbose_expr = false; /*! \brief Returns a singleton of the configuration */ static Default* Instance(); + static std::string& Prefix(const std::string& ir) { return Instance()->ir_prefix.at(ir); } static DataType& BufferDType() { return Instance()->buffer_dtype; } static DataType& IntDType() { return Instance()->int_dtype; } static DataType& FloatDType() { return Instance()->float_dtype; } + static bool& VerboseExpr() { return Instance()->verbose_expr; } }; /*! * \brief The entry method for TVMScript printing * \param obj The object to be printed - * \param ir_prefix The prefix of IR nodes * \param indent_spaces Number of spaces used for indentation * \param print_line_numbers Whether to print line numbers * \param num_context_lines Number of context lines to print around the underlined text * \param path_to_underline Object path to be underlined * \return The TVMScript text format */ -String Script(ObjectRef obj, // - Map ir_prefix = {{"ir", "I"}, {"tir", "T"}}, // - int indent_spaces = 4, // - bool print_line_numbers = false, // - int num_context_lines = -1, // +String Script(ObjectRef obj, // + int indent_spaces = 4, // + bool print_line_numbers = false, // + int num_context_lines = -1, // Optional path_to_underline = NullOpt); /*! diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index 689b1c0a17ad..1d5e8f317a2e 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -1191,6 +1191,9 @@ class Any : public PrimExpr { TVM_DEFINE_OBJECT_REF_COW_METHOD(AnyNode); }; +/*! 
 \brief Legacy ReprPrint format for TIR */
+std::string LegacyTIRPrint(const ObjectRef& obj);
+
 /*
  * \brief Template function to convert Map to unordered_map
  * Sometimes useful for API gluing when internal uses unordered_map
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py
index 06a85fa34082..d4b280a37fa3 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1210,17 +1210,17 @@ def buffer_store(buffer: Buffer, value: PrimExpr, indices: List[Union[PrimExpr,
     )


-def prefetch(buffer: Buffer, indices: List[PrimExpr]) -> None:
+def prefetch(buffer: Buffer, bounds: List[Range]) -> None:
     """The prefetch hint for a buffer.

     Parameters
     ----------
     buffer : Buffer
         The buffer to be prefetched.
-    indices : List[PrimExpr]
-        The indices of the buffer to extract.
+    bounds : List[Range]
+        The range to be prefetched.
     """
-    return _ffi_api.Prefetch(buffer, indices)  # type: ignore[attr-defined] # pylint: disable=no-member
+    return _ffi_api.Prefetch(buffer, bounds)  # type: ignore[attr-defined] # pylint: disable=no-member


 def evaluate(value: PrimExpr) -> None:
diff --git a/python/tvm/script/printer/__init__.py b/python/tvm/script/printer/__init__.py
index 25ea619a410c..dc37ea1ff6a6 100644
--- a/python/tvm/script/printer/__init__.py
+++ b/python/tvm/script/printer/__init__.py
@@ -19,4 +19,5 @@ This package provides a set of APIs to print supported TVM IR into TVMScript
 in a roundtrippable way.
 """
+from . import default
 from .printer import script
diff --git a/python/tvm/script/printer/default.py b/python/tvm/script/printer/default.py
new file mode 100644
index 000000000000..33ca693ebf32
--- /dev/null
+++ b/python/tvm/script/printer/default.py
@@ -0,0 +1,83 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The printer configuration"""
+from typing_extensions import Literal
+
+from . import _ffi_api
+
+
+def ir_prefix(  # pylint: disable=invalid-name
+    ir: Literal["ir", "tir"],
+    prefix: str,
+) -> None:
+    """Set the prefix for the IR. If not set, the prefix for "tvm.ir" is "I", and for "tir" is "T".
+
+    Parameters
+    ----------
+    ir : str
+        The IR type, either "ir" or "tir".
+
+    prefix : str
+        The prefix to use.
+    """
+    _ffi_api.DefaultIRPrefix(ir, prefix)  # type: ignore # pylint: disable=no-member
+
+
+def buffer_dtype(dtype: str) -> None:
+    """Set the default dtype for buffer. If not set, it is "float32".
+
+    Parameters
+    ----------
+    dtype : str
+        The default dtype for buffer.
+    """
+    _ffi_api.DefaultBufferDType(dtype)  # type: ignore # pylint: disable=no-member
+
+
+def int_dtype(dtype: str) -> None:
+    """Set the default dtype for integers. If not set, it is "int32".
+
+    Parameters
+    ----------
+    dtype : str
+        The default dtype for integers.
+ """ + _ffi_api.DefaultBufferDtype(dtype) # type: ignore # pylint: disable=no-member + + +def float_dtype(dtype: str) -> None: + """Set the default dtype for buffer. If not set, there is no default, + which means every floating point numbers will be wrapped with its precise dtype. + + Parameters + ---------- + dtype : str + The default dtype for buffer. + """ + _ffi_api.DefaultFloatDtype(dtype) # type: ignore # pylint: disable=no-member + + +def verbose_expr(verbose: bool) -> None: + """Whether or not to verbose print expressions. If not, the definition of every variable in an + expression will be printed as separate statements. Otherwise, the result will be a one-liner. + + Parameters + ---------- + dtype : str + The default dtype for buffer. + """ + _ffi_api.VerboseExpr(verbose) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/script/printer/printer.py b/python/tvm/script/printer/printer.py index 120ef03f57d7..2ce6329dca08 100644 --- a/python/tvm/script/printer/printer.py +++ b/python/tvm/script/printer/printer.py @@ -15,8 +15,7 @@ # specific language governing permissions and limitations # under the License. """The printer interface""" - -from typing import Mapping, Optional +from typing import Optional from tvm.runtime.object_path import ObjectPath @@ -25,7 +24,6 @@ def script( obj, - ir_prefix: Optional[Mapping[str, str]] = None, indent_space: int = 4, print_line_number: bool = False, num_context_lines: int = -1, @@ -37,9 +35,6 @@ def script( ---------- obj : object An TVM object representing TVM IR - ir_prefix : Optional[Mapping[str, str]] - A mapping from IR type to the prefix of the script. - Default to {"ir": "I", "tir": T} indent_space : int = 4 The number of spaces to indent print_line_number : bool = False @@ -54,11 +49,6 @@ def script( script : str The TVMScript text format """ - if ir_prefix is None: - ir_prefix = { - "ir": "I", - "tir": "T", - } return _ffi_api.Script( # type: ignore # pylint: disable=no-member - obj, ir_prefix, indent_space, print_line_number, num_context_lines, path_to_underline + obj, indent_space, print_line_number, num_context_lines, path_to_underline ) diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 5500707fb9af..3a9224227680 100644 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1270,29 +1270,32 @@ String ComputeDAG::PrintDAG(bool simple_mode) const { if (pop->body.size() > 1) { ss << ".v" << k; } - if (auto preduce = pop->body[k].as()) { - ICHECK_LT(k, preduce->combiner->result.size()); - PrimExpr combiner = preduce->combiner->result[k]; + if (auto p_reduce = pop->body[k].as()) { + ICHECK_LT(k, p_reduce->combiner->result.size()); + PrimExpr combiner = p_reduce->combiner->result[k]; if (combiner->IsInstance()) { - ss << " += " << preduce->source[0] << "\n"; + ss << " += " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { - ss << " max= " << preduce->source[0] << "\n"; + ss << " max= " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { - ss << " min= " << preduce->source[0] << "\n"; + ss << " min= " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { const auto& select = combiner.as(); - ss << " select(" << select->condition << ", " << select->true_value << ", " - << select->false_value << ")= " << '(' << preduce->source[0] << ',' - << preduce->source[1] << ")\n"; + ss << " select(" << LegacyTIRPrint(select->condition) // + << ", " << 
LegacyTIRPrint(select->true_value) // + << ", " << LegacyTIRPrint(select->false_value) // + << ")= (" << LegacyTIRPrint(p_reduce->source[0]) // + << ',' << LegacyTIRPrint(p_reduce->source[1]) // + << ")\n"; } else { - ss << "reduce" << combiner << "\n"; + ss << "reduce" << LegacyTIRPrint(combiner) << "\n"; } } else { auto call = pop->body[k].as(); if (simple_mode && call) { - ss << " = " << call->op << "\n"; + ss << " = " << LegacyTIRPrint(call->op) << "\n"; } else { - ss << " = " << pop->body[k] << "\n"; + ss << " = " << LegacyTIRPrint(pop->body[k]) << "\n"; } } } diff --git a/src/ir/expr.cc b/src/ir/expr.cc index f097f8f36352..7ba99e34d519 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -106,16 +106,6 @@ TVM_REGISTER_GLOBAL("ir.IntImm").set_body_typed([](DataType dtype, int64_t value TVM_REGISTER_NODE_TYPE(IntImmNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - if (op->dtype == DataType::Int(32)) { - p->stream << op->value; - } else { - p->stream << "(" << op->dtype << ")" << op->value; - } - }); - FloatImm::FloatImm(DataType dtype, double value, Span span) { ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar."; @@ -149,25 +139,6 @@ TVM_REGISTER_GLOBAL("ir.FloatImm").set_body_typed([](DataType dtype, double valu TVM_REGISTER_NODE_TYPE(FloatImmNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - auto& stream = p->stream; - switch (op->dtype.bits()) { - case 64: - stream << op->value; - break; - case 32: - stream << op->value << 'f'; - break; - case 16: - stream << op->value << 'h'; - break; - default: - LOG(FATAL) << "Unknown float type bits=" << op->dtype.bits(); - } - }); - Range::Range(PrimExpr begin, PrimExpr end, Span span) : Range(make_object(begin, tir::is_zero(begin) ? 
end : (end - begin), span)) {} @@ -183,12 +154,6 @@ TVM_REGISTER_GLOBAL("ir.Range").set_body([](TVMArgs args, TVMRetValue* ret) { TVM_REGISTER_NODE_TYPE(RangeNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "range(min=" << op->min << ", ext=" << op->extent << ')'; - }); - GlobalVar::GlobalVar(String name_hint, Type type, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name_hint); diff --git a/src/ir/type.cc b/src/ir/type.cc index fe8e00329bbc..ee05fd03596a 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -37,12 +37,6 @@ TVM_REGISTER_GLOBAL("ir.PrimType").set_body_typed([](runtime::DataType dtype) { return PrimType(dtype); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << node->dtype; - }); - PointerType::PointerType(Type element_type, String storage_scope) { ObjectPtr n = make_object(); n->element_type = std::move(element_type); @@ -57,16 +51,6 @@ TVM_REGISTER_GLOBAL("ir.PointerType") return PointerType(element_type, storage_scope); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - if (!node->storage_scope.empty()) { - p->stream << node->storage_scope << " "; - } - p->Print(node->element_type); - p->stream << '*'; - }); - TypeVar::TypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); @@ -148,12 +132,6 @@ TVM_REGISTER_GLOBAL("ir.TupleType").set_body_typed([](Array fields) { return TupleType(fields); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "TupleTypeNode(" << node->fields << ")"; - }); - IncompleteType::IncompleteType(TypeKind kind, Span span) { auto n = make_object(); n->kind = std::move(kind); @@ -167,12 +145,6 @@ TVM_REGISTER_GLOBAL("ir.IncompleteType").set_body_typed([](int kind) { return IncompleteType(static_cast(kind)); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "IncompleteTypeNode(" << node->kind << ", " << node << ")"; - }); - RelayRefType::RelayRefType(Type value, Span span) { ObjectPtr n = make_object(); n->value = std::move(value); diff --git a/src/script/printer/doc_printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc index 6851baf63866..8634236df5c3 100644 --- a/src/script/printer/doc_printer/python_doc_printer.cc +++ b/src/script/printer/doc_printer/python_doc_printer.cc @@ -549,7 +549,16 @@ void PythonDocPrinter::PrintTypedDoc(const WhileDoc& doc) { void PythonDocPrinter::PrintTypedDoc(const ForDoc& doc) { MaybePrintCommentWithNewLine(doc); output_ << "for "; - PrintDoc(doc->lhs); + if (const auto* tuple = doc->lhs.as()) { + if (tuple->elements.size() == 1) { + PrintDoc(tuple->elements[0]); + output_ << ","; + } else { + PrintJoinedDocs(tuple->elements, ", "); + } + } else { + PrintDoc(doc->lhs); + } output_ << " in "; PrintDoc(doc->rhs); output_ << ":"; @@ -644,7 +653,12 @@ String DocToPythonScript(Doc doc, int indent_spaces, bool print_line_numbers, in PythonDocPrinter printer(options); printer.Append(doc, path_to_underline); - return printer.GetString(); + std::string result = printer.GetString(); + int 
last_space = result.size(); + while (last_space > 0 && std::isspace(result[last_space - 1])) { + last_space--; + } + return result.substr(0, last_space); } TVM_REGISTER_GLOBAL("script.printer.DocToPythonScript").set_body_typed(DocToPythonScript); diff --git a/src/script/printer/ir_docsifier.cc b/src/script/printer/ir_docsifier.cc index 8584f360312f..4c52ce890c9d 100644 --- a/src/script/printer/ir_docsifier.cc +++ b/src/script/printer/ir_docsifier.cc @@ -27,7 +27,7 @@ namespace printer { String GenerateUniqueName(std::string name_hint, std::unordered_set* defined_names) { for (char& c : name_hint) { - if (c != 'c' && !std::isalnum(c)) { + if (c != '_' && !std::isalnum(c)) { c = '_'; } } @@ -39,10 +39,10 @@ String GenerateUniqueName(std::string name_hint, std::unordered_set* def } IdDoc IRDocsifierNode::Define(const ObjectRef& obj, const Frame& frame, const String& name_hint) { + ICHECK(obj2info.find(obj) == obj2info.end()) << "Duplicated object: " << obj; String name = GenerateUniqueName(name_hint, &this->defined_names); DocCreator doc_factory = [name]() { return IdDoc(name); }; - auto result = obj2info.insert({obj, VariableInfo{std::move(doc_factory), name}}); - ICHECK(result.second) << "Duplicated object: " << obj; + obj2info.insert({obj, VariableInfo{std::move(doc_factory), name}}); IdDoc def_doc(name); frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); }); return def_doc; @@ -50,8 +50,6 @@ IdDoc IRDocsifierNode::Define(const ObjectRef& obj, const Frame& frame, const St void IRDocsifierNode::Define(const ObjectRef& obj, const Frame& frame, DocCreator doc_factory) { ICHECK(obj2info.find(obj) == obj2info.end()) << "Duplicated object: " << obj; - ICHECK(!doc_factory()->IsInstance()) - << "IRDocsifierNode::Define cannot be used for variable that's mapped to IdDoc."; obj2info.insert({obj, VariableInfo{std::move(doc_factory), NullOpt}}); frame->AddExitCallback([this, obj]() { this->RemoveVar(obj); }); } @@ -146,9 +144,8 @@ void IRDocsifierNode::SetCommonPrefix(const ObjectRef& root, this->common_prefix = std::move(visitor.common_prefix); } -IRDocsifier::IRDocsifier(Map ir_prefix) { +IRDocsifier::IRDocsifier() { auto n = make_object(); - n->ir_prefix = std::move(ir_prefix); n->dispatch_tokens.push_back(""); data_ = std::move(n); } diff --git a/src/script/printer/printer.cc b/src/script/printer/printer.cc index 47fd0b89b09e..9ebdcb1e99b3 100644 --- a/src/script/printer/printer.cc +++ b/src/script/printer/printer.cc @@ -23,13 +23,10 @@ namespace tvm { namespace script { namespace printer { -String Script(ObjectRef obj, Map ir_prefix, int indent_spaces, - bool print_line_numbers, int num_context_lines, +String Script(ObjectRef obj, int indent_spaces, bool print_line_numbers, int num_context_lines, Optional path_to_underline) { - IRDocsifier d(ir_prefix); - Doc doc = d->AsDoc(obj, ObjectPath::Root()); - return DocToPythonScript(doc, indent_spaces, print_line_numbers, num_context_lines, - path_to_underline); + return DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root()), indent_spaces, + print_line_numbers, num_context_lines, path_to_underline); } Default* Default::Instance() { @@ -38,6 +35,19 @@ Default* Default::Instance() { } TVM_REGISTER_GLOBAL("script.printer.Script").set_body_typed(Script); +TVM_REGISTER_GLOBAL("script.printer.DefaultIRPrefix") + .set_body_typed([](std::string ir, std::string prefix) { Default::Prefix(ir) = prefix; }); +TVM_REGISTER_GLOBAL("script.printer.DefaultBufferDType") + .set_body_typed([](runtime::DataType dtype) { Default::BufferDType() = dtype; 
}); +TVM_REGISTER_GLOBAL("script.printer.DefaultIntDType").set_body_typed([](runtime::DataType dtype) { + Default::IntDType() = dtype; +}); +TVM_REGISTER_GLOBAL("script.printer.DefaultFloatDType").set_body_typed([](runtime::DataType dtype) { + Default::FloatDType() = dtype; +}); +TVM_REGISTER_GLOBAL("script.printer.VerboseExpr").set_body_typed([](bool verbose_expr) { + Default::VerboseExpr() = verbose_expr; +}); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc index f6dbf616a5a3..8f008375ff87 100644 --- a/src/script/printer/tir/block.cc +++ b/src/script/printer/tir/block.cc @@ -26,14 +26,15 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // Optional opt_realize, Optional opt_realize_p) { With frame(d, block); ICHECK_EQ(opt_realize.defined(), opt_realize_p.defined()); - const tir::BlockRealizeNode* realize = opt_realize.value().get(); - const ObjectPathNode* realize_p = opt_realize_p.get(); + const tir::BlockRealizeNode* realize = + opt_realize.defined() ? opt_realize.value().get() : nullptr; + const ObjectPathNode* realize_p = opt_realize_p.defined() ? opt_realize_p.get() : nullptr; // Step 1. Handle block var and block bindings int n_vars = block->iter_vars.size(); for (int i = 0; i < n_vars; ++i) { tir::IterVar iter_var = block->iter_vars[i]; ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); - ExprDoc rhs = TIR(d)->Attr("axis"); + ExprDoc rhs = TIR("axis"); if (iter_var->iter_type == tir::IterVarType::kDataPar) { rhs = rhs->Attr("spatial"); } else if (iter_var->iter_type == tir::IterVarType::kCommReduce) { @@ -70,7 +71,7 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // if (realize) { ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool()); if (!tir::is_one(realize->predicate)) { - (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("where")->Call( + (*frame)->stmts.push_back(ExprStmtDoc(TIR("where")->Call( {d->AsDoc(realize->predicate, realize_p->Attr("predicate"))}))); } } @@ -80,18 +81,17 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // for (int i = 0, n = block->reads.size(); i < n; ++i) { reads.push_back(d->AsDoc(block->reads[i], block_p->Attr("reads")->ArrayIndex(i))); } - (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("reads")->Call(reads))); + (*frame)->stmts.push_back(ExprStmtDoc(TIR("reads")->Call(reads))); Array writes; for (int i = 0, n = block->writes.size(); i < n; ++i) { writes.push_back(d->AsDoc(block->writes[i], block_p->Attr("writes")->ArrayIndex(i))); } - (*frame)->stmts.push_back(ExprStmtDoc(TIR(d)->Attr("writes")->Call(writes))); + (*frame)->stmts.push_back(ExprStmtDoc(TIR("writes")->Call(writes))); } // Step 4. Handle block attributes if (!block->annotations.empty()) { (*frame)->stmts.push_back(ExprStmtDoc( - TIR(d) - ->Attr("block_attr") + TIR("block_attr") ->Call({d->AsDoc(block->annotations, block_p->Attr("annotations"))}))); } // Step 5. Handle `alloc_buffer` @@ -114,13 +114,19 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // tir::Stmt init = block->init.value(); With init_frame(d, init); AsDocBody(init, block_p->Attr("init"), init_frame->get(), d); - (*frame)->stmts.push_back( - ScopeDoc(NullOpt, TIR(d)->Attr("init")->Call({}), (*init_frame)->stmts)); + (*frame)->stmts.push_back(ScopeDoc(NullOpt, TIR("init")->Call({}), (*init_frame)->stmts)); } // Step 8. 
Handle block body AsDocBody(block->body, block_p->Attr("body"), frame->get(), d); - return ScopeDoc(NullOpt, TIR(d)->Attr("block")->Call({LiteralDoc::Str(block->name_hint)}), - (*frame)->stmts); + Array kwargs_keys; + Array kwargs_values; + if (!realize) { + kwargs_keys.push_back("no_realize"); + kwargs_values.push_back(LiteralDoc::Boolean(true)); + } + return ScopeDoc( + NullOpt, TIR("block")->Call({LiteralDoc::Str(block->name_hint)}, kwargs_keys, kwargs_values), + (*frame)->stmts); } TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -134,16 +140,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return PrintBlock(d, block, p, NullOpt, NullOpt); }); -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch( - "", [](tir::MatchBufferRegion stmt, ObjectPath p, IRDocsifier d) -> Doc { - Frame frame = d->frames.back(); - ExprDoc lhs = DefineBuffer(stmt->buffer, frame, d); - ExprDoc src_buffer = d->AsDoc(stmt->source, p->Attr("source")); - ExprDoc rhs = BufferDecl(stmt->buffer, "match_buffer", {src_buffer}, p->Attr("buffer"), - d->frames.back(), d); - return AssignDoc(lhs, rhs, NullOpt); - }); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc index 3e1d71af4acd..b9eef12abc77 100644 --- a/src/script/printer/tir/buffer.cc +++ b/src/script/printer/tir/buffer.cc @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -#include +#include // For `kAllocAlignment` #include "./utils.h" @@ -121,73 +121,141 @@ ExprDoc BufferCall(const ExprDoc& prefix, const Map& attrs, Arr ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array& args, const ObjectPath& p, const Frame& frame, const IRDocsifier& d) { - return BufferCall(/*prefix=*/TIR(d)->Attr(method), + return BufferCall(/*prefix=*/TIR(method), /*attrs=*/BufferAttrs(buffer, p, frame, d), /*args=*/args); } -Doc BufferIndex(const PrimExpr& index, const ObjectPath& p, const IRDocsifier& d) { - if (const auto* ramp = index.as()) { - if (const auto* stride = ramp->stride.as()) { - ExprDoc start = d->AsDoc(ramp->base, p->Attr("base")); - ExprDoc stop = d->AsDoc(ramp->base + ramp->lanes * ramp->stride, p->Attr("lanes")); - Optional step = NullOpt; - if (stride->value != 1) { - step = d->AsDoc(ramp->stride, p->Attr("stride")); +Array BufferIndices(const Array& indices, const ObjectPath& p, + const IRDocsifier& d) { + int n = indices.size(); + Array indices_doc; + indices_doc.reserve(n); + for (int i = 0; i < n; ++i) { + if (const auto* ramp = indices[i].as()) { + if (const auto* stride = ramp->stride.as()) { + ObjectPath ramp_p = p->Attr("indices")->ArrayIndex(i); + ObjectPath stride_p = ramp_p->Attr("stride"); + ExprDoc start = d->AsDoc(ramp->base, // + ramp_p->Attr("base")); + ExprDoc stop = d->AsDoc(ramp->base + ramp->lanes * ramp->stride, // + ramp_p->Attr("lanes")); + Optional step = NullOpt; + if (stride->value != 1) { + step = d->AsDoc(ramp->stride, ramp_p->Attr("stride")); + } + indices_doc.push_back(SliceDoc(start, stop, step)); + continue; } - return SliceDoc(start, stop, step); } + indices_doc.push_back(d->AsDoc(indices[i], p->Attr("indices")->ArrayIndex(i))); } - return d->AsDoc(index, p); + return indices_doc; } -ExprDoc BufferIndices(const tir::Buffer& buffer, const Array& indices, - const ObjectPath& p, const IRDocsifier& d) { - int n = indices.size(); - Array indices_doc; - 
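+// BufferSlices (below) maps a buffer region to subscript docs: a unit-extent
+// Range prints as the single index `min`, while a longer Range prints as the
+// slice `min : min + extent`.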
indices_doc.reserve(n); +Array BufferSlices(const Array& region, const ObjectPath& p, const IRDocsifier& d) { + int n = region.size(); + Array indices; + indices.reserve(n); for (int i = 0; i < n; ++i) { - indices_doc.push_back(BufferIndex(indices[i], p->Attr("indices")->ArrayIndex(i), d)); + Range range = region[i]; + ObjectPath range_p = p->ArrayIndex(i); + ExprDoc min = d->AsDoc(range->min, range_p->Attr("min")); + if (tir::is_one(range->extent)) { + indices.push_back(min); + } else { + ExprDoc max = d->AsDoc(range->min + range->extent, range_p->Attr("extent")); + indices.push_back(SliceDoc(min, max, NullOpt)); + } } - return d->AsDoc(buffer, p->Attr("buffer"))[indices_doc]; + return indices; } TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( "", [](tir::BufferRegion buffer_region, ObjectPath p, IRDocsifier d) -> Doc { ExprDoc prefix = d->AsDoc(buffer_region->buffer, p->Attr("buffer")); - p = p->Attr("region"); - Array region = buffer_region->region; - int n = region.size(); - Array indices; - indices.reserve(n); - for (int i = 0; i < n; ++i) { - Range range = region[i]; - ExprDoc min = d->AsDoc(range->min, p->ArrayIndex(i)->Attr("min")); - if (tir::is_one(range->extent)) { - indices.push_back(min); - } else { - ExprDoc max = - d->AsDoc(range->min + range->extent, p->ArrayIndex(i)->Attr("extent")); - indices.push_back(SliceDoc(min, max, NullOpt)); - } - } - return prefix[indices]; + return prefix[BufferSlices(buffer_region->region, p->Attr("region"), d)]; }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::BufferStore store, ObjectPath p, IRDocsifier d) -> Doc { - return AssignDoc(/*lhs=*/BufferIndices(store->buffer, store->indices, p, d), + ExprDoc buffer = d->AsDoc(store->buffer, p->Attr("buffer")); + return AssignDoc(/*lhs=*/buffer[BufferIndices(store->indices, p->Attr("indices"), d)], /*rhs=*/d->AsDoc(store->value, p->Attr("value")), NullOpt); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::BufferLoad load, ObjectPath p, IRDocsifier d) -> Doc { - return BufferIndices(load->buffer, load->indices, p, d); + ExprDoc buffer = d->AsDoc(load->buffer, p->Attr("buffer")); + return buffer[BufferIndices(load->indices, p->Attr("indices"), d)]; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // + .set_dispatch("", [](tir::Buffer buffer, ObjectPath p, IRDocsifier d) -> Doc { + if (!d->IsVarDefined(buffer)) { + if (Optional opt_f = FindLowestVarDef(buffer, d)) { + ExprDoc lhs = DefineBuffer(buffer, opt_f.value(), d); + ExprDoc rhs = BufferDecl(buffer, "buffer_decl", // TODO(@junrushao): name confusing + {}, p, opt_f.value(), d); + opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + } + if (Optional doc = d->GetVarDoc(buffer)) { + return doc.value(); + } + LOG(FATAL) << "IndexError: Buffer is not defined in the environment: " << buffer; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( + "", [](tir::MatchBufferRegion stmt, ObjectPath p, IRDocsifier d) -> Doc { + Frame frame = d->frames.back(); + ExprDoc lhs = DefineBuffer(stmt->buffer, frame, d); + ExprDoc src_buffer = d->AsDoc(stmt->source, p->Attr("source")); + ExprDoc rhs = BufferDecl(stmt->buffer, "match_buffer", {src_buffer}, p->Attr("buffer"), + d->frames.back(), d); + return AssignDoc(lhs, rhs, NullOpt); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::ProducerLoad load, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc prefix = IdDoc(load->producer->GetNameHint()); + return 
prefix[BufferIndices(load->indices, p->Attr("indices"), d)]; + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::ProducerStore store, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc prefix = IdDoc(store->producer->GetNameHint()); + prefix = prefix[BufferIndices(store->indices, p->Attr("indices"), d)]; + return AssignDoc(prefix, d->AsDoc(store->value, p->Attr("value")), NullOpt); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](tir::ProducerRealize stmt, ObjectPath p, IRDocsifier d) -> Doc { + ExprDoc prefix = IdDoc(stmt->producer->GetNameHint()); + prefix = prefix[BufferSlices(stmt->bounds, p->Attr("bounds"), d)]; + prefix = TIR("ProducerRealize") + ->Call({prefix, d->AsDoc(stmt->condition, p->Attr("condition"))}); + With f(d, stmt); + AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + return ScopeDoc(NullOpt, prefix, (*f)->stmts); }); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/tir/expr.cc b/src/script/printer/tir/expr.cc index f9b4eb621447..317201fa3d74 100644 --- a/src/script/printer/tir/expr.cc +++ b/src/script/printer/tir/expr.cc @@ -34,7 +34,7 @@ Doc PrintVar(const tir::Var& var, const ObjectPath& p, const IRDocsifier& d) { ExprDoc rhs = d->AsDoc(type, p->Attr("type_annotation")); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } else { - ExprDoc rhs = TIR(d)->Attr("var")->Call({LiteralDoc::DataType(var->dtype)}); + ExprDoc rhs = TIR("var")->Call({LiteralDoc::DataType(var->dtype)}); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } } @@ -57,8 +57,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::IterVar var, ObjectPath p, IRDocsifier d) -> Doc { - return TIR(d) - ->Attr("iter_var") + return TIR("iter_var") ->Call({ d->AsDoc(var->var, p->Attr("var")), d->AsDoc(var->dom, p->Attr("dom")), @@ -67,27 +66,11 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); }); -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // - .set_dispatch("", [](tir::Buffer buffer, ObjectPath p, IRDocsifier d) -> Doc { - if (!d->IsVarDefined(buffer)) { - if (Optional opt_f = FindLowestVarDef(buffer, d)) { - ExprDoc lhs = DefineBuffer(buffer, opt_f.value(), d); - ExprDoc rhs = BufferDecl(buffer, "buffer_decl", // TODO(@junrushao): name confusing - {}, p, opt_f.value(), d); - opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); - } - } - if (Optional doc = d->GetVarDoc(buffer)) { - return doc.value(); - } - LOG(FATAL) << "IndexError: Buffer is not defined in the environment: " << buffer; - }); - TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Not node, ObjectPath p, IRDocsifier d) -> Doc { ExprDoc a = d->AsDoc(node->a, p->Attr("a")); if (a->IsInstance()) { - return TIR(d)->Attr("Not")->Call({a}); + return TIR("Not")->Call({a}); } return OperationDoc(OperationDocNode::Kind::kNot, {a}); }); @@ -101,12 +84,12 @@ 
     .set_dispatch<tir::Cast>("", [](tir::Cast cast, ObjectPath p, IRDocsifier d) -> Doc {
       ExprDoc dtype = LiteralDoc::DataType(cast->dtype);
       ExprDoc value = d->AsDoc<ExprDoc>(cast->value, p->Attr("value"));
-      return TIR(d)->Attr("Cast")->Call({dtype, value});
+      return TIR("Cast")->Call({dtype, value});
     });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Select>("", [](tir::Select select, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)->Attr("Select")->Call({
+      return TIR("Select")->Call({
           d->AsDoc<ExprDoc>(select->condition, p->Attr("condition")),
           d->AsDoc<ExprDoc>(select->true_value, p->Attr("true_value")),
           d->AsDoc<ExprDoc>(select->false_value, p->Attr("false_value")),
@@ -115,7 +98,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Ramp>("", [](tir::Ramp ramp, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)->Attr("Ramp")->Call({
+      return TIR("Ramp")->Call({
           d->AsDoc<ExprDoc>(ramp->base, p->Attr("base")),
           d->AsDoc<ExprDoc>(ramp->stride, p->Attr("stride")),
           LiteralDoc::Int(ramp->lanes),
@@ -124,8 +107,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Broadcast>("", [](tir::Broadcast bc, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)
-          ->Attr("Broadcast")
+      return TIR("Broadcast")
           ->Call({
               d->AsDoc<ExprDoc>(bc->value, p->Attr("value")),
               LiteralDoc::Int(bc->lanes),
@@ -135,7 +117,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Shuffle>(  //
         "", [](tir::Shuffle shuffle, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)->Attr("Shuffle")->Call({
+      return TIR("Shuffle")->Call({
           d->AsDoc<ExprDoc>(shuffle->vectors, p->Attr("vectors")),
           d->AsDoc<ExprDoc>(shuffle->indices, p->Attr("indices")),
       });
@@ -170,12 +152,12 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
         }
       }
       ExprDoc id = d->AsDoc<ExprDoc>(r->identity_element, p->Attr("identity_element"));
-      return TIR(d)->Attr("comm_reducer")->Call({lambda, id});
+      return TIR("comm_reducer")->Call({lambda, id});
     });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Let>("", [](tir::Let let, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)->Attr("let")->Call({
+      return TIR("let")->Call({
           d->AsDoc<ExprDoc>(let->var, p->Attr("var")),
           d->AsDoc<ExprDoc>(let->value, p->Attr("value")),
           d->AsDoc<ExprDoc>(let->body, p->Attr("body")),
@@ -209,7 +191,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       ExprDoc prefix{nullptr};
       if (const auto* op = call->op.as<OpNode>()) {
         String name = op_names[GetRef<Op>(op)];
-        prefix = TIR(d)->Attr(name);
+        prefix = TIR(name);
       } else if (const auto* gv = call->op.as<GlobalVarNode>()) {
         prefix = LiteralDoc::Str(gv->name_hint);
       } else {
@@ -232,20 +214,21 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Any>("", [](tir::Any any, ObjectPath p, IRDocsifier d) -> Doc {
-      return TIR(d)->Attr("Any")->Call({});
+      return TIR("Any")->Call({});
     });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Reduce>("", [](tir::Reduce r, ObjectPath p, IRDocsifier d) -> Doc {
-      LOG(FATAL) << "ValueError: Reduce should never exist in TIR: " << r;
+      ExprDoc combiner = d->AsDoc<ExprDoc>(r->combiner, p->Attr("combiner"));
+      ExprDoc source = d->AsDoc<ExprDoc>(r->source, p->Attr("source"));
+      ExprDoc init = d->AsDoc<ExprDoc>(r->init, p->Attr("init"));
+      ExprDoc axis = d->AsDoc<ExprDoc>(r->axis, p->Attr("axis"));
+      ExprDoc condition = d->AsDoc<ExprDoc>(r->condition, p->Attr("condition"));
+      ExprDoc value_index = LiteralDoc::Int(r->value_index);
+      return TIR("reduce")->Call({combiner},
+                                 {"source", "init", "axis", "condition", "value_index"},
+                                 {source, init, axis, condition, value_index});
     });
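The mechanical change threaded through the hunks above is the reworked `TIR` helper: the old `TIR(d)` only resolved the `T` prefix, so every call site chained `->Attr("...")`, while the new `TIR(attr)` (see the `utils.h` hunk below) folds the attribute access in and drops the `IRDocsifier` argument. A minimal standalone sketch of the pattern follows; `Doc`, `TIROld`, and `TIRNew` here are simplified stand-ins, not the real TVM printer classes.

#include <iostream>
#include <string>

// Toy stand-in for the printer's ExprDoc; only what this sketch needs.
struct Doc {
  std::string text;
  Doc Attr(const std::string& name) const { return Doc{text + "." + name}; }
  Doc Call(const std::string& args) const { return Doc{text + "(" + args + ")"}; }
};

// Old shape: the helper resolves only the prefix; callers chain ->Attr(...).
Doc TIROld() { return Doc{"T"}; }

// New shape: the prefix comes from a global default and the attribute is folded in.
Doc TIRNew(const std::string& attr) { return Doc{"T"}.Attr(attr); }

int main() {
  std::cout << TIROld().Attr("Cast").Call("dtype, value").text << "\n";  // T.Cast(dtype, value)
  std::cout << TIRNew("Cast").Call("dtype, value").text << "\n";         // T.Cast(dtype, value)
}

Both shapes print the same TVMScript call; the payoff is purely at the dispatch sites, which no longer need a docsifier in scope just to spell the `T.` prefix.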
-TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch( - "", [](tir::ProducerLoad load, ObjectPath p, IRDocsifier d) -> Doc { - LOG(FATAL) << "ValueError: ProducerLoad should never exist in TIR: " << load; - }); - TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Load load, ObjectPath p, IRDocsifier d) -> Doc { LOG(FATAL) << "ValueError: Load has been deprecated for BufferLoad: " << load; @@ -257,7 +241,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) [](tir::NodeType node, ObjectPath p, IRDocsifier d) -> Doc { \ ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ - return TIR(d)->Attr(OpString)->Call({a, b}); \ + return TIR(OpString)->Call({a, b}); \ }); #define TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(NodeType, OpString, OpKind) \ @@ -267,7 +251,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ if (a->IsInstance() && b->IsInstance()) { \ - return TIR(d)->Attr(OpString)->Call({a, b}); \ + return TIR(OpString)->Call({a, b}); \ } \ return OperationDoc(OperationDocNode::Kind::OpKind, {a, b}); \ }); @@ -294,6 +278,40 @@ TVM_SCRIPT_PRINTER_DEF_BINARY(Max, "max"); #undef TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR #undef TVM_SCRIPT_PRINTER_DEF_BINARY +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); 
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/tir/for_loop.cc b/src/script/printer/tir/for_loop.cc index 6a375935bd79..239b8e565f35 100644 --- a/src/script/printer/tir/for_loop.cc +++ b/src/script/printer/tir/for_loop.cc @@ -59,7 +59,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) p = p->Attr("body"); } AsDocBody(grid.back()->body, p, (*f).get(), d); - return ForDoc(TupleDoc(lhs), TIR(d)->Attr("grid")->Call(rhs), (*f)->stmts); + return ForDoc(TupleDoc(lhs), TIR("grid")->Call(rhs), (*f)->stmts); } // Step 3. If not `T.grid`, print loop kind accordingly IdDoc lhs = DefineVar(loop->loop_var, *f, d); @@ -76,21 +76,21 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (!loop->annotations.empty()) { annotations = d->AsDoc(loop->annotations, p->Attr("annotations")); } - ExprDoc prefix = TIR(d); + ExprDoc prefix{nullptr}; if (loop->kind == tir::ForKind::kSerial) { if (loop->annotations.empty()) { prefix = IdDoc("range"); } else { - prefix = prefix->Attr("serial"); + prefix = TIR("serial"); } } else if (loop->kind == tir::ForKind::kParallel) { - prefix = prefix->Attr("parallel"); + prefix = TIR("parallel"); } else if (loop->kind == tir::ForKind::kUnrolled) { - prefix = prefix->Attr("unroll"); + prefix = TIR("unroll"); } else if (loop->kind == tir::ForKind::kVectorized) { - prefix = prefix->Attr("vectorized"); + prefix = TIR("vectorized"); } else if (loop->kind == tir::ForKind::kThreadBinding) { - prefix = prefix->Attr("thread_binding"); + prefix = TIR("thread_binding"); thread = LiteralDoc::Str(loop->thread_binding.value()->thread_tag); } else { LOG(FATAL) << "ValueError: Unknown ForKind: " << tir::ForKind2String(loop->kind); @@ -117,6 +117,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return ForDoc(lhs, rhs, (*f)->stmts); }); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index d47a60209e43..55e8c075deb7 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -36,11 +36,7 @@ String FindFunctionName(const IRDocsifier& d, const tir::PrimFunc& f) { TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::PrimFunc func, ObjectPath p, IRDocsifier d) -> Doc { - d->SetCommonPrefix(func, [](const ObjectRef& obj) { - return obj->IsInstance() || obj->IsInstance(); - }); - With frame(d, func); - (*frame)->AddDispatchToken(d, "tir"); + With frame(MakeDispatchFrame(d, func, func)); int n_args = func->params.size(); // Step 1. Handle `func->params` Array args; @@ -54,8 +50,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // Step 2. Handle `func->attrs` if (func->attrs.defined() && !func->attrs->dict.empty()) { (*frame)->stmts.push_back( - ExprStmtDoc(TIR(d) - ->Attr("func_attr") // + ExprStmtDoc(TIR("func_attr") // ->Call({d->AsDoc(func->attrs, p->Attr("attrs"))}))); } // Step 3. 
Handle `func->buffer_map` @@ -76,11 +71,17 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return FunctionDoc( /*name=*/IdDoc(FindFunctionName(d, func)), /*args=*/args, - /*decorators=*/{TIR(d)->Attr("prim_func")}, + /*decorators=*/{TIR("prim_func")}, /*return_type=*/d->AsDoc(func->ret_type, p->Attr("ret_type")), /*body=*/(*frame)->stmts); }); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& obj, ReprPrinter* p) { + std::string res = DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root())); + p->stream << res; + }); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc index f4e3762fc022..5fea278a4444 100644 --- a/src/script/printer/tir/ir.cc +++ b/src/script/printer/tir/ir.cc @@ -34,8 +34,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } else if (dtype == DataType::Bool()) { return LiteralDoc::Boolean(imm->value); } else { - return TIR(d) // - ->Attr(runtime::DLDataType2String(dtype)) + return TIR(runtime::DLDataType2String(dtype)) // ->Call({LiteralDoc::Int(imm->value)}); } }); @@ -46,15 +45,14 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (dtype == Default::FloatDType()) { return LiteralDoc::Float(imm->value); } else { - return TIR(d) - ->Attr(runtime::DLDataType2String(dtype)) + return TIR(runtime::DLDataType2String(dtype)) // ->Call({LiteralDoc::Float(imm->value)}); } }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Range range, ObjectPath p, IRDocsifier d) -> Doc { - return TIR(d)->Attr("Range")->Call({ + return TIR("Range")->Call({ d->AsDoc(range->min, p->Attr("min")), d->AsDoc(range->extent, p->Attr("extent")), }); @@ -63,16 +61,23 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](PrimType ty, ObjectPath p, IRDocsifier d) -> Doc { std::string dtype = ty->dtype.is_void() ? "void" : runtime::DLDataType2String(ty->dtype); - return TIR(d)->Attr(dtype); + return TIR(dtype); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](PointerType ty, ObjectPath p, IRDocsifier d) -> Doc { - ExprDoc element_type = d->AsDoc(ty->element_type, p->Attr("element_type")); + ExprDoc element_type{nullptr}; + if (const auto* prim_type = ty->element_type.as()) { + std::string dtype = + prim_type->dtype.is_void() ? 
"void" : runtime::DLDataType2String(prim_type->dtype); + element_type = LiteralDoc::Str(dtype); + } else { + element_type = d->AsDoc(ty->element_type, p->Attr("element_type")); + } if (ty->storage_scope == "") { - return TIR(d)->Attr("Ptr")->Call({element_type}); + return TIR("Ptr")->Call({element_type}); } else { - return TIR(d)->Attr("Ptr")->Call({element_type, LiteralDoc::Str(ty->storage_scope)}); + return TIR("Ptr")->Call({element_type, LiteralDoc::Str(ty->storage_scope)}); } }); @@ -81,17 +86,28 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (ty->fields.empty()) { return LiteralDoc::None(); } - return TIR(d) // - ->Attr("Tuple") - ->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); + return TIR("Tuple")->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](IncompleteType ty, ObjectPath p, IRDocsifier d) -> Doc { + return TIR("IncompleteType")->Call({}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Target target, ObjectPath p, IRDocsifier d) -> Doc { Map config = target->Export(); - return TIR(d)->Attr("target")->Call({d->AsDoc(config, p)}); + return TIR("target")->Call({d->AsDoc(config, p)}); }); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index 03e5657d24b7..436f2b202d85 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -#include "../../../tir/transforms/ir_utils.h" +#include "../../../tir/transforms/ir_utils.h" // For `GetPtrStorageScope` #include "./utils.h" namespace tvm { @@ -51,7 +51,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (eval->value->IsInstance()) { return ExprStmtDoc(value); } - return ExprStmtDoc(TIR(d)->Attr("evaluate")->Call({value})); + return ExprStmtDoc(TIR("evaluate")->Call({value})); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -75,7 +75,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) stmts->insert(stmts->begin(), AssignDoc(lhs, rhs, type_doc)); return StmtBlockDoc(*stmts); } else { - rhs = TIR(d)->Attr("let")->Call({lhs, rhs}); + rhs = TIR("let")->Call({lhs, rhs}); return ScopeDoc(NullOpt, rhs, *stmts); } }); @@ -93,7 +93,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) stmts->insert(stmts->begin(), AssertDoc(cond, msg)); return StmtBlockDoc(*stmts); } - return ScopeDoc(NullOpt, TIR(d)->Attr("Assert")->Call({cond, msg}), (*f)->stmts); + return ScopeDoc(NullOpt, TIR("Assert")->Call({cond, msg}), (*f)->stmts); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -137,7 +137,6 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::SeqStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { - // TODO(@junrushao): revisit for fragment printing With f(d, stmt); AsDocBody(stmt, p, f->get(), d); return StmtBlockDoc((*f)->stmts); @@ -146,8 +145,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::Prefetch stmt, ObjectPath p, IRDocsifier d) -> Doc { - return ExprStmtDoc(TIR(d) - ->Attr("prefetch") + return ExprStmtDoc(TIR("prefetch") ->Call({ d->AsDoc(stmt->buffer, p->Attr("buffer")), d->AsDoc(stmt->bounds, p->Attr("bounds")), @@ -174,7 +172,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } ExprDoc lhs = DefineVar(stmt->buffer_var, d->frames.back(), d); With f(d, stmt); - ExprDoc rhs = TIR(d)->Attr("allocate")->Call(args, kwargs_keys, kwargs_values); + ExprDoc rhs = TIR("allocate")->Call(args, kwargs_keys, kwargs_values); AsDocBody(stmt->body, p->Attr("body"), f->get(), d); return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); }); @@ -253,7 +251,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) args.push_back(data_doc); args.push_back(LiteralDoc::DataType(stmt->dtype)); args.push_back(d->AsDoc(stmt->extents, p->Attr("extents"))); - ExprDoc rhs = TIR(d)->Attr("allocate_const")->Call(args, kwargs_keys, kwargs_values); + ExprDoc rhs = TIR("allocate_const")->Call(args, kwargs_keys, kwargs_values); With f(d, stmt); ExprDoc lhs = DefineVar(stmt->buffer_var, *f, d); AsDocBody(stmt->body, p->Attr("body"), f->get(), d); @@ -286,7 +284,7 @@ ExprDoc DocsifyBufferRealize(const tir::BufferRealizeNode* stmt, OptionalAsDoc(stmt->condition, p->Attr("condition"))); } - return TIR(d)->Attr("realize")->Call(args, kwargs_keys, kwargs_values); + return TIR("realize")->Call(args, kwargs_keys, kwargs_values); } TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -326,13 +324,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) DefineVar(iter_var->var, f, d); f->stmts.push_back( AssignDoc(d->AsDoc(iter_var->var, p->Attr("node")->Attr("var")), - TIR(d) // - ->Attr("env_thread") - ->Call({LiteralDoc::Str(iter_var->thread_tag)}), // + TIR("env_thread")->Call({LiteralDoc::Str(iter_var->thread_tag)}), // NullOpt)); } - rhs = TIR(d) - ->Attr("launch_thread") + rhs = TIR("launch_thread") ->Call({ d->AsDoc(iter_var->var, p->Attr("node")), d->AsDoc(stmt->value, p->Attr("value")), @@ -340,7 +335,7 @@ 
TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       }
     }
     if (!rhs.defined()) {
-      rhs = TIR(d)->Attr("attr")->Call({
+      rhs = TIR("attr")->Call({
           d->AsDoc<ExprDoc>(stmt->node, p->Attr("node")),
           LiteralDoc::Str(stmt->attr_key),
           d->AsDoc<ExprDoc>(stmt->value, p->Attr("value")),
@@ -351,24 +346,26 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       return DoConciseScoping(NullOpt, rhs.value(), &(*f)->stmts, concise);
     });
 
-TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch<tir::ProducerRealize>(  //
-        "", [](tir::ProducerRealize stmt, ObjectPath p, IRDocsifier d) -> Doc {
-          LOG(FATAL) << "ValueError: ProducerRealize should never exist in TIR: " << stmt;
-        });
-
-TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
-    .set_dispatch<tir::ProducerStore>(  //
-        "", [](tir::ProducerStore stmt, ObjectPath p, IRDocsifier d) -> Doc {
-          LOG(FATAL) << "ValueError: ProducerStore should never exist in TIR: " << stmt;
-        });
-
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Store>(  //
         "", [](tir::Store stmt, ObjectPath p, IRDocsifier d) -> Doc {
           LOG(FATAL) << "ValueError: Store has been deprecated for BufferStore: " << stmt;
         });
 
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint);
+
 }  // namespace printer
 }  // namespace script
 }  // namespace tvm
diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h
index 6cae378d0e69..7f67c3a11c73 100644
--- a/src/script/printer/tir/utils.h
+++ b/src/script/printer/tir/utils.h
@@ -28,6 +28,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -70,9 +71,7 @@ class TIRFrame : public Frame {
 };
 
 /*! \brief Creates the TIR common prefix, which is by default `T` */
-inline IdDoc TIR(const IRDocsifier& d) {  //
-  return IdDoc(d->ir_prefix.Get("tir").value_or("T"));
-}
+inline ExprDoc TIR(const String& attr) { return IdDoc(Default::Prefix("tir"))->Attr(attr); }
 
 /*!
  * \brief Defines a variable in the IRDocsifier at the given frame,
@@ -141,10 +140,15 @@ inline Optional<Frame> FindLowestVarDef(const ObjectRef& var, const IRDocsifier&
   }
   int n_frames = d->frames.size();
   std::unordered_map<const Object*, const FrameNode*> tir_to_frame;
+  const FrameNode* fallback_frame = nullptr;
   tir_to_frame.reserve(n_frames);
   for (int i = n_frames - 1; i >= 0; --i) {
     if (const auto* f = d->frames[i].as<TIRFrameNode>()) {
-      tir_to_frame[f->tir.get()] = f;
+      if (f->tir.defined()) {
+        tir_to_frame[f->tir.get()] = f;
+      } else if (fallback_frame == nullptr) {
+        fallback_frame = f;
+      }
     }
   }
   const std::vector<const Object*>& path = d->common_prefix.at(var.get());
@@ -153,9 +157,52 @@ inline Optional<Frame> FindLowestVarDef(const ObjectRef& var, const IRDocsifier&
       return GetRef<Frame>(tir_to_frame.at(*it));
     }
   }
+  if (fallback_frame != nullptr) {
+    return GetRef<Frame>(fallback_frame);
+  }
   return NullOpt;
 }
 
+/*!
+ * \brief Create a frame and add dispatch token. Calculate LCA information for the frame.
+ * \param d The IRDocsifier
+ * \param root The root of the TIR AST
+ * \param tir The TIR to be saved in the new TIR frame
+ * \return The frame created
+ */
+inline TIRFrame MakeDispatchFrame(const IRDocsifier& d, const ObjectRef& root,
+                                  const ObjectRef& tir) {
+  d->SetCommonPrefix(root, [](const ObjectRef& obj) {
+    return obj->IsInstance<tir::VarNode>() || obj->IsInstance<tir::BufferNode>();
+  });
+  TIRFrame frame(d, tir);
+  frame->AddDispatchToken(d, "tir");
+  return frame;
+}
+
+/*! \brief Redirected method for the ReprPrinter */
+inline void ReprPrint(const ObjectRef& stmt, ReprPrinter* p) {
+  IRDocsifier d;
+  With<TIRFrame> f(MakeDispatchFrame(d, stmt, ObjectRef(nullptr)));
+  Doc doc = d->AsDoc<Doc>(stmt, ObjectPath::Root());
+  if (const auto* expr_doc = doc.as<ExprDocNode>()) {
+    if (!Default::VerboseExpr()) {
+      (*f)->stmts.clear();
+    }
+    (*f)->stmts.push_back(ExprStmtDoc(GetRef<ExprDoc>(expr_doc)));
+  } else if (const auto* stmt_doc = doc.as<StmtDocNode>()) {
+    (*f)->stmts.push_back(GetRef<StmtDoc>(stmt_doc));
+  } else if (const auto* stmt_block = doc.as<StmtBlockDocNode>()) {
+    for (const StmtDoc& d : stmt_block->stmts) {
+      (*f)->stmts.push_back(d);
+    }
+  } else {
+    LOG(FATAL) << "TypeError: Unexpected doc type: " << doc->GetTypeKey();
+  }
+  std::string res = DocToPythonScript(StmtBlockDoc((*f)->stmts));
+  p->stream << res;
+}
+
 /*!
  * \brief Declare and define a buffer
  * \param buffer The buffer to be defined
diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc
index 0dfda954b818..c2e6fad42dce 100644
--- a/src/tir/ir/buffer.cc
+++ b/src/tir/ir/buffer.cc
@@ -612,12 +612,6 @@ tir::Buffer BufferWithOffsetAlignment(Array shape, DataType dtype, std
                                    offset_factor, buffer_type);
 }
 
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<BufferNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const BufferNode*>(node.get());
-      p->stream << "buffer(" << op->name << ", " << op << ")";
-    });
-
 TVM_REGISTER_NODE_TYPE(BufferNode);
 
 TVM_REGISTER_GLOBAL("tir.Buffer").set_body([](TVMArgs args, TVMRetValue* ret) {
diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc
index daae7eaf68f5..40606761f804 100644
--- a/src/tir/ir/expr.cc
+++ b/src/tir/ir/expr.cc
@@ -116,14 +116,6 @@ TVM_REGISTER_GLOBAL("tir.Var").set_body_typed([](String name_hint, runtime::TVMA
 
 TVM_REGISTER_NODE_TYPE(VarNode);
 
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
-    .set_dispatch<VarNode>([](const ObjectRef& node, ReprPrinter* p) {
-      auto* op = static_cast<const VarNode*>(node.get());
-      // omit the type
-      // stream << op->name << "."
<< op->type; - p->stream << op->name_hint; - }); - // SizeVar SizeVar::SizeVar(String name_hint, DataType dtype, Span span) { auto n = make_object(); @@ -140,12 +132,6 @@ TVM_REGISTER_GLOBAL("tir.SizeVar").set_body_typed([](String s, DataType t, Span TVM_REGISTER_NODE_TYPE(SizeVarNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "{" << op->name_hint << "|" << op->name_hint << ">=0}"; - }); - // IterVar IterVar::IterVar(Range dom, Var var, IterVarType t, String thread_tag, Span span) { ObjectPtr n = make_object(); @@ -171,22 +157,6 @@ TVM_REGISTER_GLOBAL("tir.IterVar") return IterVar(dom, var, static_cast(iter_type), thread_tag, span); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "iter_var("; - if (op->var->name_hint.length() != 0) { - p->stream << op->var->name_hint << ", "; - } - if (op->dom.defined()) { - p->stream << op->dom; - } - if (op->thread_tag.length() != 0) { - p->stream << ", " << op->thread_tag; - } - p->stream << ")"; - }); - TVM_REGISTER_NODE_TYPE(IterVarNode); // StringImm @@ -204,12 +174,6 @@ TVM_REGISTER_GLOBAL("tir.StringImm").set_body_typed([](String value, Span span) TVM_REGISTER_NODE_TYPE(StringImmNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '\"' << support::StrEscape(op->value) << '\"'; - }); - // Cast Cast::Cast(DataType t, PrimExpr value, Span span) { ICHECK(value.defined()); @@ -227,14 +191,6 @@ TVM_REGISTER_GLOBAL("tir.Cast").set_body_typed([](DataType dtype, PrimExpr value TVM_REGISTER_NODE_TYPE(CastNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->dtype << '('; - p->Print(op->value); - p->stream << ')'; - }); - // Add TVM_DEFINE_BINOP_CONSTRUCTOR(Add); @@ -244,16 +200,6 @@ TVM_REGISTER_GLOBAL("tir.Add").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(AddNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " + "; - p->Print(op->b); - p->stream << ')'; - }); - // Sub TVM_DEFINE_BINOP_CONSTRUCTOR(Sub); @@ -263,16 +209,6 @@ TVM_REGISTER_GLOBAL("tir.Sub").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(SubNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " - "; - p->Print(op->b); - p->stream << ')'; - }); - // Mul TVM_DEFINE_BINOP_CONSTRUCTOR(Mul); @@ -282,16 +218,6 @@ TVM_REGISTER_GLOBAL("tir.Mul").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(MulNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << "*"; - p->Print(op->b); - p->stream << ')'; - }); - // Div TVM_DEFINE_BINOP_CONSTRUCTOR(Div); @@ -301,16 +227,6 @@ TVM_REGISTER_GLOBAL("tir.Div").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(DivNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - 
.set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << "/"; - p->Print(op->b); - p->stream << ')'; - }); - // Mod TVM_DEFINE_BINOP_CONSTRUCTOR(Mod); @@ -320,16 +236,6 @@ TVM_REGISTER_GLOBAL("tir.Mod").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(ModNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " % "; - p->Print(op->b); - p->stream << ')'; - }); - // FloorDiv TVM_DEFINE_BINOP_CONSTRUCTOR(FloorDiv); @@ -339,12 +245,6 @@ TVM_REGISTER_GLOBAL("tir.FloorDiv").set_body_typed([](PrimExpr a, PrimExpr b, Sp TVM_REGISTER_NODE_TYPE(FloorDivNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "floordiv(" << op->a << ", " << op->b << ")"; - }); - // FloorMod TVM_DEFINE_BINOP_CONSTRUCTOR(FloorMod); @@ -354,12 +254,6 @@ TVM_REGISTER_GLOBAL("tir.FloorMod").set_body_typed([](PrimExpr a, PrimExpr b, Sp TVM_REGISTER_NODE_TYPE(FloorModNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "floormod(" << op->a << ", " << op->b << ")"; - }); - // Min TVM_DEFINE_BINOP_CONSTRUCTOR(Min); @@ -369,16 +263,6 @@ TVM_REGISTER_GLOBAL("tir.Min").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(MinNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "min("; - p->Print(op->a); - p->stream << ", "; - p->Print(op->b); - p->stream << ")"; - }); - // Max TVM_DEFINE_BINOP_CONSTRUCTOR(Max); @@ -388,16 +272,6 @@ TVM_REGISTER_GLOBAL("tir.Max").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(MaxNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "max("; - p->Print(op->a); - p->stream << ", "; - p->Print(op->b); - p->stream << ")"; - }); - // EQ TVM_DEFINE_CMPOP_CONSTRUCTOR(EQ); @@ -407,16 +281,6 @@ TVM_REGISTER_GLOBAL("tir.EQ").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(EQNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " == "; - p->Print(op->b); - p->stream << ')'; - }); - // NE TVM_DEFINE_CMPOP_CONSTRUCTOR(NE); @@ -426,16 +290,6 @@ TVM_REGISTER_GLOBAL("tir.NE").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(NENode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " != "; - p->Print(op->b); - p->stream << ')'; - }); - // LT TVM_DEFINE_CMPOP_CONSTRUCTOR(LT); @@ -445,16 +299,6 @@ TVM_REGISTER_GLOBAL("tir.LT").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(LTNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " < "; - p->Print(op->b); - p->stream << 
')'; - }); - // LE TVM_DEFINE_CMPOP_CONSTRUCTOR(LE); @@ -464,16 +308,6 @@ TVM_REGISTER_GLOBAL("tir.LE").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(LENode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " <= "; - p->Print(op->b); - p->stream << ')'; - }); - // GT TVM_DEFINE_CMPOP_CONSTRUCTOR(GT); @@ -483,16 +317,6 @@ TVM_REGISTER_GLOBAL("tir.GT").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(GTNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " > "; - p->Print(op->b); - p->stream << ')'; - }); - // GE TVM_DEFINE_CMPOP_CONSTRUCTOR(GE); @@ -502,16 +326,6 @@ TVM_REGISTER_GLOBAL("tir.GE").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(GENode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " >= "; - p->Print(op->b); - p->stream << ')'; - }); - // And And::And(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; @@ -534,16 +348,6 @@ TVM_REGISTER_GLOBAL("tir.And").set_body_typed([](PrimExpr a, PrimExpr b, Span sp TVM_REGISTER_NODE_TYPE(AndNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " && "; - p->Print(op->b); - p->stream << ')'; - }); - // Or Or::Or(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; @@ -566,16 +370,6 @@ TVM_REGISTER_GLOBAL("tir.Or").set_body_typed([](PrimExpr a, PrimExpr b, Span spa TVM_REGISTER_NODE_TYPE(OrNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '('; - p->Print(op->a); - p->stream << " || "; - p->Print(op->b); - p->stream << ')'; - }); - // Not Not::Not(PrimExpr a, Span span) { ICHECK(a.defined()) << "ValueError: a is undefined"; @@ -592,13 +386,6 @@ TVM_REGISTER_GLOBAL("tir.Not").set_body_typed([](PrimExpr a, Span span) { return TVM_REGISTER_NODE_TYPE(NotNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '!'; - p->Print(op->a); - }); - // Select Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Span span) { ICHECK(condition.defined()) << "ValueError: condition is undefined"; @@ -624,18 +411,6 @@ TVM_REGISTER_GLOBAL("tir.Select") TVM_REGISTER_NODE_TYPE(SelectNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "select("; - p->Print(op->condition); - p->stream << ", "; - p->Print(op->true_value); - p->stream << ", "; - p->Print(op->false_value); - p->stream << ")"; - }); - // Load Load::Load(DataType dtype, Var buffer_var, PrimExpr index, PrimExpr predicate, Span span) { LOG(FATAL) << "Unexpected use of deprecated Store node for buffer " << buffer_var->name_hint @@ -703,18 +478,6 @@ TVM_REGISTER_GLOBAL("tir.Load").set_body([](TVMArgs args, TVMRetValue* 
ret) { TVM_REGISTER_NODE_TYPE(LoadNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->buffer_var << "["; - p->Print(op->index); - p->stream << "]"; - if (!is_one(op->predicate)) { - p->stream << " if "; - p->Print(op->predicate); - } - }); - // Ramp Ramp::Ramp(PrimExpr base, PrimExpr stride, int lanes, Span span) { ICHECK(base.defined()); @@ -740,16 +503,6 @@ TVM_REGISTER_GLOBAL("tir.Ramp") TVM_REGISTER_NODE_TYPE(RampNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "ramp("; - p->Print(op->base); - p->stream << ", "; - p->Print(op->stride); - p->stream << ", " << op->lanes << ")"; - }); - // Broadcast Broadcast::Broadcast(PrimExpr value, int lanes, Span span) { ICHECK(value.defined()); @@ -770,14 +523,6 @@ TVM_REGISTER_GLOBAL("tir.Broadcast").set_body_typed([](PrimExpr value, int lanes TVM_REGISTER_NODE_TYPE(BroadcastNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "x" << op->lanes << "("; - p->Print(op->value); - p->stream << ")"; - }); - // Let Let::Let(Var var, PrimExpr value, PrimExpr body, Span span) { ICHECK(value.defined()); @@ -800,16 +545,6 @@ TVM_REGISTER_GLOBAL("tir.Let").set_body_typed([](Var var, PrimExpr value, PrimEx TVM_REGISTER_NODE_TYPE(LetNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "(let " << op->var << " = "; - p->Print(op->value); - p->stream << " in "; - p->Print(op->body); - p->stream << ")"; - }); - // Call Call::Call(DataType dtype, RelayExpr op, Array args, Span span) { for (size_t i = 0; i < args.size(); ++i) { @@ -857,25 +592,6 @@ TVM_REGISTER_GLOBAL("tir.Call") TVM_REGISTER_NODE_TYPE(CallNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - if (auto* ptr_op = op->op.as()) { - p->stream << ptr_op->name << "("; - } else { - auto* ptr_gvar = op->op.as(); - ICHECK(ptr_gvar != nullptr); - p->stream << "@" << ptr_gvar->name_hint << "("; - } - for (size_t i = 0; i < op->args.size(); ++i) { - p->Print(op->args[i]); - if (i < op->args.size() - 1) { - p->stream << ", "; - } - } - p->stream << ")"; - }); - // Shuffle Shuffle::Shuffle(Array vectors, Array indices, Span span) { ICHECK_NE(vectors.size(), 0U); @@ -924,26 +640,6 @@ TVM_REGISTER_GLOBAL("tir.Shuffle") TVM_REGISTER_NODE_TYPE(ShuffleNode); -template -void PrintList(const Array& exprs, ReprPrinter* p) { - for (size_t i = 0; i < exprs.size(); ++i) { - p->Print(exprs[i]); - if (i < exprs.size() - 1) { - p->stream << ", "; - } - } -} - -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "shuffle("; - PrintList(op->vectors, p); - p->stream << ", "; - PrintList(op->indices, p); - p->stream << ")"; - }); - // CommReducer CommReducer::CommReducer(Array lhs, Array rhs, Array result, Array identity_element, Span span) { @@ -1009,13 +705,6 @@ TVM_REGISTER_GLOBAL("tir.CommReducerCombine") TVM_REGISTER_NODE_TYPE(CommReducerNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - 
p->stream << "comm_reducer(result=" << op->result << ", lhs=" << op->lhs - << ", rhs=" << op->rhs << ", identity_element=" << op->identity_element << ")"; - }); - // Reduce Reduce::Reduce(CommReducer combiner, Array source, Array axis, PrimExpr condition, int value_index, Array init, Span span) { @@ -1057,18 +746,6 @@ TVM_REGISTER_GLOBAL("tir.Reduce") TVM_REGISTER_NODE_TYPE(ReduceNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << "reduce(combiner=" << op->combiner; - p->stream << ", source=" << op->source; - p->stream << ", init=" << op->init; - p->stream << ", axis=" << op->axis; - p->stream << ", where=" << op->condition; - p->stream << ", value_index=" << op->value_index; - p->stream << ")"; - }); - // Any Any::Any(Span span) { auto n = make_object(); @@ -1081,9 +758,6 @@ TVM_REGISTER_GLOBAL("tir.Any").set_body_typed([](Span span) { return Any(span); TVM_REGISTER_NODE_TYPE(AnyNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { p->stream << "?"; }); - // BufferLoad void BufferLoadNode::LegalizeDType() { for (int i = 0; i < static_cast(indices.size()) - 1; i++) { @@ -1118,19 +792,6 @@ TVM_REGISTER_GLOBAL("tir.BufferLoad") TVM_REGISTER_NODE_TYPE(BufferLoadNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->buffer->name << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - p->Print(op->indices[i]); - if (i < op->indices.size() - 1) { - p->stream << ", "; - } - } - p->stream << "]"; - }); - // ProducerLoad ProducerLoad::ProducerLoad(DataProducer producer, Array indices, Span span) { ObjectPtr node = make_object(); @@ -1148,17 +809,5 @@ TVM_REGISTER_GLOBAL("tir.ProducerLoad") TVM_REGISTER_NODE_TYPE(ProducerLoadNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->producer->GetNameHint() << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - p->Print(op->indices[i]); - if (i < op->indices.size() - 1) { - p->stream << ", "; - } - } - p->stream << "]"; - }); } // namespace tir } // namespace tvm diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc index d4802e287693..5067d9083863 100644 --- a/src/tir/ir/function.cc +++ b/src/tir/ir/function.cc @@ -109,21 +109,6 @@ Optional TensorIntrin::Get(String name, bool allow_missing) { TVM_REGISTER_NODE_TYPE(TensorIntrinNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - // TODO(tvm-team) redirect to Text printer once we have a good text format. 
- auto* node = static_cast(ref.get()); - p->stream << "PrimFunc(" << node->params << ") "; - if (node->attrs.defined()) { - p->stream << "attrs=" << node->attrs; - } - p->stream << " {\n"; - p->indent += 2; - p->Print(node->body); - p->indent -= 2; - p->stream << "}\n"; - }); - TVM_REGISTER_GLOBAL("tir.PrimFunc") .set_body_typed([](Array params, Stmt body, Type ret_type, Map buffer_map, DictAttrs attrs, Span span) { diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 03a2f29bd129..ee7e493b61e0 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -21,11 +21,10 @@ * \file index_map.cc */ -#include "tvm/tir/index_map.h" - #include #include #include +#include #include #include diff --git a/src/tir/ir/legacy_printer.cc b/src/tir/ir/legacy_printer.cc new file mode 100644 index 000000000000..4c2fd5037b65 --- /dev/null +++ b/src/tir/ir/legacy_printer.cc @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include + +#include + +#include "../../support/str_escape.h" + +namespace tvm { +namespace tir { + +std::string LegacyTIRPrint(const ObjectRef& obj) { + using namespace tvm::tir; + class LegacyTIRPrinter : private tir::ExprVisitor { + public: + explicit LegacyTIRPrinter(std::ostream& os) : stream(os) {} + + void Print(const ObjectRef& obj) { + if (const auto* op = obj.as()) { + Print_(op); + } else if (const auto* op = obj.as()) { + Print_(op); + } else if (const auto* op = obj.as()) { + Print_(op); + } else if (const auto* op = obj.as()) { + Print_(op); + } else { + VisitExpr(Downcast(obj)); + } + } + + private: + void VisitExpr_(const VarNode* op) final { stream << op->name_hint; } + + void VisitExpr_(const SizeVarNode* op) final { + stream << "{" << op->name_hint << "|" << op->name_hint << ">=0}"; + } + + void VisitExpr_(const IntImmNode* op) final { + if (op->dtype == DataType::Int(32)) { + stream << op->value; + } else { + stream << "(" << op->dtype << ")" << op->value; + } + } + + void VisitExpr_(const FloatImmNode* op) final { + switch (op->dtype.bits()) { + case 64: + stream << op->value; + break; + case 32: + stream << op->value << 'f'; + break; + case 16: + stream << op->value << 'h'; + break; + default: + LOG(FATAL) << "Unknown float type bits=" << op->dtype.bits(); + } + } + void VisitExpr_(const StringImmNode* op) final { + stream << '\"' << support::StrEscape(op->value) << '\"'; + } + void VisitExpr_(const CastNode* op) final { + stream << op->dtype << '('; + VisitExpr(op->value); + stream << ')'; + } + void VisitExpr_(const AddNode* op) final { PrintBinary(op->a, op->b, " + "); } + void VisitExpr_(const SubNode* op) final { PrintBinary(op->a, op->b, " - "); } + void VisitExpr_(const MulNode* op) final { PrintBinary(op->a, op->b, "*"); } + void 
VisitExpr_(const DivNode* op) final { PrintBinary(op->a, op->b, "/"); } + void VisitExpr_(const ModNode* op) final { PrintBinary(op->a, op->b, " % "); } + void VisitExpr_(const FloorDivNode* op) final { PrintCall("floordiv", op->a, op->b); } + void VisitExpr_(const FloorModNode* op) final { PrintCall("floormod", op->a, op->b); } + void VisitExpr_(const MinNode* op) final { PrintCall("min", op->a, op->b); } + void VisitExpr_(const MaxNode* op) final { PrintCall("max", op->a, op->b); } + void VisitExpr_(const EQNode* op) final { PrintBinary(op->a, op->b, " == "); } + void VisitExpr_(const NENode* op) final { PrintBinary(op->a, op->b, " != "); } + void VisitExpr_(const LTNode* op) final { PrintBinary(op->a, op->b, " < "); } + void VisitExpr_(const LENode* op) final { PrintBinary(op->a, op->b, " <= "); } + void VisitExpr_(const GTNode* op) final { PrintBinary(op->a, op->b, " > "); } + void VisitExpr_(const GENode* op) final { PrintBinary(op->a, op->b, " >= "); } + void VisitExpr_(const AndNode* op) final { PrintBinary(op->a, op->b, " && "); } + void VisitExpr_(const OrNode* op) final { PrintBinary(op->a, op->b, " || "); } + + void VisitExpr_(const NotNode* op) final { + stream << "!"; + VisitExpr(op->a); + } + + void VisitExpr_(const SelectNode* op) final { + stream << "select("; + VisitExpr(op->condition); + stream << ", "; + VisitExpr(op->true_value); + stream << ", "; + VisitExpr(op->false_value); + stream << ')'; + } + + void VisitExpr_(const RampNode* op) final { + stream << "ramp("; + VisitExpr(op->base); + stream << ", "; + VisitExpr(op->stride); + stream << ", " << op->lanes << ')'; + } + + void VisitExpr_(const BroadcastNode* op) final { + stream << "x" << op->lanes << "("; + VisitExpr(op->value); + stream << ")"; + } + + void VisitExpr_(const LetNode* op) final { + stream << "(let " << op->var << " = "; + VisitExpr(op->value); + stream << " in "; + VisitExpr(op->body); + stream << ")"; + } + + void VisitExpr_(const CallNode* op) final { + if (auto* ptr_op = op->op.as()) { + stream << ptr_op->name << "("; + } else { + auto* p = op->op.as(); + ICHECK(p != nullptr); + stream << "@" << p->name_hint << "("; + } + for (size_t i = 0; i < op->args.size(); ++i) { + VisitExpr(op->args[i]); + if (i < op->args.size() - 1) { + stream << ", "; + } + } + stream << ")"; + } + + void VisitExpr_(const ShuffleNode* op) final { + stream << "shuffle("; + PrintList(op->vectors.GetArrayNode()); + stream << ", "; + PrintList(op->indices.GetArrayNode()); + stream << ")"; + } + + void VisitExpr_(const ReduceNode* op) final { + stream << "reduce(combiner="; + Print_(op->combiner.get()); + stream << ", source="; + PrintList(op->source.GetArrayNode()); + stream << ", init="; + PrintList(op->init.GetArrayNode()); + stream << ", axis="; + PrintList(op->axis.GetArrayNode()); + stream << ", where="; + VisitExpr(op->condition); + stream << ", value_index=" << op->value_index; + stream << ")"; + } + + void VisitExpr_(const AnyNode* op) final { stream << "?"; } + + void VisitExpr_(const BufferLoadNode* op) final { + stream << op->buffer->name << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + VisitExpr(op->indices[i]); + if (i < op->indices.size() - 1) { + stream << ", "; + } + } + stream << "]"; + } + + void VisitExpr_(const ProducerLoadNode* op) final { + stream << op->producer->GetNameHint() << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + VisitExpr(op->indices[i]); + if (i < op->indices.size() - 1) { + stream << ", "; + } + } + stream << "]"; + } + + private: + void Print_(const 
CommReducerNode* op) { + stream << "comm_reducer(result="; + PrintList(op->result.GetArrayNode()); + stream << ", lhs="; + PrintList(op->lhs.GetArrayNode()); + stream << ", rhs="; + PrintList(op->rhs.GetArrayNode()); + stream << ", identity_element="; + PrintList(op->identity_element.GetArrayNode()); + stream << ")"; + } + + void Print_(const IterVarNode* op) { + stream << "{" << op->var->name_hint << "|" << op->var->name_hint << " in ["; + VisitExpr(op->dom->min); + stream << ", "; + VisitExpr(op->dom->extent); + stream << ")}"; + } + + void Print_(const RangeNode* op) { + stream << "range(min=" << op->min << ", ext=" << op->extent << ')'; + } + + void Print_(const OpNode* op) { stream << "Op(" << op->name << ")"; } + + private: + void PrintBinary(const PrimExpr& a, const PrimExpr& b, const std::string& sign) { + stream << '('; + VisitExpr(a); + stream << sign; + VisitExpr(b); + stream << ')'; + } + + void PrintCall(const std::string& call, const PrimExpr& a, const PrimExpr& b) { + stream << call << '('; + VisitExpr(a); + stream << ", "; + VisitExpr(b); + stream << ')'; + } + + void PrintList(const ArrayNode* exprs) { + int n = static_cast(exprs->size()); + for (int i = 0; i < n; ++i) { + VisitExpr(Downcast(exprs->at(i))); + if (i < n - 1) { + stream << ", "; + } + } + } + + std::ostream& stream; + }; + std::ostringstream os; + LegacyTIRPrinter(os).Print(obj); + return os.str(); +} + +} // namespace tir +} // namespace tvm diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 1f4962489328..c01e6ccaec5f 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -60,16 +60,6 @@ TVM_REGISTER_GLOBAL("tir.LetStmt") TVM_REGISTER_NODE_TYPE(LetStmtNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "let " << op->var << " = "; - p->Print(op->value); - p->stream << '\n'; - p->Print(op->body); - }); - // AttrStmt AttrStmt::AttrStmt(ObjectRef node, String attr_key, PrimExpr value, Stmt body, Span span) { auto n = make_object(); @@ -88,18 +78,6 @@ TVM_REGISTER_GLOBAL("tir.AttrStmt") TVM_REGISTER_NODE_TYPE(AttrStmtNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "// attr ["; - p->Print(op->node); - p->stream << "] " << op->attr_key << " = "; - p->Print(op->value); - p->stream << '\n'; - p->Print(op->body); - }); - // AssertStmt AssertStmt::AssertStmt(PrimExpr condition, PrimExpr message, Stmt body, Span span) { ICHECK(condition.defined()); @@ -126,18 +104,6 @@ TVM_REGISTER_GLOBAL("tir.AssertStmt") } }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "assert("; - p->Print(op->condition); - p->stream << ", "; - p->Print(op->message); - p->stream << ")\n"; - p->Print(op->body); - }); - // For For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body, Optional thread_binding, Map annotations, Span span) { @@ -210,24 +176,6 @@ std::ostream& operator<<(std::ostream& out, ForKind type) { // NOLINT(*) return out; } -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << op->kind << " (" << op->loop_var << ", "; - p->Print(op->min); - p->stream << ", "; - p->Print(op->extent); 
- p->stream << ") {\n"; - - p->indent += 2; - p->Print(op->body); - p->indent -= 2; - - p->PrintIndent(); - p->stream << "}\n"; - }); - // While While::While(PrimExpr condition, Stmt body, Span span) { ICHECK(condition.defined()); @@ -248,18 +196,6 @@ TVM_REGISTER_GLOBAL("tir.While").set_body_typed([](PrimExpr condition, Stmt body TVM_REGISTER_NODE_TYPE(WhileNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "while(" << op->condition << ") {\n"; - p->indent += 2; - p->Print(op->body); - p->indent -= 2; - p->PrintIndent(); - p->stream << "}\n"; - }); - // Store Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate, Span span) { LOG(FATAL) << "Unexpected use of deprecated Store node for buffer " << buffer_var->name_hint @@ -313,21 +249,6 @@ TVM_REGISTER_GLOBAL("tir.Store").set_body([](TVMArgs args, TVMRetValue* ret) { TVM_REGISTER_NODE_TYPE(StoreNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << op->buffer_var << "["; - p->Print(op->index); - p->stream << "] = "; - p->Print(op->value); - if (!is_one(op->predicate)) { - p->stream << " if "; - p->Print(op->predicate); - } - p->stream << '\n'; - }); - // ProducerStore ProducerStore::ProducerStore(DataProducer producer, PrimExpr value, Array indices, Span span) { @@ -346,21 +267,6 @@ TVM_REGISTER_GLOBAL("tir.ProducerStore") TVM_REGISTER_NODE_TYPE(ProducerStoreNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << op->producer->GetNameHint() << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - p->Print(op->indices[i]); - if (i < op->indices.size() - 1) p->stream << ", "; - } - p->stream << "]"; - p->stream << " ="; - p->Print(op->value); - p->stream << '\n'; - }); - // Allocate Allocate::Allocate(Var buffer_var, DataType dtype, Array extents, PrimExpr condition, Stmt body, Map annotations, Span span) { @@ -413,26 +319,6 @@ TVM_REGISTER_GLOBAL("tir.Allocate") TVM_REGISTER_NODE_TYPE(AllocateNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - const auto* ptr_type = op->buffer_var->type_annotation.as(); - ICHECK(ptr_type) << "The provided variable is not of pointer type"; - p->PrintIndent(); - p->stream << "allocate " << op->buffer_var << "[" << op->dtype; - for (size_t i = 0; i < op->extents.size(); ++i) { - p->stream << " * "; - p->Print(op->extents[i]); - } - p->stream << "], storage_scope = " << ptr_type->storage_scope; - if (!is_one(op->condition)) { - p->stream << " if "; - p->Print(op->condition); - } - p->stream << "\n"; - p->Print(op->body); - }); - // Const // The constructor to create a IRNode with constant data // depending on the type of ObjectRef, it will either @@ -495,20 +381,6 @@ TVM_REGISTER_GLOBAL("tir.AllocateConst") TVM_REGISTER_NODE_TYPE(AllocateConstNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "constant " << op->buffer_var << "[" << op->dtype; - for (size_t i = 0; i < op->extents.size(); ++i) { - p->stream << " * "; - p->Print(op->extents[i]); - } - p->stream << "]"; - 
p->stream << "\n"; - p->Print(op->body); - }); - // DeclBuffer DeclBuffer::DeclBuffer(Buffer buffer, Stmt body, Span span) { ObjectPtr node = make_object(); @@ -524,14 +396,6 @@ TVM_REGISTER_GLOBAL("tir.DeclBuffer").set_body_typed([](Buffer buffer, Stmt body TVM_REGISTER_NODE_TYPE(DeclBufferNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "decl_buffer " << op->buffer << "\n"; - p->stream << op->body; - }); - // ProducerRealize ProducerRealize::ProducerRealize(DataProducer producer, Region bounds, PrimExpr condition, Stmt body, String storage_scope, Span span) { @@ -563,34 +427,6 @@ TVM_REGISTER_GLOBAL("tir.ProducerRealize") TVM_REGISTER_NODE_TYPE(ProducerRealizeNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "producer_realize " << op->producer->GetNameHint() << "("; - for (size_t i = 0; i < op->bounds.size(); ++i) { - p->stream << "["; - p->Print(op->bounds[i]->min); - p->stream << ", "; - p->Print(op->bounds[i]->extent); - p->stream << "]"; - if (i < op->bounds.size() - 1) p->stream << ", "; - } - p->stream << ")"; - if (!is_one(op->condition)) { - p->stream << " if "; - p->Print(op->condition); - } - p->stream << " {\n"; - - p->indent += 2; - p->Print(op->body); - p->indent -= 2; - - p->PrintIndent(); - p->stream << "}\n"; - }); - // Prefetch Prefetch::Prefetch(Buffer buffer, Array bounds, Span span) { data_ = make_object(buffer, bounds, span); @@ -603,22 +439,6 @@ TVM_REGISTER_GLOBAL("tir.Prefetch") TVM_REGISTER_NODE_TYPE(PrefetchNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "prefetch " << op->buffer << "("; - for (size_t i = 0; i < op->bounds.size(); ++i) { - p->stream << "["; - p->Print(op->bounds[i]->min); - p->stream << ", "; - p->Print(op->bounds[i]->extent); - p->stream << "]"; - if (i < op->bounds.size() - 1) p->stream << ", "; - } - p->stream << ")"; - }); - // SeqStmt SeqStmt::SeqStmt(Array seq, Span span) { auto node = make_object(); @@ -633,14 +453,6 @@ TVM_REGISTER_GLOBAL("tir.SeqStmt").set_body_typed([](Array seq, Span span) TVM_REGISTER_NODE_TYPE(SeqStmtNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - for (Stmt stmt : op->seq) { - p->Print(stmt); - } - }); - // IfThenElse IfThenElse::IfThenElse(PrimExpr condition, Stmt then_case, Optional else_case, Span span) { ICHECK(condition.defined()); @@ -661,37 +473,6 @@ TVM_REGISTER_GLOBAL("tir.IfThenElse") return IfThenElse(condition, then_case, else_case, span); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - while (true) { - p->stream << "if (" << op->condition << ") {\n"; - p->indent += 2; - p->Print(op->then_case); - p->indent -= 2; - - if (!op->else_case) { - break; - } - - if (const IfThenElseNode* nested_if = op->else_case.as()) { - p->PrintIndent(); - p->stream << "} else "; - op = nested_if; - } else { - p->PrintIndent(); - p->stream << "} else {\n"; - p->indent += 2; - p->Print(op->else_case); - p->indent -= 2; - break; - } - } - p->PrintIndent(); - p->stream << "}\n"; - }); - // Evaluate 
Evaluate::Evaluate(PrimExpr value, Span span) { ICHECK(value.defined()); @@ -708,14 +489,6 @@ TVM_REGISTER_GLOBAL("tir.Evaluate").set_body_typed([](PrimExpr value, Span span) TVM_REGISTER_NODE_TYPE(EvaluateNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->Print(op->value); - p->stream << "\n"; - }); - // BufferStore BufferStore::BufferStore(Buffer buffer, PrimExpr value, Array indices, Span span) { ICHECK_EQ(buffer->shape.size(), indices.size()) @@ -751,21 +524,6 @@ TVM_REGISTER_GLOBAL("tir.BufferStore") TVM_REGISTER_NODE_TYPE(BufferStoreNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << op->buffer->name << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - p->Print(op->indices[i]); - if (i < op->indices.size() - 1) p->stream << ", "; - } - p->stream << "]"; - p->stream << " = "; - p->Print(op->value); - p->stream << '\n'; - }); - // BufferRealize BufferRealize::BufferRealize(Buffer buffer, Array bounds, PrimExpr condition, Stmt body, Span span) { @@ -778,34 +536,6 @@ TVM_REGISTER_GLOBAL("tir.BufferRealize") TVM_REGISTER_NODE_TYPE(BufferRealizeNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << "buffer_realize " << op->buffer->name << "("; - for (size_t i = 0; i < op->bounds.size(); ++i) { - p->stream << "["; - p->Print(op->bounds[i]->min); - p->stream << ", "; - p->Print(op->bounds[i]->extent); - p->stream << "]"; - if (i < op->bounds.size() - 1) p->stream << ", "; - } - p->stream << ")"; - if (!is_one(op->condition)) { - p->stream << " if "; - p->Print(op->condition); - } - p->stream << " {\n"; - - p->indent += 2; - p->Print(op->body); - p->indent -= 2; - - p->PrintIndent(); - p->stream << "}\n"; - }); - // BufferRegion BufferRegion::BufferRegion(Buffer buffer, Array region) { CHECK_EQ(buffer->shape.size(), region.size()) @@ -844,23 +574,6 @@ TVM_REGISTER_GLOBAL("tir.BufferRegion").set_body_typed([](Buffer buffer, Array([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->buffer->name; - p->stream << "["; - for (size_t i = 0; i < op->region.size(); ++i) { - const auto& range = op->region[i]; - p->Print(range->min); - if (!is_one(range->extent)) { - p->stream << ":"; - p->Print(range->min + range->extent); - } - if (i != op->region.size() - 1) p->stream << ", "; - } - p->stream << "]"; - }); - // MatchBufferRegion MatchBufferRegion::MatchBufferRegion(Buffer buffer, BufferRegion source) { const Buffer& source_buffer = source->buffer; @@ -918,15 +631,6 @@ TVM_REGISTER_GLOBAL("tir.MatchBufferRegion").set_body_typed([](Buffer buffer, Bu TVM_REGISTER_NODE_TYPE(MatchBufferRegionNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - p->stream << op->buffer->name << " = match_buffer("; - p->Print(op->source); - p->stream << ")\n"; - }); - // Block Block::Block(Array iter_vars, Array reads, Array writes, String name_hint, Stmt body, Optional init, Array alloc_buffers, @@ -957,78 +661,6 @@ TVM_REGISTER_GLOBAL("tir.Block") TVM_REGISTER_NODE_TYPE(BlockNode); -void PrintBlockTitle(const BlockNode* op, ReprPrinter* p) { - p->stream << "block " << 
op->name_hint << "("; - for (size_t i = 0; i < op->iter_vars.size(); i++) { - p->Print(op->iter_vars[i]); - if (i < op->iter_vars.size() - 1) p->stream << ", "; - } - p->stream << ")"; -} - -void PrintBlockSignature(const BlockNode* op, ReprPrinter* p) { - // print read/write regions - p->PrintIndent(); - p->stream << "reads("; - p->Print(op->reads); - p->stream << ")\n"; - p->PrintIndent(); - p->stream << "writes("; - p->Print(op->writes); - p->stream << ")\n"; - // Print alloc_buffers - for (const auto& alloc_buf : op->alloc_buffers) { - p->PrintIndent(); - p->stream << alloc_buf->name << " = alloc_buffer(" << alloc_buf->dtype << "["; - for (size_t i = 0; i < alloc_buf->shape.size(); ++i) { - if (i > 0) p->stream << ", "; - p->Print(alloc_buf->shape[i]); - } - p->stream << "])\n"; - } - // Print match_buffer_regions - for (const auto& match_buf : op->match_buffers) { - p->Print(match_buf); - } - if (!op->annotations.empty()) { - p->PrintIndent(); - p->stream << "annotations(" << op->annotations << ")\n"; - } -} - -void PrintBlockBody(const BlockNode* op, ReprPrinter* p) { - // Print init - if (op->init.defined()) { - p->PrintIndent(); - p->stream << "with init() {\n"; - p->indent += 2; - p->Print(op->init.value()); - p->indent -= 2; - p->PrintIndent(); - p->stream << "}\n"; - } - // Print body - p->Print(op->body); -} - -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->PrintIndent(); - PrintBlockTitle(op, p); - p->stream << " {\n"; - p->indent += 2; - - // Print block elements (e.g. reads/writes, etc) - PrintBlockSignature(op, p); - // Print block init and body - PrintBlockBody(op, p); - - p->indent -= 2; - p->PrintIndent(); - p->stream << "}\n"; - }); - // BlockRealize BlockRealize::BlockRealize(Array values, PrimExpr predicate, Block block, Span span) { CHECK_EQ(block->iter_vars.size(), values.size()) @@ -1049,41 +681,6 @@ TVM_REGISTER_GLOBAL("tir.BlockRealize") TVM_REGISTER_NODE_TYPE(BlockRealizeNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - auto* block_op = op->block.get(); - p->PrintIndent(); - PrintBlockTitle(block_op, p); - p->stream << " {\n"; - p->indent += 2; - - // Print binding iter_values - for (size_t i = 0; i < block_op->iter_vars.size(); ++i) { - p->PrintIndent(); - p->stream << "bind("; - p->Print(block_op->iter_vars[i]->var); - p->stream << ", "; - p->Print(op->iter_values[i]); - p->stream << ")\n"; - } - // Print predicate - if (!is_one(op->predicate)) { - p->PrintIndent(); - p->stream << "where("; - p->Print(op->predicate); - p->stream << ")\n"; - } - // Print block elements (e.g. 
reads/writes, etc) - PrintBlockSignature(block_op, p); - // Print block init and body - PrintBlockBody(block_op, p); - - p->indent -= 2; - p->PrintIndent(); - p->stream << "}\n"; - }); - PrimExpr TypeAnnotation(DataType dtype, Span span) { static auto op = Op::Get("tir.type_annotation"); return tir::Call(dtype, op, {}, span); diff --git a/src/tir/transforms/common_subexpr_elim.cc b/src/tir/transforms/common_subexpr_elim.cc index 447d85370ca8..5cf6f231dd80 100644 --- a/src/tir/transforms/common_subexpr_elim.cc +++ b/src/tir/transforms/common_subexpr_elim.cc @@ -151,8 +151,8 @@ bool CommonSubexpressionEliminator::OrderOnExprAndFrequency(std::pair> SyntacticToSemanticComputations( [](std::pair a, std::pair b) { std::stringstream a_stream; std::stringstream b_stream; - a_stream << a.first; - b_stream << b.first; + a_stream << LegacyTIRPrint(a.first); + b_stream << LegacyTIRPrint(b.first); return a_stream.str().compare(b_stream.str()) < 0; }); diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc index f10d99eb1ff4..82de46616cb4 100644 --- a/tests/cpp/expr_test.cc +++ b/tests/cpp/expr_test.cc @@ -32,7 +32,7 @@ TEST(Expr, Basic) { std::ostringstream os; os << z; ICHECK(zz.same_as(z)); - ICHECK(os.str() == "max(((x + 1) + 2), 100)"); + ICHECK(os.str() == "T.max(x + 1 + 2, 100)"); } TEST(Expr, VarTypeAnnotation) { diff --git a/tests/python/driver/tvmc/test_shape_parser.py b/tests/python/driver/tvmc/test_shape_parser.py index 1e3cde12928a..b7b96ae4efa9 100644 --- a/tests/python/driver/tvmc/test_shape_parser.py +++ b/tests/python/driver/tvmc/test_shape_parser.py @@ -18,7 +18,6 @@ import argparse import pytest - from tvm.driver.tvmc.shape_parser import parse_shape_string @@ -53,14 +52,14 @@ def test_negative_dimensions(): shape_string = "input:[-1,3,224,224]" shape_dict = parse_shape_string(shape_string) # Convert to strings to allow comparison with Any. - assert str(shape_dict) == "{'input': [?, 3, 224, 224]}" + assert str(shape_dict) == "{'input': [T.Any(), 3, 224, 224]}" def test_multiple_valid_gpu_inputs(): # Check that multiple valid gpu inputs are parsed correctly. 
shape_string = "gpu_0/data_0:[1, -1,224,224] gpu_1/data_1:[7, 7]" shape_dict = parse_shape_string(shape_string) - expected = "{'gpu_0/data_0': [1, ?, 224, 224], 'gpu_1/data_1': [7, 7]}" + expected = "{'gpu_0/data_0': [1, T.Any(), 224, 224], 'gpu_1/data_1': [7, 7]}" assert str(shape_dict) == expected diff --git a/tests/python/relay/aot/test_c_device_api.py b/tests/python/relay/aot/test_c_device_api.py index ea5ea4920c87..247b22eac494 100644 --- a/tests/python/relay/aot/test_c_device_api.py +++ b/tests/python/relay/aot/test_c_device_api.py @@ -21,12 +21,11 @@ import numpy as np import pytest - import tvm.testing from tvm import relay from tvm.ir.module import IRModule -from tvm.testing.aot import AOTTestModel, generate_ref_data, compile_models from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER +from tvm.testing.aot import AOTTestModel, compile_models, generate_ref_data @pytest.fixture(name="device_api_main_func") @@ -40,10 +39,13 @@ def fixture_device_api_main_func(): # pylint: disable=import-outside-toplevel import tensorflow as tf import tflite.Model - - from tests.python.contrib.test_ethosu.infra import create_test_runner, generate_ref_data_tflite from tvm.relay.op.contrib.ethosu import partition_for_ethosu + from tests.python.contrib.test_ethosu.infra import ( + create_test_runner, + generate_ref_data_tflite, + ) + # pylint: enable=import-outside-toplevel tf.config.run_functions_eagerly(True) @@ -236,11 +238,12 @@ def test_without_device_api_unpacked_api(non_device_api_main_func): """Test a graph without the Device API with the unpacked internal calls""" main_func = non_device_api_main_func(interface_api="c", use_unpacked_api=True) + body = main_func.body.seq[1].seq[0].seq[0].value assert ( - str(main_func.body) - == "tir.tvm_check_return(0, -1, tir.call_extern(" + repr(body) + == 'T.tvm_check_return(0, -1, T.call_extern("int32", ' + '"tvmgen_default_fused_multiply",' - + " x_buffer_var, y_buffer_var, output_buffer_var))\n" + + " x_buffer_var, y_buffer_var, output_buffer_var))" ) @@ -249,12 +252,16 @@ def test_without_device_api_packed_api(non_device_api_main_func): main_func = non_device_api_main_func(interface_api="packed", use_unpacked_api=False) - assert str(main_func.body) == ( - 'tir.tvm_call_cpacked("tvmgen_default_fused_multiply", ' - "tir.tvm_stack_make_array(x_buffer_var, tir.tvm_stack_make_shape(10, 10), tir.reinterpret((uint64)0), (uint32)2, float32(0), 0), " # pylint: disable=line-too-long - "tir.tvm_stack_make_array(y_buffer_var, tir.tvm_stack_make_shape(1, 10), tir.reinterpret((uint64)0), (uint32)2, float32(0), 0), " # pylint: disable=line-too-long - "tir.tvm_stack_make_array(output_buffer_var, tir.tvm_stack_make_shape(10, 10), tir.reinterpret((uint64)0), (uint32)2, float32(0), 0), " # pylint: disable=line-too-long - "tir.reinterpret((uint64)0))\n" + body = main_func.body.seq[1].seq[0].seq[0].value + assert repr(body) == ( + 'T.call_cpacked("tvmgen_default_fused_multiply", ' + "T.tvm_stack_make_array(x_buffer_var, T.tvm_stack_make_shape(10, 10), " + 'T.reinterpret("handle", T.uint64(0)), T.uint32(2), T.Cast("float32", 0), 0), ' + "T.tvm_stack_make_array(y_buffer_var, T.tvm_stack_make_shape(1, 10), " + 'T.reinterpret("handle", T.uint64(0)), T.uint32(2), T.Cast("float32", 0), 0), ' + "T.tvm_stack_make_array(output_buffer_var, T.tvm_stack_make_shape(10, 10), " + 'T.reinterpret("handle", T.uint64(0)), T.uint32(2), T.Cast("float32", 0), 0), ' + 'T.reinterpret("handle", T.uint64(0)))' ) diff --git a/tests/python/relay/aot/test_crt_aot.py 
b/tests/python/relay/aot/test_crt_aot.py index b3db410156b3..2e7e23ead65f 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -16,35 +16,34 @@ # under the License. """AOT with C Runtime Tests""" -from collections import OrderedDict -import re import os -import tarfile import pathlib +import re +import tarfile +from collections import OrderedDict import numpy as np import pytest - import tvm -from tvm import relay, TVMError +from tvm import TVMError, relay from tvm.contrib import utils +from tvm.ir.instrument import pass_instrument from tvm.ir.module import IRModule +from tvm.micro import export_model_library_format +from tvm.micro import model_library_format as mlf +from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options +from tvm.micro.testing.utils import get_conv2d_relay_module from tvm.relay import testing, transform -from tvm.relay.testing import byoc -from tvm.relay.op.annotation import compiler_begin, compiler_end from tvm.relay.backend import Executor, Runtime -from tvm.micro import model_library_format as mlf -from tvm.micro import export_model_library_format -from tvm.ir.instrument import pass_instrument +from tvm.relay.op.annotation import compiler_begin, compiler_end +from tvm.relay.testing import byoc from tvm.testing.aot import ( AOTTestModel, - generate_ref_data, compile_and_run, compile_models, create_relay_module_and_inputs_from_tflite_file, + generate_ref_data, ) -from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options -from tvm.micro.testing.utils import get_conv2d_relay_module def test_error_c_interface_with_packed_api(): @@ -985,8 +984,8 @@ def test_workspace_calculation_cmsis_nn(): pytest.importorskip("tflite") # pylint: disable=import-outside-toplevel - from tvm.relay.op.contrib import cmsisnn from tvm.contrib.download import download_testdata + from tvm.relay.op.contrib import cmsisnn # pylint: enable=import-outside-toplevel @@ -1040,11 +1039,11 @@ def test_aot_codegen_checks_returns(): main_func = main_ir_module["__tvm_main__"] # Check operator call is wrapped properly + body = main_func.body[1].seq[0].seq[0].value assert ( - str(main_func.body[1]) - == "tir.tvm_check_return(0, -1, tir.call_extern(" - + '"tvmgen_default_fused_add",' - + " x_buffer_var, y_buffer_var, output_buffer_var))\n" + repr(body) + == 'T.tvm_check_return(0, -1, T.call_extern("int32", "tvmgen_default_fused_add",' + + " x_buffer_var, y_buffer_var, output_buffer_var))" ) # TODO(Mousius) - Create a better place for C codegen tests assert ( diff --git a/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py index a426befd4bf2..d87f9ec69e05 100644 --- a/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py +++ b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py @@ -17,7 +17,6 @@ import itertools import pytest - import tvm from tvm.script.printer.doc import ( AssertDoc, @@ -62,7 +61,7 @@ def format_script(s: str) -> str: cleaned_lines = "\n".join(line[spaces_to_remove:] for line in s.splitlines()) if not cleaned_lines.endswith("\n"): cleaned_lines += "\n" - return cleaned_lines + return cleaned_lines.strip() @pytest.mark.parametrize( diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py new file mode 100644 index 000000000000..fd3bb3788cfb --- /dev/null +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ 
-0,0 +1,638 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +from contextlib import contextmanager + +from tvm import ir, tir +from tvm.ir import Range +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import tir as T +from tvm.script.printer import default + + +@contextmanager +def verbose_expr(): + try: + default.verbose_expr(True) + yield + finally: + default.verbose_expr(False) + + +def _assert_print(obj, expected): + with verbose_expr(): + assert repr(obj).strip() == expected.strip() + + +def test_prim_func(): + a = tir.Var("a", "handle") + b = tir.Var("b", "handle") + func = tir.PrimFunc( + params=[a, b], + ret_type=None, + buffer_map={ + a: tir.decl_buffer(shape=[128, 128], dtype="float32", name="A"), + b: tir.decl_buffer(shape=[256, 256], dtype="float32", name="B"), + }, + body=tir.Evaluate(0), + ) + _assert_print( + func, + expected=""" +@T.prim_func +def main(a: T.handle, b: T.handle) -> None: + A = T.match_buffer(a, (128, 128)) + B = T.match_buffer(b, (256, 256)) + T.evaluate(0)""", + ) + + +def test_block_realize(): + i = tir.Var("i", "int32") + j = tir.Var("j", "int32") + k = tir.Var("k", "int32") + with IRBuilder() as ib: + with T.block(name="block", no_realize=False): + vi = ib.name("vi", T.axis.spatial(128, i)) + vj = ib.name("vj", T.axis.spatial(64, j)) + vk = ib.name("vk", T.axis.reduce(32, k)) + T.reads() + T.writes() + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +i = T.var("int32") +j = T.var("int32") +k = T.var("int32") +with T.block("block"): + vi = T.axis.spatial(128, i) + vj = T.axis.spatial(64, j) + vk = T.axis.reduce(32, k) + T.reads() + T.writes() + T.evaluate(0)""", + ) + + +def test_block(): + i = tir.Var("i", "int32") + j = tir.Var("j", "int32") + k = tir.Var("k", "int32") + with IRBuilder() as ib: + with T.block(name="block", no_realize=False): + vi = ib.name("vi", T.axis.spatial(128, i)) + vj = ib.name("vj", T.axis.spatial(64, j)) + vk = ib.name("vk", T.axis.reduce(32, k)) + T.reads() + T.writes() + T.evaluate(0) + obj = ib.get().block + _assert_print( + obj, + """ +with T.block("block", no_realize=True): + vi = T.axis.spatial(128) + vj = T.axis.spatial(64) + vk = T.axis.reduce(32) + T.reads() + T.writes() + T.evaluate(0)""", + ) + + +def test_match_buffer_region(): + src = tir.decl_buffer((128, 128), "float32", name="src") + tgt = tir.decl_buffer((64, 64), "float32", name="tgt") + obj = tir.MatchBufferRegion( + tgt, + tir.BufferRegion( + src, + [ + Range(64, 128), + Range(64, 128), + ], + ), + ) + _assert_print( + obj, + """ +src = T.buffer_decl((128, 128)) +tgt = T.match_buffer(src[64:128, 64:128], (64, 64)) +""", + ) + + +def test_buffer(): + a = tir.decl_buffer((128, 128), "float16", name="A") + _assert_print( + a, + """A = T.buffer_decl((128, 
128), "float16") +A""", + ) + + +def test_buffer_region(): + src = tir.decl_buffer((128, 128), "float32", name="src") + obj = tir.BufferRegion( + src, + [ + Range(64, 128), + Range(64, 128), + ], + ) + _assert_print( + obj, + """ +src = T.buffer_decl((128, 128)) +src[64:128, 64:128] +""", + ) + + +def test_buffer_load(): + a = tir.decl_buffer((128, 128), "float16", name="A") + obj = tir.BufferLoad(a, [128, 128]) + _assert_print( + obj, + """ +A = T.buffer_decl((128, 128), "float16") +A[128, 128] +""", + ) + + +def test_buffer_store(): + a = tir.decl_buffer((128, 128), "float16", name="A") + with IRBuilder() as ib: + T.buffer_store(a, a[128, 128] + 1, [128, 128]) + obj = ib.get() + _assert_print( + obj, + """ +A = T.buffer_decl((128, 128), "float16") +A[128, 128] = A[128, 128] + T.float16(1) +""", + ) + + +def test_for(): + with IRBuilder() as ib: + with T.grid(128, 128, 128) as (i, j, k): + ib.name_many(["i", "j", "k"], [i, j, k]) + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +for i, j, k in T.grid(128, 128, 128): + T.evaluate(0) +""", + ) + + +def test_let_stmt(): + with IRBuilder() as ib: + with T.let(T.var("float32"), T.float32(10)): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.let(v, T.float32(10)): + T.evaluate(0) +""", + ) + + +def test_attr_stmt(): + with IRBuilder() as ib: + with T.attr("pragma", "unroll", 1): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.attr("pragma", "unroll", 1): + T.evaluate(0) +""", + ) + + +def test_assert_stmt(): + with IRBuilder() as ib: + with T.Assert(1, "assertion"): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.Assert(1, "assertion"): + T.evaluate(0) +""", + ) + + +def test_while(): + with IRBuilder() as ib: + x = T.var("int32") + with T.While(x < 10): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +v = T.var("int32") +while v < 10: + T.evaluate(0) +""", + ) + + +def test_allocate(): + with IRBuilder() as ib: + with T.allocate([128, 128], "float32"): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.allocate([128, 128], "float32", "global") as v: + T.evaluate(0) +""", + ) + + +def test_decl_buffer(): + with IRBuilder() as ib: + with T.decl_buffer((10, 10), data=T.ptr("float32")): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.decl_buffer((10, 10)) as buffer: + T.evaluate(0) +""", + ) + + +def test_prefetch(): + a = tir.decl_buffer((128, 128), "float16", name="A") + with IRBuilder() as ib: + T.prefetch(a, [Range(0, 64), Range(0, 64)]) + obj = ib.get() + _assert_print( + obj, + """ +A = T.buffer_decl((128, 128), "float16") +T.prefetch(A, [T.Range(0, 64), T.Range(0, 64)]) +""", + ) + + +def test_seq_stmt(): + with IRBuilder() as ib: + with T.serial(10): + T.evaluate(0) + T.evaluate(1) + obj = ib.get().body + _assert_print( + obj, + """ +T.evaluate(0) +T.evaluate(1) +""", + ) + + +def test_if_then_else(): + with IRBuilder() as ib: + with T.If(T.var("int32") == 1): + with T.Then(): + T.evaluate(0) + + obj = ib.get() + _assert_print( + obj, + """ +v = T.var("int32") +if v == 1: + T.evaluate(0) +""", + ) + + +def test_evaluate(): + with IRBuilder() as ib: + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +T.evaluate(0) +""", + ) + + +def test_buffer_realize(): + with IRBuilder() as ib: + a = tir.decl_buffer((128, 128), "float32", name="A") + with T.realize(a[0:128, 0:128], "test_storage_scope", True): + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +A = T.buffer_decl((128, 128)) 
+with T.realize(A[0:128, 0:128], "test_storage_scope"): + T.evaluate(0) +""", + ) + + +def test_var(): + a = tir.Var("a", "float32") + _assert_print( + a, + """ +a = T.var("float32") +a""", + ) + + +def test_size_var(): + a = tir.SizeVar("a", "float32") + _assert_print( + a, + """ +a = T.var("float32") +a""", + ) + + +def test_iter_var(): + a = tir.IterVar((0, 8), "a", iter_type=tir.IterVar.DataPar) + _assert_print( + a, + """ +a = T.var("int32") +T.iter_var(a, T.Range(0, 8), "DataPar", "") +""", + ) + + +def test_string_imm(): + s = tir.StringImm("str") + _assert_print(s, '"str"') + + +def test_cast(): + obj = tir.Cast("float64", tir.Var("a", "float32")) + _assert_print( + obj, + """ +a = T.var("float32") +T.Cast("float64", a) +""", + ) + + +def test_binary_arith(): + a = tir.Var("a", "float32") + b = tir.Var("b", "float32") + for op, sign in [ + (tir.Add, "+"), + (tir.Sub, "-"), + (tir.Mul, "*"), + (tir.Div, "/"), + (tir.Mod, "truncmod"), + (tir.FloorDiv, "//"), + (tir.FloorMod, "%"), + (tir.LT, "<"), + (tir.LE, "<="), + (tir.EQ, "=="), + (tir.NE, "!="), + (tir.GT, ">"), + (tir.GE, ">="), + ]: + obj = op(a, b) + if sign.isalpha(): + expected = """ +a = T.var("float32") +b = T.var("float32") +T.{}(a, b)""".format( + sign + ) + else: + expected = """ +a = T.var("float32") +b = T.var("float32") +a {} b""".format( + sign + ) + _assert_print(obj, expected) + + +def test_logical(): + a = T.var("bool", "a") + b = T.var("bool", "b") + _assert_print( + tir.And(a, b), + """ +a = T.var("bool") +b = T.var("bool") +a and b +""", + ) + _assert_print( + tir.Or(a, b), + """ +a = T.var("bool") +b = T.var("bool") +a or b +""", + ) + _assert_print( + tir.Not(a), + """ +a = T.var("bool") +not a +""", + ) + + +def test_select(): + obj = tir.Select(True, 0, 2) + _assert_print( + obj, + """T.Select(True, 0, 2) +""", + ) + + +def test_ramp(): + a = tir.Var("a", "int32") + obj = tir.Ramp(a, 1, 32) + _assert_print( + obj, + """ +a = T.var("int32") +T.Ramp(a, 1, 32) +""", + ) + + +def test_broadcast(): + obj = tir.Broadcast(0, 4) + _assert_print( + obj, + """ +T.Broadcast(0, 4) +""", + ) + + +def test_let_expr(): + x = tir.Var("x", "int32") + obj = tir.Let(x, 1, x + 1) + _assert_print( + obj, + """ +x = T.var("int32") +T.let(x, 1, x + 1) +""", + ) + + +def test_call(): + obj = tir.atan(T.float32(1.0)) + _assert_print( + obj, + """ +T.atan(T.float32(1)) +""", + ) + + +def test_comm_reducer(): + obj = T.comm_reducer(lambda x, y: x + y, identity=[T.float32(0)]) + _assert_print( + obj, + """ +T.comm_reducer(lambda x, y: x + y, [T.float32(0)]) +""", + ) + + +def test_any(): + obj = tir.Any() + _assert_print( + obj, + """ +T.Any() +""", + ) + + +def test_int_imm(): + obj = T.int16(1) + _assert_print( + obj, + """ +T.int16(1) +""", + ) + + +def test_float_imm(): + obj = T.float16(1) + _assert_print( + obj, + """ +T.float16(1) +""", + ) + + +def test_range(): + obj = Range(0, 10) + _assert_print( + obj, + """ +T.Range(0, 10) +""", + ) + + +def test_prim_type(): + obj = ir.PrimType("float32") + _assert_print(obj, "T.float32") + + +def test_pointer_type(): + obj = ir.PointerType(ir.PrimType("int32"), "global") + _assert_print(obj, 'T.Ptr("int32", "global")') + + +def test_tuple_type(): + obj = ir.TupleType([ir.PrimType("float32"), ir.PrimType("int32")]) + _assert_print(obj, "T.Tuple(T.float32, T.int32)") + + +if __name__ == "__main__": + test_prim_func() + test_block_realize() + test_block() + test_buffer() + test_buffer_region() + test_buffer_load() + test_buffer_store() + test_match_buffer_region() + test_for() + 
test_let_stmt() + test_attr_stmt() + test_assert_stmt() + test_while() + test_allocate() + test_decl_buffer() + test_prefetch() + test_seq_stmt() + test_if_then_else() + test_evaluate() + test_buffer_realize() + test_var() + test_size_var() + test_iter_var() + test_string_imm() + test_cast() + test_binary_arith() + test_logical() + test_select() + test_ramp() + test_broadcast() + test_let_expr() + test_call() + test_comm_reducer() + test_any() + test_int_imm() + test_float_imm() + test_range() + test_prim_type() + test_pointer_type() + test_tuple_type() diff --git a/tests/python/unittest/test_tvmscript_printer_underlining.py b/tests/python/unittest/test_tvmscript_printer_underlining.py index a7e7dffb8b82..467aad2df517 100644 --- a/tests/python/unittest/test_tvmscript_printer_underlining.py +++ b/tests/python/unittest/test_tvmscript_printer_underlining.py @@ -18,14 +18,13 @@ from typing import Optional import pytest - from tvm.runtime import ObjectPath from tvm.script.printer.doc import ( - StmtBlockDoc, ExprStmtDoc, IdDoc, OperationDoc, OperationKind, + StmtBlockDoc, ) from tvm.script.printer.doc_printer import to_python_script @@ -59,7 +58,7 @@ def format_script(s: str) -> str: cleaned_lines = "\n".join(line[spaces_to_remove:] for line in s.splitlines()) if not cleaned_lines.endswith("\n"): cleaned_lines += "\n" - return cleaned_lines + return cleaned_lines.strip() def test_underline_basic(): @@ -290,8 +289,10 @@ def test_print_two_context_lines(to_underline, expected_text): def test_underline_and_print_line_numbers(): doc = StmtBlockDoc([ExprStmtDoc(make_id_doc(f"line{i + 1}")) for i in range(12)]) result = to_python_script(doc, print_line_numbers=True, path_to_underline=make_path("line6")) - assert result == format_script( - """ + assert ( + result.strip() + == format_script( + """ 1 line1 2 line2 3 line3 @@ -306,6 +307,7 @@ def test_underline_and_print_line_numbers(): 11 line11 12 line12 """ + ).strip() ) diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index 38d58179c4b4..b1135c0eb007 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -729,7 +729,7 @@ def _find_basics(op): def _do_fold(op): if _match_pragma(op, "conv2d_transpose_gemm"): - is_init = ".init" in str(op) + is_init = "_init" in str(op) tvm.tir.stmt_functor.post_order_visit(op, _find_basics) if is_init: From bd5e54b987b262c852ab64b93b497a169ab1cf64 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 13 Jan 2023 21:15:08 -0800 Subject: [PATCH 174/286] [COMMUNITY] Hongyi Jin -> Committer (#13784) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 79df9186b995..93ff513a87d5 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -89,6 +89,7 @@ We do encourage everyone to work anything they are interested in. - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, auto_scheduler, topi, relay - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh - relay - [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif - tir, tvm-script, arith +- [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii - tir, tvm-script, arith, relay, topi ## Reviewers From 54069dcb034eb23de7268b3228d7b64726ab8aa8 Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Sun, 15 Jan 2023 08:24:03 +0800 Subject: [PATCH 175/286] [COMMUNITY] Yaxing Cai -> Committer (#13787) Please join me in welcoming Yaxing Cai (@cyx-6) as a new committer in TVM. 
He has made significant enhancements across the TVM stack by implementing the
following RFCs:
- RFC 0051: Bring PackedFunc into TVM Object System
- RFC 0079: TVMScript Metaprogramming

His activities:
- [Commits History](https://github.com/apache/tvm/commits?author=cyx-6)
- [Code Review](https://github.com/apache/tvm/pulls?q=reviewed-by%3Acyx-6)
- [Community Forum Summary](https://discuss.tvm.apache.org/u/cyx/summary)
---
 CONTRIBUTORS.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 93ff513a87d5..18a4e13f511d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -31,6 +31,7 @@ We do encourage everyone to work anything they are interested in.
 - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
 - [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm
 - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart - relay, frontends
+- [Yaxing Cai](https://github.com/cyx-6): @cyx-6 - tvm-script, runtime
 - [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime
 - [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
 - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm
@@ -46,6 +47,7 @@ We do encourage everyone to work anything they are interested in.
 - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay
 - [Chenfan Jia](https://github.com/jcf94): @jcf94 - auto_scheduler
 - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii - tir, tvm-script, arith, relay, topi
 - [Manupa Karunaratne](https://github.com/manupak): @manupak - ethos-u, memory planner
 - [Elen Kalda](https://github.com/ekalda): @ekalda - ethos-u, arm
 - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay
@@ -89,7 +91,6 @@ We do encourage everyone to work anything they are interested in.
 - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, auto_scheduler, topi, relay
 - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh - relay
 - [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif - tir, tvm-script, arith
-- [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii - tir, tvm-script, arith, relay, topi
 
 ## Reviewers
 

From 7c816bf45041ae05d91e7957a743844b53cb7f63 Mon Sep 17 00:00:00 2001
From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com>
Date: Sun, 15 Jan 2023 22:45:58 -0500
Subject: [PATCH 176/286] [MetaSchedule] Bugfix: Add checks for nullable
 `run_secs` (#13790)

`run_secs` is an Optional ObjectRef, so we need to check that it is defined
before we access its value.
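For readers following along from the Python side, the same guard looks roughly
like this (a minimal sketch, not part of this patch; it assumes the public
`tvm.meta_schedule.runner.RunnerResult` wrapper, whose `run_secs` may be `None`
when a run failed):

    # Sketch of the nullable check this patch adds, seen through the Python API.
    # A RunnerResult may carry an error_msg instead of timings.
    from tvm.meta_schedule.runner import RunnerResult

    result = RunnerResult(run_secs=None, error_msg="runtime error on device")
    if result.run_secs is not None and sum(s.value for s in result.run_secs) > 0:
        print("valid measurement: safe to feed into the cost model")
    else:
        print("skip this candidate when updating the cost model")

The C++ fix below applies exactly this check before calling `.value()`.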
--- src/meta_schedule/measure_callback/update_cost_model.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/measure_callback/update_cost_model.cc b/src/meta_schedule/measure_callback/update_cost_model.cc index 6c217a6c4d65..63c32b189eee 100644 --- a/src/meta_schedule/measure_callback/update_cost_model.cc +++ b/src/meta_schedule/measure_callback/update_cost_model.cc @@ -44,7 +44,8 @@ class UpdateCostModelNode : public MeasureCallbackNode { for (int i = 0; i < n; i++) { if (!builder_results[i]->error_msg.defined() && // (runner_results[i]->error_msg.defined() || // - Sum(runner_results[i]->run_secs.value()) > 0)) { + (runner_results[i]->run_secs.defined() && + Sum(runner_results[i]->run_secs.value()) > 0))) { pruned_candidate.push_back(measure_candidates[i]); pruned_runner_result.push_back(runner_results[i]); } From be6a3342c6a1de9c0945272c9e4362ca62ed98ca Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Mon, 16 Jan 2023 09:31:44 -0500 Subject: [PATCH 177/286] [TIR][Fix] Buffer slicing using index dtype as extent (#13788) [Fix] Buffer slicing using index dtype as extent --- python/tvm/tir/buffer.py | 8 ++++++-- .../unittest/test_tvmscript_regression.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index 726d5d1c988c..c2c158c77f78 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -179,7 +179,7 @@ def offset_of(self, indices): def __getitem__(self, indices): from ..arith import Analyzer # pylint: disable=import-outside-toplevel - from .expr import BufferLoad, Ramp # pylint: disable=import-outside-toplevel + from .expr import BufferLoad, Ramp, const # pylint: disable=import-outside-toplevel from .stmt import BufferRegion # pylint: disable=import-outside-toplevel if not isinstance(indices, (tuple, list)): @@ -195,7 +195,11 @@ def __getitem__(self, indices): stop = self.shape[i] if index.stop is None else index.stop region.append(Range.from_min_extent(start, analyzer.simplify(stop - start))) else: - region.append(Range.from_min_extent(index, 1)) + region.append( + Range.from_min_extent( + index, const(1, index.dtype) if isinstance(index, PrimExpr) else 1 + ) + ) return BufferRegion(self, region) else: expr_indices = [] diff --git a/tests/python/unittest/test_tvmscript_regression.py b/tests/python/unittest/test_tvmscript_regression.py index d063c0fcab7f..44d3036596ba 100644 --- a/tests/python/unittest/test_tvmscript_regression.py +++ b/tests/python/unittest/test_tvmscript_regression.py @@ -17,6 +17,7 @@ import numpy import tvm +import tvm.testing from tvm.script import tir as T @@ -73,9 +74,17 @@ def func_ref(): tvm.ir.assert_structural_equal(test_case, func_ref) +def test_tir_buffer_region_extent_correct_dtype(): + @T.prim_func + def func(A: T.Buffer[(T.int64(16), T.int64(1)), "float32"]): + for i in T.grid(T.int64(16)): + with T.block("block"): + vi = T.axis.remap("S", [i]) + T.reads(A[vi, T.int64(0) : T.int64(1)]) + T.evaluate(0) + + assert func.body.block.body.body.block.reads[0].region[0].extent.dtype == "int64" + + if __name__ == "__main__": - a = numpy.zeros((10, 10), dtype="int8") - test_multi_element_array_in_outmost_namespace() - test_different_dtype_assignment_to_var() - b = 1 - test_var_capturing_order() + tvm.testing.main() From 94403f4deb35c5695ad2c5a34195f5cdcc0ca1fe Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Mon, 16 Jan 2023 09:33:03 -0500 Subject: [PATCH 178/286] [TIR][Fix] IndexDataTypeNormalizer not unwrapping float 
casting (#13789)
---
 src/tir/ir/data_type_rewriter.cc              |  5 +-
 .../unittest/test_te_create_primfunc.py       | 66 +++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/src/tir/ir/data_type_rewriter.cc b/src/tir/ir/data_type_rewriter.cc
index f0f0d84644fe..8da7cfdd5b97 100644
--- a/src/tir/ir/data_type_rewriter.cc
+++ b/src/tir/ir/data_type_rewriter.cc
@@ -574,7 +574,10 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const CastNode* op) {
-  if (is_enabled_) {
+  // Unwrap the cast only when the dtype of this cast is integer dtype.
+  // When the dtype of this cast is not integer dtype, it means that this cast
+  // has some other purpose, and we should not unwrap the cast.
+  if (is_enabled_ && op->dtype.is_int()) {
     PrimExpr value = IndexDataTypeNormalizer::VisitExpr(op->value);
     return value->dtype == target_data_type_ ? value : Cast(target_data_type_, value);
   }
diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py
index f78dc458d9d3..4b8d857e8619 100644
--- a/tests/python/unittest/test_te_create_primfunc.py
+++ b/tests/python/unittest/test_te_create_primfunc.py
@@ -689,6 +689,72 @@ def test_argmax():
     tvm.ir.assert_structural_equal(prim_func, argmax_expected)
 
 
+def te_resize2d_symbolic():
+    oh = tir.Var("oh", "int64")
+    ow = tir.Var("ow", "int64")
+    roi = (0.0, 0.0, 0.0, 0.0)
+    A = te.placeholder((2, 3, 128, 128), "float32", name="A")
+    B = topi.image.resize2d(
+        A,
+        roi,
+        size=(oh, ow),
+        method="nearest_neighbor",
+        coordinate_transformation_mode="asymmetric",
+        rounding_method="round",
+    )
+    return [A, B]
+
+
+@T.prim_func
+def tir_resize2d_symbolic(
+    A: T.Buffer[(T.int64(2), T.int64(3), T.int64(128), T.int64(128)), "float32"],
+    var_resize: T.handle,
+):
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    oh = T.var("int64")
+    ow = T.var("int64")
+    resize = T.match_buffer(var_resize, [T.int64(2), T.int64(3), oh, ow], dtype="float32")
+    for i0, i1, i2, i3 in T.grid(T.int64(2), T.int64(3), oh, ow):
+        with T.block("resize"):
+            v_i0, v_i1, v_i2, v_i3 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            T.reads(A[v_i0, v_i1, T.int64(0) : T.int64(128), T.int64(0) : T.int64(128)])
+            T.writes(resize[v_i0, v_i1, v_i2, v_i3])
+            resize[v_i0, v_i1, v_i2, v_i3] = A[
+                v_i0,
+                v_i1,
+                T.max(
+                    T.min(
+                        T.Cast(
+                            "int64",
+                            T.round(
+                                T.float32(128) / T.Cast("float32", oh) * T.Cast("float32", v_i2),
+                                dtype="float32",
+                            ),
+                        ),
+                        T.int64(127),
+                    ),
+                    T.int64(0),
+                ),
+                T.max(
+                    T.min(
+                        T.Cast(
+                            "int64",
+                            T.round(
+                                T.float32(128) / T.Cast("float32", ow) * T.Cast("float32", v_i3),
+                                dtype="float32",
+                            ),
+                        ),
+                        T.int64(127),
+                    ),
+                    T.int64(0),
+                ),
+            ]
+
+
+def test_resize2d_symbolic():
+    _check_workload(te_resize2d_symbolic, tir_resize2d_symbolic, index_dtype_override="int64")
+
+
 def test_extern_with_explicit_buffer_access():
     def te_extern():
         A = te.placeholder((128, 128), name="A")

From b2997b77bd3dd47dd66a4955b8fe5e85fc7480cb Mon Sep 17 00:00:00 2001
From: Anirudh Sundar Subramaniam
Date: Tue, 17 Jan 2023 02:19:37 +0530
Subject: [PATCH 179/286] [TIR] Fix cache_write bug with allocate const node
 (#13792)

Applying `cache_write` when there is an `allocate_const` node in the TIR
causes the cache-write block to be inserted at the incorrect location.
This patch fixes that error.
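To make the primitive concrete, here is a minimal usage sketch of
`tir.Schedule.cache_write` (not part of this patch; this toy function has no
`allocate_const`, so it works both before and after the fix — the regression
case with constants is in the test added below):

    # Sketch of the cache_write primitive this patch fixes. The bug only
    # appears when the block's scope also contains T.allocate_const.
    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def before(A: T.Buffer[(128,), "float32"], B: T.Buffer[(128,), "float32"]):
        for i in T.serial(128):
            with T.block("B"):
                vi = T.axis.remap("S", [i])
                B[vi] = A[vi] + 1.0

    sch = tvm.tir.Schedule(before)
    sch.cache_write(sch.get_block("B"), 0, "global")
    print(sch.mod.script())  # the cache-write block must land at the right position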
Co-authored-by: Abhikrant Sharma
Co-authored-by: Abhikrant Sharma
---
 .../schedule/primitive/cache_read_write.cc    |  8 ++-
 .../test_tir_schedule_cache_read_write.py     | 71 +++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc
index 4174a6699e06..a2b45d407ddf 100644
--- a/src/tir/schedule/primitive/cache_read_write.cc
+++ b/src/tir/schedule/primitive/cache_read_write.cc
@@ -417,7 +417,13 @@ class CacheLocDetector : public StmtVisitor {
       info->loc_pos = detector.loc_pos_;
     } else {
       info->loc_sref = scope_sref;
-      const auto* body = scope_sref->StmtAs<BlockNode>()->body.as<SeqStmtNode>();
+
+      auto block_body = scope_sref->StmtAs<BlockNode>()->body;
+      // Find the SeqStmtNode within (potentially nested) AllocateConstNodes
+      while (block_body->IsInstance<AllocateConstNode>()) {
+        block_body = block_body.as<AllocateConstNode>()->body;
+      }
+      const auto* body = block_body.as<SeqStmtNode>();
       info->loc_pos = body == nullptr ? 1 : body->size();
     }
   }
diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py
index 28c9a13700bf..6a75057e72ff 100644
--- a/tests/python/unittest/test_tir_schedule_cache_read_write.py
+++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py
@@ -1005,6 +1005,67 @@ def block_predicate_cache_write_output_buf() -> None:
 use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True})
 
 
+@T.prim_func
+def cache_write_allocate_const(
+    A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(128, 128), "float16"]
+):
+    B = T.alloc_buffer([128, 128], dtype="float32")
+    const = T.allocate_const([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8])
+    const_1 = T.buffer_decl([8], dtype="float32", data=const)
+    const2 = T.allocate_const([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8])
+    const_2 = T.buffer_decl([8], dtype="float32", data=const)
+    for i, j in T.grid(128, 128):
+        for x in range(8):
+            with T.block("B"):
+                vi, vj, vx = T.axis.remap("SSS", [i, j, x])
+                T.reads(A[vi, vj], const_1[vx], const_2[vx])
+                T.writes(B[vi, vj])
+                B[vi, vj] = A[vi, vj] * const_1[vx] + const_2[vx]
+    for i, j in T.grid(128, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.reads(B[vi, vj])
+            T.writes(C[vi, vj])
+            C[vi, vj] = B[vi, vj] + 1.0
+
+
+@T.prim_func
+def cache_write_allocate_const_output(
+    A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(128, 128), "float16"]
+):
+    B = T.alloc_buffer([128, 128], dtype="float32")
+    A_global = T.alloc_buffer([128, 128], dtype="float32")
+    C_global = T.alloc_buffer([128, 128], dtype="float16")
+    const_2 = T.allocate_const([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8])
+    const_1 = T.buffer_decl([8], dtype="float32", data=const_2)
+    const_2_1 = T.buffer_decl([8], dtype="float32", data=const_2)
+    const2 = T.allocate_const([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8])
+    for ax0, ax1 in T.grid(128, 128):
+        with T.block("A_global"):
+            v0, v1 = T.axis.remap("SS", [ax0, ax1])
+            T.reads(A[v0, v1])
+            T.writes(A_global[v0, v1])
+            A_global[v0, v1] = A[v0, v1]
+    for i, j, x in T.grid(128, 128, 8):
+        with T.block("B"):
+            vi, vj, vx = T.axis.remap("SSS", [i, j, x])
+            T.reads(A_global[vi, vj], const_1[vx], const_2_1[vx])
+            T.writes(B[vi, vj])
+            B[vi, vj] = A_global[vi, vj] * const_1[vx] + const_2_1[vx]
+    for i, j in T.grid(128, 128):
+        with T.block("C"):
+            vi, vj = T.axis.remap("SS", [i, j])
+            T.reads(B[vi, vj])
+            T.writes(C_global[vi, vj])
+            C_global[vi, vj] = B[vi, vj] + T.float32(1)
+    for ax0, ax1 in T.grid(128, 128):
+        with T.block("C_global"):
+            v0, v1 = T.axis.remap("SS", [ax0, ax1])
+            T.reads(C_global[v0, v1])
+            T.writes(C[v0, v1])
+            C[v0, v1] = C_global[v0, v1]
+
+
 def test_cache_read_elementwise(use_block_name):
     sch = tir.Schedule(elementwise, debug_mask="all")
     block_b = sch.get_block("B")
@@ -1265,5 +1326,15 @@ def test_cache_write_fail_invalid_storage_scope(use_block_name):
         sch.cache_write(block_b, 0, "test_scope")
 
 
+def test_cache_write_allocate_const():
+    sch = tir.Schedule(cache_write_allocate_const)
+    block_b = sch.get_block("B")
+    block_c = sch.get_block("C")
+    sch.cache_read(block_b, 0, "global")
+    sch.cache_write(block_c, 0, "global")
+    tvm.ir.assert_structural_equal(cache_write_allocate_const_output, sch.mod["main"])
+    verify_trace_roundtrip(sch=sch, mod=cache_write_allocate_const)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From 94fd43f8f281ea55e0f9ad1c35112df3e86f7ad7 Mon Sep 17 00:00:00 2001
From: QuqqU
Date: Tue, 17 Jan 2023 08:21:42 +0900
Subject: [PATCH 180/286] [FIX] Fix Typos in Docs and Comments (#13793)

* [Fix] Fix Typo in relay/expr.h
* [Fix] Remove Duplicated Right Angle Bracket
* [Fix] Add WhiteSpace
* [Fix] Fix Parameter Values
---
 docs/dev/how_to/debugging_tvm.rst            | 2 +-
 gallery/tutorial/autotvm_relay_x86.py        | 2 +-
 gallery/tutorial/tvmc_command_line_driver.py | 2 +-
 include/tvm/relay/expr.h                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst
index 8e3161077053..9df54af4f691 100644
--- a/docs/dev/how_to/debugging_tvm.rst
+++ b/docs/dev/how_to/debugging_tvm.rst
@@ -53,7 +53,7 @@ optimization). To enable VLOGging, do the following:
    level assignments of the form ``<file_name>=<level>``. Here are some specializations:
 
    - The special filename ``DEFAULT`` sets the VLOG level setting for all files.
-   - ``<level>>`` can be set to ``-1`` to disable VLOG in that file.
+   - ``<level>`` can be set to ``-1`` to disable VLOG in that file.
    - ``<file_name>`` is the name of the c++ source file (e.g. ``.cc``, not ``.h``) relative to the
      ``src/`` directory in the TVM repo. You do not need to supply ``src/`` when specifying the file
      path, but if you do, VLOG will still interpret the path correctly.
diff --git a/gallery/tutorial/autotvm_relay_x86.py b/gallery/tutorial/autotvm_relay_x86.py
index b7e9cebb5d6a..ef8fa4a113c3 100644
--- a/gallery/tutorial/autotvm_relay_x86.py
+++ b/gallery/tutorial/autotvm_relay_x86.py
@@ -344,7 +344,7 @@
 # .. admonition:: Setting Tuning Parameters
 #
 #   In this example, in the interest of time, we set the number of trials and
-#   early stopping to 10. You will likely see more performance improvements if
+#   early stopping to 20 and 100. You will likely see more performance improvements if
 #   you set these values to be higher but this comes at the expense of time
 #   spent tuning. The number of trials required for convergence will vary
 #   depending on the specifics of the model and the target platform.
diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py
index 27302b721bc1..a462e24dc7b8 100644
--- a/gallery/tutorial/tvmc_command_line_driver.py
+++ b/gallery/tutorial/tvmc_command_line_driver.py
@@ -52,7 +52,7 @@
 # will vary depending on your platform and installation method.
 #
 # Alternatively, if you have TVM as a Python module on your
-# ``$PYTHONPATH``,you can access the command line driver functionality
+# ``$PYTHONPATH``, you can access the command line driver functionality
 # via the executable python module, ``python -m tvm.driver.tvmc``.
# # For simplicity, this tutorial will mention TVMC command line using diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index bd094a7f6905..6847a53caad4 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -265,7 +265,7 @@ class Var : public Expr { }; /*! - * \brief Returns \p vor with the given properties. A null property denotes 'no change'. + * \brief Returns \p var with the given properties. A null property denotes 'no change'. * Returns \p var if all properties are unchanged. Otherwise, returns a copy with the new * fields. */ From cd1fa03ef327b4c90ae4802f8c6c94dbca6a830c Mon Sep 17 00:00:00 2001 From: Michal Piszczek Date: Tue, 17 Jan 2023 02:41:26 +0100 Subject: [PATCH 181/286] [TOPI] Fix tuple unpack in conv2d NCHWc int8 (#13761) Fixes a tuple being incorrectly unpacked (too many elements expected) in a Conv2D TOPI int8 op, similarly to https://github.com/apache/tvm/pull/13566 which addressed the issue specifically in the ARM implementation (which calls the function modified in this PR). This explicit unpacking helps improve flexibility with other incoming layouts. --- python/tvm/topi/nn/conv2d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 0485a17e98f5..a3afc2590256 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -515,9 +515,9 @@ def conv2d_NCHWc_int8( n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn - oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple( - kernel.shape - ) + oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)[ + :6 + ] groups = ic_chunk // ic_chunk_group dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 From 174277f8c3a3620f520db8fb0fd53d2c8165b93b Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Tue, 17 Jan 2023 20:24:30 +0800 Subject: [PATCH 182/286] [FIX] Minor Compilation Warning Fixes (#13794) This PR fixes some warnings from the latest clang compiler. --- include/tvm/tir/function.h | 8 ++++---- src/tir/analysis/control_flow_graph.h | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index cf92f97360b1..9f7c0fa16b06 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -224,14 +224,14 @@ class TensorIntrin : public ObjectRef { TVM_DEFINE_OBJECT_REF_METHODS(TensorIntrin, ObjectRef, TensorIntrinNode) }; -/* +/*! * \brief Specialize parameters of PrimFunc. * \param func The PrimFunc to be specialized. * \param param_map The mapping from function params to the instance. * \return The new function with parameter specialized. * \note We can define a Meta TIR function with symbolic shape: * - * \code + * \code{.py} * @T.prim_func * def mem_copy(a: T.handle, b: T.handle, m: T.int32, n: T.int32) -> None: * A = T.match_buffer(a, (m, n), "float32") @@ -244,14 +244,14 @@ class TensorIntrin : public ObjectRef { * * Then we can make it specialized with given shapes or buffers. 
 *
- * \code
+ * \code{.py}
 *    a, _, m, n = mem_copy.params
 *    func = mem_copy.specialize({a: tir.decl_buffer((16, 16))})
 *    # or
 *    func = mem_copy.specialize({n: 16, m: 16})
 * \endcode
 *
- * \code {.language-id}
+ * \code{.py}
 *    @T.prim_func
 *    def mem_copy_16_16(a: T.handle, b: T.handle) -> None:
 *        A = T.match_buffer(a, (16, 16), "float32")
diff --git a/src/tir/analysis/control_flow_graph.h b/src/tir/analysis/control_flow_graph.h
index 590392cf658a..00a6b68ff945 100644
--- a/src/tir/analysis/control_flow_graph.h
+++ b/src/tir/analysis/control_flow_graph.h
@@ -292,7 +292,8 @@ class BufferState {
   std::vector<BufferTouch> constraints_;
 };
 
-/*! \brief Represents the flow of control through a `tir::Stmt`
+/*!
+ * \brief Represents the flow of control through a `tir::Stmt`
  *
  * This class contains an internal representation of the possible
  * control flow that may occur during execution of a `tir::Stmt`. It
@@ -312,7 +313,7 @@ class BufferState {
 *
 * For example, consider the following PrimFunc
 *
- * ```python
+ * \code{.py}
 * @T.prim_func
 * def func(T.Buffer[16, "float32"]):
 *     for i in T.serial(16):
 *         if i < 8:
 *             B[i] = i
 *         else:
 *             B[i] = i-8
- * ```
+ * \endcode
 *
 * The control flow graph would have eight control blocks.
 *

From bd0d605afa95394952a4d5d57c09295ace5c2f3a Mon Sep 17 00:00:00 2001
From: Noah Verke
Date: Tue, 17 Jan 2023 11:15:04 -0800
Subject: [PATCH 183/286] [Hexagon] Add hexagon user DMA intrins for
 tensorization (#13719)

Added some intrinsics for user DMA on Hexagon. Currently these seem to
perform worse than all other options used in the test.
---
 python/tvm/tir/tensor_intrin/hexagon.py      | 54 +++++++++++++++
 .../test_hexagon/test_vtcm_bandwidth.py      | 46 +++++++-------
 2 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py
index 5e5749055bb0..7a348f3f1a45 100644
--- a/python/tvm/tir/tensor_intrin/hexagon.py
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -20,6 +20,54 @@
 from .. import TensorIntrin
 
 
+def generate_dma_load_intrin(
+    size: int,
+    dtype: str,
+):
+    """Generator of dma_load intrins"""
+
+    @T.prim_func
+    def sync_dma_load_desc(a: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global")
+        C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm")
+        with T.block("root"):
+            T.reads(A[0:size])
+            T.writes(C[0:size])
+            for i in T.serial(size):
+                with T.block("load"):
+                    vii = T.axis.remap("S", [i])
+                    C[vii] = A[vii]
+
+    @T.prim_func
+    def sync_dma_load_impl(a: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, (size), dtype, offset_factor=1, scope="global")
+        C = T.match_buffer(c, (size), dtype, offset_factor=1, scope="global.vtcm")
+        with T.block("root"):
+            T.reads(A[0:size])
+            T.writes(C[0:size])
+            T.evaluate(
+                T.tvm_call_packed(
+                    "device_api.hexagon.dma_copy",
+                    -1,  # Use QueueId of -1 to not interfere with async copies.
+                    T.address_of(C[0], dtype="handle"),
+                    T.address_of(A[0], dtype="handle"),
+                    size,
+                    0,  # Do not use experimental bypass mode.
+                    dtype="int32",
+                )
+            )
+            T.evaluate(
+                T.tvm_call_packed(
+                    "device_api.hexagon.dma_wait",
+                    -1,
+                    0,  # Wait for the sync queue (-1) to have 0 messages.
+ dtype="int32", + ) + ) + + return sync_dma_load_desc, sync_dma_load_impl + + def generate_dot_product_32x4_u8u8i32(mem_scope="global"): @T.prim_func def dot_product_32x4_u8u8i32_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @@ -163,3 +211,9 @@ def dot_product_32x2_i16i16i32_vdmpy(a: T.handle, b: T.handle, c: T.handle) -> N VRMPY_u8i8i32_VTCM_INTRIN = "dot_32x4_u8i8i32_vtcm_vrmpy" TensorIntrin.register(VRMPY_u8i8i32_VTCM_INTRIN, *generate_dot_product_32x4_u8i8i32("global.vtcm")) + +DMA_READ_128_u8 = "dma_read_128_u8" +TensorIntrin.register(DMA_READ_128_u8, *generate_dma_load_intrin(128, "uint8")) + +DMA_READ_128_i8 = "dma_read_128_i8" +TensorIntrin.register(DMA_READ_128_i8, *generate_dma_load_intrin(128, "int8")) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 254eb00cb2ea..53d0428a5ad1 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -20,7 +20,9 @@ import numpy as np import tvm +import pytest from tvm.script import tir as T +from tvm.tir.tensor_intrin.hexagon import DMA_READ_128_i8 from .infrastructure import get_hexagon_target @@ -30,6 +32,7 @@ "Test bandwidth with buffer size {}MB... \n" " -Base: {} GBps \n -Vectorized: {} GBps\n" " -Vectorized and Parallelized: {} GBps\n" + " -Sync DMA: {} GBps\n" " -Single DMA Copy: {} GBps\n" ) @@ -103,13 +106,12 @@ def evaluate(hexagon_session, sch, size): a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. - number = 1 - repeat = 1 + if tvm.testing.utils.IS_IN_CI: + # Run with reduced number and repeat for CI + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=1, repeat=1) + else: + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=10, repeat=10) - timer = module.time_evaluator( - "__tvm_main__", hexagon_session.device, number=number, repeat=repeat - ) runtime = timer(a_hexagon, a_vtcm_hexagon) gbps = round((size / 2**30) / runtime.mean, 4) @@ -123,18 +125,11 @@ class TestMatMulVec: # Removed most of these to speedup CI. size = tvm.testing.parameter( - # 10 * KB, - # 20 * KB, - # 40 * KB, - # 80 * KB, - # 160 * KB, - # 320 * KB, - 640 * KB, - # MB, - # 2 * MB, - # 3 * MB, - # 4 * MB, - # 8 * MB, # Only works on 8gen1 HDKs + 128, + KB, + 10 * KB, + 100 * KB, + MB, ) outer_split = tvm.testing.parameter(4) @@ -144,6 +139,10 @@ class TestMatMulVec: @tvm.testing.requires_hexagon def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): """Test bandwidth.""" + + if tvm.testing.utils.IS_IN_CI and (size > 128): + pytest.skip("Skipping test since it takes too long in CI.") + # Run the base memcopy operator. sch = tvm.tir.Schedule(memcopy_operator(size)) base_gpbs = evaluate(hexagon_session, sch, size) @@ -169,6 +168,15 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto sch.parallel(vbo_a) parallel_gbps = evaluate(hexagon_session, sch, size) + # Run with some basic unroll and vectorize scheduling and parallelization. + sch = tvm.tir.Schedule(memcopy_operator(size)) + block = sch.get_block("A_global.vtcm") + loops = sch.get_loops(block) + _, inner = sch.split(loops[0], [None, 128]) + sch.tensorize(inner, DMA_READ_128_i8) + # print(sch.mod.script()) + sync_dma_gbps = evaluate(hexagon_session, sch, size) + # Run using a single dma copy to transfer the data. 
sch = tvm.tir.Schedule(single_dma_operator(size)) single_dma_gbps = evaluate(hexagon_session, sch, size) @@ -176,7 +184,7 @@ def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vecto mbs = round(size / MB, 2) print( TEST_OUTPUT_TEMPLATE.format( - mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps + mbs, base_gpbs, vectorize_gbps, parallel_gbps, sync_dma_gbps, single_dma_gbps ) ) From a466614b21ff13661f924805eb1a746c101c97c4 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Tue, 17 Jan 2023 19:16:37 +0000 Subject: [PATCH 184/286] [CMSIS-NN] Remove support for the old CMSIS NN project (#13760) Pr: #13656 adds support for the new CMSIS NN project After the docker image is updated we can remove support for the old CMSIS NN project. --- apps/microtvm/cmsisnn/Makefile | 9 +--- apps/microtvm/ethosu/Makefile | 9 +--- .../template_project/CMakeLists.txt.template | 26 ++++------- apps/microtvm/zephyr_cmsisnn/CMakeLists.txt | 46 ++++++++----------- .../ubuntu_install_ethosu_driver_stack.sh | 3 -- tests/micro/zephyr/test_zephyr.py | 2 +- tests/python/relay/aot/corstone300.mk | 9 +--- 7 files changed, 35 insertions(+), 69 deletions(-) diff --git a/apps/microtvm/cmsisnn/Makefile b/apps/microtvm/cmsisnn/Makefile index e7d1b7081d54..2fc4d4fa06c9 100644 --- a/apps/microtvm/cmsisnn/Makefile +++ b/apps/microtvm/cmsisnn/Makefile @@ -31,11 +31,6 @@ CMAKE ?= cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib -ifeq ($(shell [ -d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0) - CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN -else - CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN -endif PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ @@ -43,7 +38,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${CORSTONE_300_PATH} \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ - -I${CMSIS_NN_PATH}/Include \ + -I${CMSIS_PATH}/CMSIS-NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include CMSIS_NN_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ @@ -61,7 +56,7 @@ DEMO_MAIN = src/demo_bare_metal.c CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) -CMSIS_NN_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/*/*.c) +CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/*/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo diff --git a/apps/microtvm/ethosu/Makefile b/apps/microtvm/ethosu/Makefile index 63f8adbc2790..630a2082473d 100644 --- a/apps/microtvm/ethosu/Makefile +++ b/apps/microtvm/ethosu/Makefile @@ -32,11 +32,6 @@ CMAKE ?= cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib -ifeq ($(shell [ -d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0) - CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN -else - CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN -endif PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ @@ -45,7 +40,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${ETHOSU_PATH}/core_driver/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ - -I${CMSIS_NN_PATH}/Include \ + -I${CMSIS_PATH}/CMSIS-NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include \ 
-DETHOSU_TEST_RUNNER_TOL=${ETHOSU_TEST_RUNNER_TOL} @@ -83,7 +78,7 @@ endif CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) -CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/*.c) +CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/SoftmaxFunctions/*.c) UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template index 17200f7e9704..1aff9ece6bfa 100644 --- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template +++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template @@ -31,27 +31,21 @@ find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) project(microtvm_autogenerated_project) if(DEFINED CMSIS_PATH) - if (EXISTS ${CMSIS_PATH}/CMSIS-NN) - set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS-NN) - else() - set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS/NN) - endif() - file(GLOB_RECURSE cmsis_lib_srcs - ${CMSIS_NN_PATH}/Source/ActivationFunctions/*.c - ${CMSIS_NN_PATH}/Source/BasicMathFunctions/*.c - ${CMSIS_NN_PATH}/Source/ConcatenationFunctions/*.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/*.c - ${CMSIS_NN_PATH}/Source/FullyConnectedFunctions/*.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/*.c - ${CMSIS_NN_PATH}/Source/PoolingFunctions/*.c - ${CMSIS_NN_PATH}/Source/ReshapeFunctions/*.c - ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/ActivationFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/BasicMathFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConcatenationFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/FullyConnectedFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/PoolingFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/ReshapeFunctions/*.c + ${CMSIS_PATH}/CMSIS-NN/Source/SoftmaxFunctions/*.c ) set(cmsis_includes ${CMSIS_PATH}/CMSIS/Core/Include - ${CMSIS_NN_PATH}/Include + ${CMSIS_PATH}/CMSIS-NN/Include ${CMSIS_PATH}/CMSIS/DSP/Include ${CMSIS_PATH}/CMSIS/DSP/Include/dsp ) diff --git a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt index 9dec75dc5030..a68370099515 100644 --- a/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt +++ b/apps/microtvm/zephyr_cmsisnn/CMakeLists.txt @@ -51,34 +51,24 @@ set(DATA_FILES ${CMAKE_CURRENT_BINARY_DIR}/outputs.c ${CMAKE_CURRENT_BINARY_DIR}/labels.c ) - -if (EXISTS ${CMSIS_PATH}/CMSIS-NN) - set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS-NN) - set(CMSIS_NN_ADDITIONAL_SOURCES - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c - ) -else() - set(CMSIS_NN_PATH ${CMSIS_PATH}/CMSIS/NN) -endif() - set(CMSIS_SOURCES - ${CMSIS_NN_PATH}/Source/SoftmaxFunctions/arm_softmax_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c - 
${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_convolve_s8.c - ${CMSIS_NN_PATH}/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c - ${CMSIS_NN_PATH}/Source/FullyConnectedFunctions/arm_fully_connected_s8.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c - ${CMSIS_NN_PATH}/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c - ${CMSIS_NN_ADDITIONAL_SOURCES} + ${CMSIS_PATH}/CMSIS-NN/Source/SoftmaxFunctions/arm_softmax_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_convolve_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c + ${CMSIS_PATH}/CMSIS-NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_1x_s8.c ) add_custom_command( @@ -105,5 +95,5 @@ target_sources(app PRIVATE target_include_directories(app PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/runtime/include ${CMAKE_CURRENT_BINARY_DIR}/codegen/host/include - PUBLIC ${CMSIS_NN_PATH}/Include/ ${CMSIS_PATH}/CMSIS/DSP/Include + PUBLIC ${CMSIS_PATH}/CMSIS-NN/Include/ ${CMSIS_PATH}/CMSIS/DSP/Include ) diff --git a/docker/install/ubuntu_install_ethosu_driver_stack.sh b/docker/install/ubuntu_install_ethosu_driver_stack.sh index 0fb35b13e797..8bc6b733edc8 100755 --- a/docker/install/ubuntu_install_ethosu_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosu_driver_stack.sh @@ -87,7 +87,4 @@ make # Build NN Library mkdir ${CMSIS_PATH}/CMSIS-NN/build/ && cd ${CMSIS_PATH}/CMSIS-NN/build/ cmake .. -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55 -DBUILD_CMSIS_NN_FUNCTIONS=YES -DCMSIS_PATH=${CMSIS_PATH} - -mkdir ${CMSIS_PATH}/CMSIS/NN/build/ && cd ${CMSIS_PATH}/CMSIS/NN/build/ -cmake .. 
-DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55 -DBUILD_CMSIS_NN_FUNCTIONS=YES
 make
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py
index a053c905aa34..f86f4a7a7f3f 100644
--- a/tests/micro/zephyr/test_zephyr.py
+++ b/tests/micro/zephyr/test_zephyr.py
@@ -624,7 +624,7 @@ def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_deb
     assert "CMSIS/DSP/Include" in cmake_content
     assert "CMSIS/DSP/Include/dsp" in cmake_content
     assert "CMSIS/DSP/Include" in cmake_content
-    # assert "CMSIS-NN/Include" in cmake_content
+    assert "CMSIS-NN/Include" in cmake_content


 if __name__ == "__main__":
diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk
index ebda50d9cfef..45d93ab493ed 100644
--- a/tests/python/relay/aot/corstone300.mk
+++ b/tests/python/relay/aot/corstone300.mk
@@ -48,11 +48,6 @@ CC = arm-none-eabi-gcc
 AR = arm-none-eabi-ar
 RANLIB = arm-none-eabi-ranlib
 CC_OPTS = CC=$(CC) AR=$(AR) RANLIB=$(RANLIB)
-ifeq ($(shell [ -d ${CMSIS_PATH}/CMSIS-NN ]; echo $$?), 0)
-  CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS-NN
-else
-  CMSIS_NN_PATH = ${CMSIS_PATH}/CMSIS/NN
-endif
 PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
 	${CFLAGS} \
 	-I$(build_dir)/../include \
@@ -62,7 +57,7 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
 	-I${DRIVER_PATH}/include \
 	-I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \
 	-I${CMSIS_PATH}/CMSIS/Core/Include \
-	-I${CMSIS_NN_PATH}/Include \
+	-I${CMSIS_PATH}/CMSIS-NN/Include \
 	-I${CMSIS_PATH}/CMSIS/DSP/Include \
 	-isystem$(STANDALONE_CRT_DIR)/include
 DRIVER_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(ETHOSU_TEST_ROOT)/arm-none-eabi-gcc.cmake \
@@ -83,7 +78,7 @@ CC_CODEGEN_SRCS = $(shell find $(abspath $(CODEGEN_ROOT)/host/src/*.cc))
 C_CODEGEN_OBJS = $(subst .c,.o,$(C_CODEGEN_SRCS))
 CC_CODEGEN_OBJS = $(subst .cc,.o,$(CC_CODEGEN_SRCS))
 CMSIS_STARTUP_SRCS = $(shell find ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c)
-CMSIS_NN_SRCS = $(shell find ${CMSIS_NN_PATH}/Source/*/*.c)
+CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/*/*.c)
 UART_SRCS = $(shell find ${PLATFORM_PATH}/*.c)

 ifdef ETHOSU_TEST_ROOT

From 3ef5a2f4d6832fae6174d716136aae75a61ec943 Mon Sep 17 00:00:00 2001
From: Valery Chernov
Date: Wed, 18 Jan 2023 01:26:28 +0400
Subject: [PATCH 185/286] [TIR][TOPI][x86][CI] Support skylake avx512 (#13621)

* add skylake-avx512 tests
* extend tests by skylake-avx512
* lint fixes
* fix misprinting
* misprinting fix
* TODOs for further development
* add temporarily commented tests for skylake-avx512 due to not yet implemented schedules and postprocs for it. add TODOs for further check and development
* update int8-acc32 test for vnni and avx512 w/o it
* pylint fix
* once more pylint fix
* fix Feature init for skylake
* fix test
* fix intrin names for assert for skylake
* small fix
* return back fast int8 intrinsic tests
* test connect of dense and batch_matmul to avx512 tensorization
* extend dense_alter_layout on avx512 (currently) instead of VNNI. some renaming vnni to int8 for the sake of clarity
* more renaming vnni to int8 for dense schedule, compute, strategy for the sake of clarity
* update for batch_matmul with avx512
* extend space generator init for avx512.
Add Default AVX512 schedule rules * avx512 dot 16x4 intrin was implemented for MS default schedule rule * small fix * update * pylint fixes * test workaround for const alloc in tir * test fix (broadcasting) * remove excess instructions from dot_product_16x4_u8i8i32_avx512 * pylint fix * skip asm check for askew weight shapes * fix pylint * revert test fix * set number of args * test fix * fix const allocation in tir for avx512 dot 16x4 * fix signature of dot_product_16x4_u8i8i32_avx512 * use script instead of tvm.tir for const allocation * extend auto tensorize test by skylake-avx512 target * clean code * update test_op_level1, resolve TODO * small update test_op_level2 * update test_op_level10, resolve TODO * update qnn legalize pass test, resolve TODOs * pylint fixes * update ms test for avx512 * update more ms test for avx512 * try to fix i386 CI tests * fix intrin name for check * skip test due to model downloading issue * fix test failure * use ORT for conv2d check * lint fix after rebasing * comment ORT part of test * extend tests tir schedule analysis and transform for avx512. unify test classes * extend test tir schedule tensorize for avx512 * extend test meta schedule vnni integration for avx512 * rename test file * pylint fix * tag fix * update test meta schedule trace apply with avx512 * rollback test class unifying in utils * pylint fixes * separate TIRs for scheduled conv2d for vnni and avx512 * fix registering issue in test * update conv+bias onnx model for intermediate test * fix int16 overflow * fix int16 overflow for dense test * update input data for test of dense * small rollback * fix misprinting * fix * restart CI * DefaultVNNI was renamed to DefaultLLVM for mutator * rename test file for the sake of clarity * DefaultVNNI was renamed to DefaultCPUTensorization for postproc * remove resolved TODO * DefaultVNNI and AVX512 for ScheduleRule were unified * replace code to upstream with initial version * fix arg type * lint fix * small fix * lint fix * fix misprinting * rollback trace apply test for avx512 (reviewer remark) * fix pylint Co-authored-by: Valery Chernov --- include/tvm/meta_schedule/mutator.h | 2 - include/tvm/meta_schedule/postproc.h | 4 +- include/tvm/meta_schedule/schedule_rule.h | 4 +- python/tvm/relay/qnn/op/legalizations.py | 4 +- python/tvm/testing/utils.py | 29 ++++ python/tvm/tir/tensor_intrin/x86.py | 40 +++++ python/tvm/topi/x86/batch_matmul.py | 25 +-- python/tvm/topi/x86/dense.py | 19 ++- python/tvm/topi/x86/dense_alter_op.py | 18 +- src/meta_schedule/mutator/mutator.cc | 2 - src/meta_schedule/postproc/postproc.cc | 2 +- .../schedule_rule/schedule_rule.cc | 6 +- .../space_generator/space_generator.cc | 19 ++- tests/python/contrib/test_gemm_acc32_vnni.py | 160 +++++++++--------- .../python/integration/test_auto_tensorize.py | 136 +++++++++------ tests/python/relay/test_op_level1.py | 24 ++- tests/python/relay/test_op_level10.py | 45 +++-- tests/python/relay/test_op_level2.py | 24 ++- tests/python/relay/test_pass_qnn_legalize.py | 26 +-- ... 
=> test_meta_schedule_cpu_dot_product.py} | 62 ++++--- .../test_meta_schedule_relay_integration.py | 19 ++- ..._meta_schedule_schedule_rule_mlt_intrin.py | 23 +-- .../test_meta_schedule_trace_apply.py | 8 +- .../unittest/test_tir_schedule_analysis.py | 15 +- .../unittest/test_tir_schedule_tensorize.py | 14 +- .../unittest/test_tir_schedule_transform.py | 38 +++-- 26 files changed, 485 insertions(+), 283 deletions(-) rename tests/python/unittest/{test_meta_schedule_vnni_integration.py => test_meta_schedule_cpu_dot_product.py} (83%) diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index 498b2797ada5..1560c00f3907 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -131,8 +131,6 @@ class Mutator : public runtime::ObjectRef { FApply f_apply, FClone f_clone, FAsString f_as_string); /*! \brief Create default mutators for LLVM */ TVM_DLL static Map DefaultLLVM(); - /*! \brief Create default mutators for x86 VNNI */ - TVM_DLL static Map DefaultVNNI(); /*! \brief Create default mutators for CUDA */ TVM_DLL static Map DefaultCUDA(); /*! \brief Create default mutators for CUDA with TensorCore */ diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 06fa086c4bca..85fb9003e87f 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -163,8 +163,8 @@ class Postproc : public runtime::ObjectRef { TVM_DLL static Postproc RewriteLayout(); /*! \brief Create default postprocessors for LLVM */ TVM_DLL static Array DefaultLLVM(); - /*! \brief Create default postprocessors for x86 VNNI */ - TVM_DLL static Array DefaultVNNI(); + /*! \brief Create default postprocessors for x86 (AVX512 and VNNI) */ + TVM_DLL static Array DefaultCPUTensorization(); /*! \brief Create default postprocessors for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! \brief Create default postprocessors for CUDA with TensorCore */ diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 16202e18bf95..7995d1fceeb6 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -290,8 +290,8 @@ class ScheduleRule : public runtime::ObjectRef { /*! \brief Create default schedule rules for LLVM */ TVM_DLL static Array DefaultLLVM(); - /*! \brief Create default schedule rules for x86 VNNI */ - TVM_DLL static Array DefaultVNNI(); + /*! \brief Create default schedule rules for x86 (AVX512 and VNNI) */ + TVM_DLL static Array DefaultX86(const String& type); /*! \brief Create default schedule rules for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! \brief Create default postprocessors for CUDA with TensorCore */ diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 9baabf36a9d8..ef368a016e0c 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -248,7 +248,7 @@ def helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay_op): Replacing QA + 128 with QA' and (zp_a + 128) with zp_a' We get our new quantized uint8 tensor - scale * (QA' - zp_a') - Similarly we can convert from int8 to uint8. + Similarly we can convert from uint8 to int8. 
Parameters ---------- @@ -449,6 +449,7 @@ def _qnn_dense_legalize_arm_cpu(attrs, inputs, types): @qnn_conv2d_legalize.register("cpu") def _qnn_conv2d_legalize_intel_cpu(attrs, inputs, types): + # TODO(vvchernov): not only VNNI # The VNNI transformations prefer uint8 x int8 datatypes. if is_fast_int8_on_intel(): return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.conv2d) @@ -457,6 +458,7 @@ def _qnn_conv2d_legalize_intel_cpu(attrs, inputs, types): @qnn_dense_legalize.register("cpu") def _qnn_dense_legalize_intel_cpu(attrs, inputs, types): + # TODO(vvchernov): not only VNNI # The VNNI transformations prefer uint8 x int8 datatypes. if is_fast_int8_on_intel(): return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.dense) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 899b05440388..19669cd60cf4 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1027,6 +1027,28 @@ def _has_vnni(): return False +# check avx512 intrinsic groups for SkyLake X +def _has_slavx512(): + # Check LLVM support + llvm_version = tvm.target.codegen.llvm_version_major() + is_llvm_support = llvm_version >= 8 + arch = platform.machine() + # Only linux is supported for now. + if arch == "x86_64" and sys.platform.startswith("linux"): + with open("/proc/cpuinfo", "r") as content: + ctx = content.read() + check = ( + "avx512f" in ctx + and "avx512cd" in ctx + and "avx512bw" in ctx + and "avx512dq" in ctx + and "avx512vl" in ctx + ) + return check and is_llvm_support + + return False + + requires_arm_dot = Feature("arm_dot", "ARM dot product", run_time_check=_arm_dot_supported) @@ -1035,6 +1057,13 @@ def _has_vnni(): ) +requires_skylake_avx512 = Feature( + "skylake_avx512", + "x86 SkyLake AVX512", + run_time_check=lambda: _has_slavx512() and _is_intel(), +) + + def _cmake_flag_enabled(flag): flag = tvm.support.libinfo()[flag] diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index d93167f9e614..c527d0d21008 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -67,8 +67,48 @@ def dot_product_16x4_u8i8i32_vnni( ) +@T.prim_func +def dot_product_16x4_u8i8i32_avx512( + A: T.Buffer((4,), "uint8", offset_factor=1), + B: T.Buffer((16, 4), "int8", offset_factor=1), + C: T.Buffer((16,), "int32", offset_factor=1), +) -> None: + with T.block("root"): + T.reads(C[0:16], A[0:4], B[0:16, 0:4]) + T.writes(C[0:16]) + + A_u8x4 = A.vload([0], "uint8x4") + A_i32 = T.reinterpret(A_u8x4, dtype="int32") + A_brdcst = T.broadcast(A_i32, 16) + A_u8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") + + B_i8x64 = B.vload([0, 0], dtype="int8x64") + + Red = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512"), + T.uint32(2), + A_u8x64, + B_i8x64, + dtype="int16x32", + ) + + C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), + T.uint32(2), + Red, + T.int16x32(1), + dtype="int32x16", + ) + + VNNI_DOT_16x4_INTRIN = "dot_16x4_vnni" TensorIntrin.register( VNNI_DOT_16x4_INTRIN, dot_product_16x4_u8i8i32_desc, dot_product_16x4_u8i8i32_vnni ) + +AVX512_DOT_16x4_INTRIN = "dot_16x4_avx512" + +TensorIntrin.register( + AVX512_DOT_16x4_INTRIN, dot_product_16x4_u8i8i32_desc, dot_product_16x4_u8i8i32_avx512 +) diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 9f3bc2951524..95408a924f28 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ 
b/python/tvm/topi/x86/batch_matmul.py @@ -25,12 +25,12 @@ from .. import generic, nn from ..transform import layout_transform from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline -from .dense import dense_vnni_schedule, dense_amx_int8_schedule +from .dense import dense_int8_schedule, dense_amx_int8_schedule from .injective import schedule_injective_from_existing -from .utils import target_has_vnni, target_has_amx +from .utils import target_has_avx512, target_has_amx -@autotvm.register_topi_compute("batch_matmul_vnni.x86") +@autotvm.register_topi_compute("batch_matmul_int8.x86") def batch_matmul_int8_compute(cfg, x, y, *_): """Compute for uint8 x int8 -> int32 batch_matmul""" batch, m, k = x.shape @@ -39,8 +39,8 @@ def batch_matmul_int8_compute(cfg, x, y, *_): _, n_o, _, n_i, _ = packed_y.shape ak = te.reduce_axis((0, k), name="k") mcpu = tvm.target.Target.current().mcpu - if target_has_vnni(mcpu): - attrs_info = {"schedule_rule": "batch_matmul_vnni"} + if target_has_avx512(mcpu): + attrs_info = {"schedule_rule": "batch_matmul_int8"} else: attrs_info = None @@ -60,13 +60,14 @@ def batch_matmul_int8_compute(cfg, x, y, *_): return z -def batch_matmul_vnni_schedule(cfg, s, C, O, layout_trans): - """Schedule batch_matmul compute using VNNI vpdpbusd instruction""" +def batch_matmul_int8_schedule(cfg, s, C, O, layout_trans): + """Schedule batch_matmul compute using avx512 or lower instructions + including VNNI vpdpbusd instruction if possible""" # C: The output of batched GEMM # O: The output of the fused op # Schedule the GEMM part - s, fused_inner = dense_vnni_schedule(cfg, s, C, O, do_parallel=False) + s, fused_inner = dense_int8_schedule(cfg, s, C, O, do_parallel=False) # Parallelize over batch fused = s[O].fuse(O.op.axis[0], fused_inner) s[O].parallel(fused) @@ -228,9 +229,9 @@ def _callback(op): return s -@autotvm.register_topi_schedule("batch_matmul_vnni.x86") +@autotvm.register_topi_schedule("batch_matmul_int8.x86") def schedule_batch_matmul_int8(cfg, outs): - """Schedule for batch_matmul_vnni""" + """Schedule for batch_matmul_int8""" s = te.create_schedule([x.op for x in outs]) mcpu = tvm.target.Target.current().mcpu @@ -239,8 +240,8 @@ def _callback(op): layout_trans = op.input_tensors[1] if target_has_amx(mcpu): batch_matmul_amx_schedule(cfg, s, op.output(0), outs[0], layout_trans) - elif target_has_vnni(mcpu): - batch_matmul_vnni_schedule(cfg, s, op.output(0), outs[0], layout_trans) + elif target_has_avx512(mcpu): + batch_matmul_int8_schedule(cfg, s, op.output(0), outs[0], layout_trans) traverse_inline(s, outs[0].op, _callback) return s diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index bb99a632811b..b697cf98a625 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -26,10 +26,10 @@ from .. 
import generic, tag from ..utils import get_const_tuple, traverse_inline -from .tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake +from .tensor_intrin import dot_16x1x16_uint8_int8_int32 from .tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids from .tensor_intrin import acc_32x32_int32_sapphirerapids -from .utils import get_simd_32bit_lanes, target_has_vnni, target_has_amx +from .utils import get_simd_32bit_lanes, target_has_avx512, target_has_amx def _schedule_dense_pack_template(cfg, s, C, O): @@ -302,8 +302,8 @@ def _callback(op): if "dense_int8" in op.tag: if target_has_amx(mcpu): dense_amx_int8_schedule(cfg, s, op.output(0), outs[0]) - elif target_has_vnni(mcpu): - dense_vnni_schedule(cfg, s, op.output(0), outs[0]) + elif target_has_avx512(mcpu): + dense_int8_schedule(cfg, s, op.output(0), outs[0]) traverse_inline(s, outs[0].op, _callback) return s @@ -315,8 +315,8 @@ def dense_int8_compute(cfg, X, packed_w, bias=None): n_o, _, n_i, _ = packed_w.shape ak = te.reduce_axis((0, k), name="k") mcpu = tvm.target.Target.current().mcpu - if target_has_vnni(mcpu): - target_attr = {"schedule_rule": "meta_schedule.x86.dense_vnni"} + if target_has_avx512(mcpu): + target_attr = {"schedule_rule": "meta_schedule.x86.dense_int8"} else: target_attr = None @@ -339,8 +339,9 @@ def dense_int8_compute(cfg, X, packed_w, bias=None): return C -def dense_vnni_schedule(cfg, s, C, O, do_parallel=True): - """Schedule dense compute using VNNI vpdpbusd instruction""" +def dense_int8_schedule(cfg, s, C, O, do_parallel=True): + """Schedule dense compute using avx512 or lower instructions + including VNNI vpdpbusd instruction if possible""" # C: The output of GEMM # O: The output of the fused op def split_y(out): @@ -361,7 +362,7 @@ def split_y(out): s[C].reorder(a_yo, a_xo, a_yi, a_ko, a_xi, a_ki) - pc = dot_16x1x16_uint8_int8_int32_cascadelake() + pc = dot_16x1x16_uint8_int8_int32() s[C].tensorize(a_xi, pc) if C == O: diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index 2cb46b8291fb..a380b7fc9ff7 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -24,14 +24,14 @@ from .dense import _default_dense_pack_config from ..utils import get_const_tuple from ..nn import dense_alter_layout -from .utils import target_has_vnni -from .utils import target_has_amx +from .utils import target_has_avx512, target_has_amx from .. import nn -def check_inst_applicable(x, y, allow_padding=False): +def check_int8_applicable(x, y, allow_padding=False): mcpu = tvm.target.Target.current().mcpu - simd_avai = target_has_vnni(mcpu) or target_has_amx(mcpu) + # TODO(vvchernov): may be also target_has_avx2 or lower? 
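
The rewritten predicate, whose body continues just below, keys entirely off the current target's mcpu string. A quick standalone probe of the two predicates it combines; target_has_avx512 is added by this patch, and the exact truth values per mcpu are an assumption for illustration:

import tvm
from tvm.topi.x86.utils import target_has_amx, target_has_avx512

for mcpu in ["skylake-avx512", "cascadelake", "sapphirerapids", "core-avx2"]:
    with tvm.target.Target(f"llvm -mcpu={mcpu}"):
        cur = tvm.target.Target.current().mcpu
        # AVX512 targets (with or without VNNI) and AMX targets qualify;
        # plain AVX2 does not, which is what the TODO above is about.
        print(mcpu, target_has_avx512(cur) or target_has_amx(cur))
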
+ simd_avai = target_has_avx512(mcpu) or target_has_amx(mcpu) return ( simd_avai and "int8" in x.dtype @@ -49,7 +49,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): M, K = get_const_tuple(data_tensor.shape) N, _ = get_const_tuple(weight_tensor.shape) - if check_inst_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8": + if check_int8_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8": weight_layout = "NC16n4c" return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype) @@ -86,10 +86,10 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): return None -def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False): +def int8_int8_legalize(inputs, arg_types, op, attrs, need_expand=False): """Legalizes s8, s8 -> s32 GEMM op for VNNI.""" if ( - check_inst_applicable(arg_types[0], arg_types[1], allow_padding=True) + check_int8_applicable(arg_types[0], arg_types[1], allow_padding=True) and arg_types[0].dtype == "int8" ): x, y = inputs @@ -135,7 +135,7 @@ def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False): @nn.dense_legalize.register("cpu") def _dense_legalize(attrs, inputs, arg_types): """Legalizes s8, s8 -> s32 dense for VNNI.""" - return vnni_legalize(inputs, arg_types, relay.nn.dense, attrs) + return int8_int8_legalize(inputs, arg_types, relay.nn.dense, attrs) @nn.batch_matmul_legalize.register("cpu") @@ -143,4 +143,4 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): """Legalizes s8, s8 -> s32 batch_matmul for VNNI.""" if attrs["transpose_a"] or not attrs["transpose_b"]: return None - return vnni_legalize(inputs, arg_types, relay.nn.batch_matmul, attrs, need_expand=True) + return int8_int8_legalize(inputs, arg_types, relay.nn.batch_matmul, attrs, need_expand=True) diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc index 3cf43e11260e..ddc2d73590f9 100644 --- a/src/meta_schedule/mutator/mutator.cc +++ b/src/meta_schedule/mutator/mutator.cc @@ -59,8 +59,6 @@ Map Mutator::DefaultLLVM() { {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}}; } -Map Mutator::DefaultVNNI() { return Mutator::DefaultLLVM(); } - Map Mutator::DefaultCUDA() { return Map{ {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)}, diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc index 7730e4372fa9..bcd0cef4dd69 100644 --- a/src/meta_schedule/postproc/postproc.cc +++ b/src/meta_schedule/postproc/postproc.cc @@ -59,7 +59,7 @@ Array Postproc::DefaultLLVM() { }; } -Array Postproc::DefaultVNNI() { +Array Postproc::DefaultCPUTensorization() { return Array{ Postproc::DisallowDynamicLoop(), Postproc::RewriteParallelVectorizeUnroll(), Postproc::RewriteReductionBlock(), Postproc::RewriteTensorize(/*vectorize_init_loop=*/true), diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 113703272031..e25f0b12210d 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -85,7 +85,9 @@ Array ScheduleRule::DefaultLLVM() { }; } -Array ScheduleRule::DefaultVNNI() { +Array ScheduleRule::DefaultX86(const String& type) { + static const Map intrins = {{"vnni", "dot_16x4_vnni"}, + {"avx512", "dot_16x4_avx512"}}; return { ScheduleRule::ApplyCustomRule(), ScheduleRule::InlineConstantScalars(), @@ -101,7 +103,7 @@ Array ScheduleRule::DefaultVNNI() { /*max_jobs_per_core=*/16, 
/*max_innermost_factor=*/Integer(64)), ScheduleRule::MultiLevelTilingWithIntrin( - /*intrin_name=*/"dot_16x4_vnni", + /*intrin_name=*/intrins[type], /*structure=*/"SSRSRS", /*tile_binds=*/NullOpt, /*max_innermost_factor=*/Integer(64), diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 926f86cc4ff9..2ce8d8fa1103 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -29,6 +29,14 @@ String GetRuleKindFromTarget(const Target& target) { if (target->GetAttr("mcpu") && (*f_check_vnni)(target->GetAttr("mcpu").value())) { return "vnni"; + } else { + static const PackedFunc* f_check_avx512 = + runtime::Registry::Get("tvm.topi.x86.utils.target_has_avx512"); + ICHECK(f_check_avx512 != nullptr) << "The `target_has_avx512` func is not in tvm registry."; + if (target->GetAttr("mcpu") && + (*f_check_avx512)(target->GetAttr("mcpu").value())) { + return "avx512"; + } } return "llvm"; } @@ -73,6 +81,7 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { Array default_sch_rules; Array default_postprocs; Map default_mutator_probs; + // for target with skylake-avx512 if (kind == "llvm") { default_sch_rules = ScheduleRule::DefaultLLVM(); default_postprocs = Postproc::DefaultLLVM(); @@ -90,9 +99,13 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { default_postprocs = Postproc::DefaultHexagon(); default_mutator_probs = Mutator::DefaultHexagon(); } else if (kind == "vnni") { - default_sch_rules = ScheduleRule::DefaultVNNI(); - default_postprocs = Postproc::DefaultVNNI(); - default_mutator_probs = Mutator::DefaultVNNI(); + default_sch_rules = ScheduleRule::DefaultX86("vnni"); + default_postprocs = Postproc::DefaultCPUTensorization(); + default_mutator_probs = Mutator::DefaultLLVM(); + } else if (kind == "avx512") { + default_sch_rules = ScheduleRule::DefaultX86("avx512"); + default_postprocs = Postproc::DefaultCPUTensorization(); + default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "c") { default_sch_rules = ScheduleRule::DefaultMicro(); default_postprocs = Postproc::DefaultMicro(); diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 9cec823cc58a..c01f7758cb45 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -14,106 +14,102 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition import tvm import tvm.testing from tvm import te import numpy as np -from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 -import pytest -@tvm.testing.requires_llvm -@pytest.mark.skip("skip because feature not enabled") -def test_fc_int8_acc32(): - m = 1024 - n = 1024 - k = 1024 - +def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake"): X = te.placeholder((m, k), name="X", dtype="uint8") - W = te.placeholder((n, k), name="W", dtype="int8") + # W = te.placeholder((n, k), name="W", dtype="int8") + + if not tvm.testing.device_enabled(target): + print("skip because %s is not enabled..." 
% target)
+        return
+
+    dev = tvm.device(target, 0)
+    # workaround for Target.current()
+    with tvm.target.Target(target) as target:
+        pc = dot_16x1x16_uint8_int8_int32()
+
+        ak = te.reduce_axis((0, k), name="k")
+        packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8")
+
+        t_fc = te.compute(
+            (m, n),
+            lambda i, j: te.sum(
+                X[i, ak].astype("int32")
+                * packedW[
+                    tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4
+                ].astype("int32"),
+                axis=ak,
+            ),
+            name="F",
+        )
+        t_sch = te.create_schedule(t_fc.op)
+        a_x, a_y = t_fc.op.axis
+        (a_k,) = t_fc.op.reduce_axis
+
+        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16)
+        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32)
+        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4)
+        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4)
+        t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki)
+
+        t_sch[t_fc].unroll(a_koi)
+        t_sch[t_fc].tensorize(a_yi, pc)
+
+        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
+        t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10)
+
+        # generate the plain data
+        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
+        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
+
+        packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8")
+        # This occurs in pre_compute stage
+        for r_idx in range(n // 16):
+            for s_idx in range(16 * (k // 4)):
+                for t_idx in range(4):
+                    packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx]
+
+        x = tvm.nd.array(a_, dev)
+        w = tvm.nd.array(packW, dev)
+        y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev)
+        result = t_evaluator(x, w, y)

     peak = 280
     print("Peak {} Gops/s".format(peak))
-    memory_ops = m * k + n * k + 2 * m * n
+    # memory_ops = m * k + n * k + 2 * m * n
     gops_per_mm = 2 * m * n * k
+    gops_per_sec = gops_per_mm / result.mean / 1e9
+    # verify the correctness
+    tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0)
+    print(
+        "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, efficiency: {:.2f}".format(
+            result.mean * 1000, gops_per_sec, gops_per_sec / peak
+        )
+    )
+    # t_func.export_library("tensorize_acc32.o")
+
+
+@tvm.testing.requires_cascadelake
+def test_fc_int8_acc32_vnni():
     # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target
     # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the
     # test, we should use cascadelake setting.
-    def verify(target="llvm -mcpu=cascadelake"):
-        if not tvm.testing.device_enabled(target):
-            print("skip because %s is not enabled..."
% target) - return - - dev = tvm.device(target, 0) - pc = dot_16x1x16_uint8_int8_int32_cascadelake() - ak = te.reduce_axis((0, k), name="k") - packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") - - t_fc = te.compute( - (m, n), - lambda i, j: te.sum( - X[i, ak].astype("int32") - * packedW[ - tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4 - ].astype("int32"), - axis=ak, - ), - name="F", - ) - t_sch = te.create_schedule(t_fc.op) - a_x, a_y = t_fc.op.axis - (a_k,) = t_fc.op.reduce_axis - - a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16) - a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32) - a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4) - a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4) - t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki) - - t_sch[t_fc].unroll(a_koi) - t_sch[t_fc].tensorize(a_yi, pc) - - t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) - - # generate the plain data - a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") - b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") - - packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8") - # This occurs in pre_compute stage - for r_idx in range(n // 16): - for s_idx in range(16 * (k // 4)): - for t_idx in range(4): - packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][ - (s_idx // 16) * 4 + t_idx - ] - - x = tvm.nd.array(a_, dev) - w = tvm.nd.array(packW, dev) - y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) - result = t_evaluator(x, w, y) - - gops_per_sec = gops_per_mm / result.mean / 1e9 - # verify the correctness - tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0) - print( - "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format( - result.mean * 1000, gops_per_sec, gops_per_sec / peak - ) - ) - t_func.export_library("tensorize_acc32.o") + verify_fc_int8_acc32() - verify() +@tvm.testing.requires_skylake_avx512 +def test_fc_int8_acc32_avx512(): + verify_fc_int8_acc32(target="llvm -mcpu=skylake-avx512") -if __name__ == "__main__": - # The test requires Cascade Lake and newer Intel machines to generate the - # correct AVX512 VNNI instruction. So, disabling the test. 
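
The pre-compute packing loop retained in verify_fc_int8_acc32 above (and removed from the old inline verify) has a compact NumPy equivalent that can double as a self-check of the packed weight layout. A sketch under the same divisibility assumptions the test uses (n % 16 == 0, k % 4 == 0):

import numpy as np

n, k = 64, 32  # any multiples of 16 and 4
b_ = np.random.randint(1, 10, size=(n, k)).astype("int8")

# packW[r, q * 16 + p, t] == b_[r * 16 + p, q * 4 + t]
packW = (
    b_.reshape(n // 16, 16, k // 4, 4)  # split rows into 16-blocks, cols into 4-blocks
    .transpose(0, 2, 1, 3)              # move the k-block index ahead of the row index
    .reshape(n // 16, 16 * (k // 4), 4)
)

# Cross-check against the triple loop from the test.
ref = np.empty_like(packW)
for r_idx in range(n // 16):
    for s_idx in range(16 * (k // 4)):
        for t_idx in range(4):
            ref[r_idx, s_idx, t_idx] = b_[r_idx * 16 + s_idx % 16, (s_idx // 16) * 4 + t_idx]
assert np.array_equal(packW, ref)
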
- # test_fc_int8_acc32() - pass +if __name__ == "__main__": + test_fc_int8_acc32_vnni() + test_fc_int8_acc32_avx512() diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 572da53b34fd..70b2b875c124 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -29,52 +29,63 @@ from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN -SCH_RULES_FOR_VNNI = [ - ms.schedule_rule.ApplyCustomRule(), - ms.schedule_rule.AutoInline( - into_producer=False, - into_consumer=True, - inline_const_tensor=True, - disallow_if_then_else=True, - require_injective=True, - require_ordered=True, - disallow_op=["tir.exp"], - ), - ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), - ms.schedule_rule.MultiLevelTilingWithIntrin( - VNNI_INTRIN, - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=ms.schedule_rule.ReuseType( - req="may", - levels=[1, 2], - scope="global", + +CASCADELAKE_VNNI_TARGET = "llvm -mcpu=cascadelake -num-cores 4" +SKYLAKE_AVX512_TARGET = "llvm -mcpu=skylake-avx512 -num-cores 4" + + +def _get_schedule_rules_for_x86(intrin): + return [ + ms.schedule_rule.ApplyCustomRule(), + ms.schedule_rule.AutoInline( + into_producer=False, + into_consumer=True, + inline_const_tensor=True, + disallow_if_then_else=True, + require_injective=True, + require_ordered=True, + disallow_op=["tir.exp"], + ), + ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), + ms.schedule_rule.MultiLevelTilingWithIntrin( + intrin, + structure="SSRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=ms.schedule_rule.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), + ), + ms.schedule_rule.MultiLevelTiling( + structure="SSRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=ms.schedule_rule.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), ), - ), - ms.schedule_rule.MultiLevelTiling( - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=ms.schedule_rule.ReuseType( - req="may", - levels=[1, 2], - scope="global", + ms.schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=16, + max_vectorize_extent=64, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, ), - ), - ms.schedule_rule.ParallelizeVectorizeUnroll( - max_jobs_per_core=16, - max_vectorize_extent=64, - unroll_max_steps=[0, 16, 64, 512], - unroll_explicit=True, - ), - ms.schedule_rule.RandomComputeLocation(), -] + ms.schedule_rule.RandomComputeLocation(), + ] + + +SCH_RULES_FOR_VNNI = _get_schedule_rules_for_x86(VNNI_INTRIN) +SCH_RULES_FOR_AVX512 = _get_schedule_rules_for_x86(AVX512_INTRIN) def _get_sch_rules_for_dp4a(intrin): @@ -177,6 +188,11 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos asm = lib.lib.get_source("asm") assert "vpdpbusd" in asm + if "skylake-avx512" in target: + asm = lib.lib.get_source("asm") + assert "pmaddubs" in asm + assert "pmaddw" in asm + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) runtime.set_input("data", data_np) 
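
The pmaddubs/pmaddw pair asserted a few lines up is how AVX512 without VNNI emulates vpdpbusd: vpmaddubsw multiplies unsigned by signed bytes and sums adjacent pairs into int16 lanes, and vpmaddwd against a vector of ones sums adjacent int16 lanes into int32. A NumPy model of one four-byte dot-product lane, ignoring the int16 saturation vpmaddubsw applies (which the small test values avoid):

import numpy as np

a = np.array([3, 7, 11, 250], dtype=np.uint8)  # four unsigned activation bytes
b = np.array([-2, 5, -9, 4], dtype=np.int8)    # four signed weight bytes

# vpmaddubsw: u8 * i8 products of adjacent byte pairs, summed into int16 lanes
pairs = a.astype(np.int16) * b.astype(np.int16)
madd16 = pairs.reshape(2, 2).sum(axis=1, dtype=np.int16)  # two int16 partial sums

# vpmaddwd against a vector of ones: adjacent int16 lanes summed into int32
dot32 = int(madd16.astype(np.int32).sum())

assert dot32 == int(np.dot(a.astype(np.int32), b.astype(np.int32)))  # 930
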
runtime.run() @@ -273,9 +289,12 @@ def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs) @tvm.testing.requires_cascadelake def test_vnni_dense(): - _test_dense( - "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" - ) + _test_dense("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_dense(): + _test_dense("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @@ -293,9 +312,12 @@ def test_dp4a_dense(): @tvm.testing.requires_cascadelake def test_vnni_conv2d(): - _test_conv2d( - "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" - ) + _test_conv2d("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_conv2d(): + _test_conv2d("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @@ -319,12 +341,26 @@ def test_vnni_bert_int8(): relay_mod, params, input_info, - "llvm -mcpu=cascadelake -num-cores 4", + CASCADELAKE_VNNI_TARGET, SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, ) +@tvm.testing.requires_skylake_avx512 +@pytest.mark.skip("Due to quantized BERT download issue") +def test_avx512_bert_int8(): + relay_mod, params, input_info = load_quantized_bert_base() + _test_bert_int8( + relay_mod, + params, + input_info, + SKYLAKE_AVX512_TARGET, + SCH_RULES_FOR_AVX512, + POSTPROCS_FOR_VNNI, + ) + + @tvm.testing.requires_gpu @pytest.mark.skip("Slow on CI") def test_dp4a_bert_int8(): diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 3bb9918c7c77..0549f4f2fbcc 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -760,9 +760,7 @@ def test_bitserial_dense(): assert yy.checked_type == relay.TensorType((m, 32), "int16") -@tvm.testing.requires_cascadelake -@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) -def test_dense_vnni(m, n, k): +def dense_x86_test(m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]): data_shape = (m, k) weight_shape = (n, k) @@ -774,12 +772,14 @@ def test_dense_vnni(m, n, k): out = relay.nn.bias_add(dense, bias) mod = tvm.IRModule.from_expr(out) - target = "llvm -mcpu=cascadelake" with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "vpdpbusd" in asm + # TODO(vvchernov): needs for avx512 arch, can be extended + if n % 16 == 0 and k % 4 == 0: + asm = lib.lib.get_source("asm") + for intrin in intrins: + assert intrin in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) @@ -846,6 +846,18 @@ def test_dense_amx_int8(): np.testing.assert_equal(out, ref) +@tvm.testing.requires_cascadelake +@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) +def test_dense_vnni(m, n, k): + dense_x86_test(m, n, k) + + +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) +def test_dense_skylake_avx512(m, n, k): + dense_x86_test(m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"]) + + @pytest.mark.skip("Requires GFX10 AMDGPU") def test_dense_rocm_sdot4(): data_shape = (32, 96) diff --git a/tests/python/relay/test_op_level10.py 
b/tests/python/relay/test_op_level10.py index cdf4e734842b..ed044989ac18 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -473,16 +473,7 @@ def test_batch_matmul(executor_kind): verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27)) -@tvm.testing.requires_cascadelake -@pytest.mark.parametrize( - "b,m,n,k", - [ - (16, 32, 128, 96), - (16, 32, 128, 97), - (16, 32, 129, 96), - ], -) -def test_batch_matmul_vnni(b, m, n, k): +def batch_matmul_x86_test(b, m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]): x_shape = (b, m, k) y_shape = (b, n, k) z_shape = (b, m, n) @@ -495,12 +486,14 @@ def test_batch_matmul_vnni(b, m, n, k): out = bmm + z mod = tvm.IRModule.from_expr(out) - target = "llvm -mcpu=cascadelake" with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "vpdpbusd" in asm + # TODO(vvchernov): needs for avx512 arch, can be extended + if n % 16 == 0 and k % 4 == 0: + asm = lib.lib.get_source("asm") + for intrin in intrins: + assert intrin in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) @@ -575,6 +568,32 @@ def test_batch_matmul_amx(b, m, n, k): np.testing.assert_equal(out, ref) +@tvm.testing.requires_cascadelake +@pytest.mark.parametrize( + "b,m,n,k", + [ + (16, 32, 128, 96), + (16, 32, 128, 97), + (16, 32, 129, 96), + ], +) +def test_batch_matmul_vnni(b, m, n, k): + batch_matmul_x86_test(b, m, n, k) + + +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize( + "b,m,n,k", + [ + (16, 32, 128, 96), + (16, 32, 128, 97), + (16, 32, 129, 96), + ], +) +def test_batch_matmul_skylake_avx512(b, m, n, k): + batch_matmul_x86_test(b, m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"]) + + @pytest.mark.skip("Requires GFX10 AMDGPU") def test_batch_matmul_rocm_sdot4(): x_shape = (16, 32, 96) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index ca1adf940029..f7cfc81fb2d3 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1696,7 +1696,7 @@ def fast_int8_intrinsic(self, target): elif "cascadelake" in target: return "vpdpbusd" else: - assert False, "Target should be Skylake or Cascadelake" + assert False, "Target should be Nehalem or core-avx2 or Skylake or Cascadelake" @tvm.testing.fixture def assembly( @@ -2137,7 +2137,7 @@ def get_subgraph(dtype): np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) -def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instr): +def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instrs): def get_conv2d_nchw( d_shape, w_shape, @@ -2168,16 +2168,16 @@ def get_conv2d_nchw( bias = relay.var("bias", shape=bias_shape, dtype="int32") bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32") - weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8") + weight_np = np.random.uniform(-32, 32, size=weight_shape).astype("int8") conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype) bias_add = relay.add(conv2d, bias) mod = tvm.IRModule.from_expr(bias_add) if data_dtype == "uint8": - data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8") + data_np = np.random.uniform(0, 64, size=data_shape).astype("uint8") else: - data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8") + data_np = np.random.uniform(-32, 32, size=data_shape).astype("int8") params = 
{"weight": weight_np, "bias": bias_np} @@ -2194,7 +2194,8 @@ def get_conv2d_nchw( ): lib = relay.build(mod, target=target, params=params) - assert dot_product_instr in lib.lib.get_source("asm") + for dot_product_instr in dot_product_instrs: + assert dot_product_instr in lib.lib.get_source("asm") rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) @@ -2210,13 +2211,20 @@ def get_conv2d_nchw( @tvm.testing.requires_arm_dot def test_conv2d_int8_alter_dtype_arm(): _test_conv2d_int8_alter_dtype( - "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", "sdot" + "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", ["sdot"] ) @tvm.testing.requires_cascadelake def test_conv2d_int8_alter_dtype_vnni(): - _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", "vpdpbusd") + _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", ["vpdpbusd"]) + + +@tvm.testing.requires_skylake_avx512 +def test_conv2d_int8_alter_dtype_avx512(): + _test_conv2d_int8_alter_dtype( + "int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"] + ) if __name__ == "__main__": diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index a30cd1e73e3f..c64b30a2128b 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -136,11 +136,12 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check that Intel VNNI gets picked up. - with tvm.target.Target("llvm -mcpu=skylake-avx512"): - mod = relay.transform.InferType()(mod) - legalized_mod = relay.qnn.transform.Legalize()(mod) - assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() + # Check that Intel AVX512 (with or w/o VNNI) gets picked up. + for target in ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]: + with tvm.target.Target(target): + mod = relay.transform.InferType()(mod) + legalized_mod = relay.qnn.transform.Legalize()(mod) + assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() # Since same dtype, there should not be any transformation with tvm.target.Target( @@ -167,7 +168,7 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check no transformation for Intel VNNI. + # Check no transformation for Intel AVX512. with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) @@ -229,11 +230,12 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check that Intel VNNI gets picked up. - with tvm.target.Target("llvm -mcpu=skylake-avx512"): - mod = relay.transform.InferType()(mod) - legalized_mod = relay.qnn.transform.Legalize()(mod) - assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext() + # Check that Intel AVX512 (with or w/o VNNI) gets picked up. 
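
That pick-up, exercised over both targets in the loop below, rests on the zero-point shift described in qnn/op/legalizations.py earlier in this patch: adding 128 to both the quantized tensor and its zero point moves int8 data into uint8 (and subtracting moves it back) without changing the dequantized value. A small NumPy check of that identity:

import numpy as np

scale, zp_i8 = 0.05, 10
q_i8 = np.array([-128, -1, 0, 42, 127], dtype=np.int8)

# Shift data and zero point together: int8 -> uint8
q_u8 = (q_i8.astype(np.int32) + 128).astype(np.uint8)
zp_u8 = zp_i8 + 128

# The dequantized values are identical, so the legalized graph is equivalent.
deq_i8 = scale * (q_i8.astype(np.int32) - zp_i8)
deq_u8 = scale * (q_u8.astype(np.int32) - zp_u8)
assert np.allclose(deq_i8, deq_u8)
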
+ for target in ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]: + with tvm.target.Target(target): + mod = relay.transform.InferType()(mod) + legalized_mod = relay.qnn.transform.Legalize()(mod) + assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext() # Since same dtype, there should not be any transformation with tvm.target.Target( @@ -260,7 +262,7 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check no transformation for Intel VNNI. + # Check no transformation for Intel AVX512. with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_cpu_dot_product.py similarity index 83% rename from tests/python/unittest/test_meta_schedule_vnni_integration.py rename to tests/python/unittest/test_meta_schedule_cpu_dot_product.py index 3bbe916472f5..6dc72d69336f 100644 --- a/tests/python/unittest/test_meta_schedule_vnni_integration.py +++ b/tests/python/unittest/test_meta_schedule_cpu_dot_product.py @@ -28,6 +28,7 @@ from tvm.tir.schedule import BlockRV, Schedule from tvm.tir.schedule.analysis import has_block from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN logging.basicConfig( format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", @@ -36,9 +37,9 @@ logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG) -def _schedule_dense(m: Optional[int], do_tune: bool): +def _schedule_dense(m: Optional[int], do_tune: bool, intrin=VNNI_INTRIN): """Manually schedule a dense block, created from TE compute op via CreatePrimFunc, - using VNNI instruction. + using VNNI or AVX512 instructions. 
""" def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: @@ -47,7 +48,7 @@ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: if dense_block is None: assert has_block(sch, "compute") dense_block = sch.get_block("compute") - assert "dense_vnni" in sch.get(dense_block).annotations["schedule_rule"] + assert "dense_int8" in sch.get(dense_block).annotations["schedule_rule"] post_blocks = sch.get_consumers(dense_block) if len(post_blocks) > 0: @@ -90,7 +91,7 @@ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: dec = sch.decompose_reduction(dense_block, a_ko) init_loop = sch.get_loops(dec)[-1] sch.vectorize(init_loop) - sch.tensorize(a_xi, VNNI_INTRIN) + sch.tensorize(a_xi, intrin) return True return schedule_fn @@ -109,10 +110,10 @@ def _relay_dense(m, n, k): out_dtype="int32", ) relay_mod = tvm.IRModule.from_expr(out) - data = np.random.uniform(1, 10, size=(m, k)).astype("uint8") + data = np.random.randint(0, 5, size=(m, k), dtype="uint8") params = { - "weight": np.random.uniform(1, 10, size=(n, k)).astype("int8"), - "bias": np.random.uniform(1, 10, size=(n,)).astype("int32"), + "weight": np.random.randint(0, 5, size=(n, k), dtype="int8"), + "bias": np.random.randint(0, 5, size=(n,), dtype="int32"), } def f_check(lib, dev): @@ -135,10 +136,7 @@ def f_check(lib, dev): return relay_mod, params, f_check -@tvm.testing.requires_cascadelake -def test_vnni_schedule_fn_database(): - m, n, k = 1024, 1024, 1024 - target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4") +def schedule_16x4_dense_fn_database(target, intrin, m=1024, n=1024, k=1024): dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -146,6 +144,7 @@ def test_vnni_schedule_fn_database(): _schedule_dense( m=m, do_tune=False, + intrin=intrin, ) ), tvm.transform.PassContext( opt_level=3, @@ -167,21 +166,32 @@ def test_vnni_schedule_fn_database(): @tvm.testing.requires_cascadelake -def test_vnni_schedule_fn_tune(): +def test_vnni_schedule_fn_database(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") + schedule_16x4_dense_fn_database(target, VNNI_INTRIN) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_schedule_fn_database(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") + schedule_16x4_dense_fn_database(target, AVX512_INTRIN, 16, 16, 16) + + +def schedule_16x4_dense_fn_tune(target, intrin, m=1024, n=1024, k=1024): # pylint: disable=W0105 """ We can inject and apply a custom TIR scheduling to a TE compute of interest, using the "schedule_rule" annotation. For example, in topi/x86/dense.py we have the following - declaration for int8 dense targeting the VNNI instruction. + declaration for int8 dense targeting the VNNI or AVX512 instructions. C = te.compute( ... - attrs={"schedule_rule": "meta_schedule.x86.dense_vnni"}, + attrs={"schedule_rule": "meta_schedule.x86.dense_int8"}, ) When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation, it looks up the packed func registry for a function that is associated with the given schedule - rule key ("meta_schedule.x86.dense_vnni" in this example). The signature of such custom + rule key ("meta_schedule.x86.dense_int8" in this example). The signature of such custom schedule functions must be (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule]. @@ -191,14 +201,12 @@ def test_vnni_schedule_fn_tune(): The relevant code is in `src/meta_schedule/space_generator/apply_custom_rule.cc`. 
""" - def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): - _schedule_dense(m=None, do_tune=True)(sch, dense_block) + def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV): + _schedule_dense(m=None, do_tune=True, intrin=intrin)(sch, dense_block) return [sch] - register_func("meta_schedule.x86.dense_vnni", schedule_rule_dense_vnni) + register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4, override=True) - m, n, k = 1024, 1024, 1024 - target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -247,6 +255,20 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): f_check(lib, dev) +@tvm.testing.requires_cascadelake +def test_vnni_schedule_fn_tune(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") + schedule_16x4_dense_fn_tune(target, VNNI_INTRIN) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_schedule_fn_tune(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") + schedule_16x4_dense_fn_tune(target, AVX512_INTRIN, 16, 16, 16) + + if __name__ == """__main__""": test_vnni_schedule_fn_database() + test_avx512_schedule_fn_database() test_vnni_schedule_fn_tune() + test_avx512_schedule_fn_tune() diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index d3731cfa1be8..795890de083e 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -316,9 +316,8 @@ def traverse(t): assert t.task_name in expected_task_names, t.task_name -@pytest.mark.skip("Too slow on CI") -def extract_task_qbert(): - def _test(mod, params, target): +def extract_task_qbert(target, sch_rule_tag): + def _test(mod, params, target, sch_rule_tag): extracted_tasks = ms.relay_integration.extract_tasks(mod, target, params) tune_tasks = list( filter( @@ -341,10 +340,20 @@ def _test(mod, params, target): annotations = sch.get(block).annotations assert "schedule_rule" in annotations - assert "vnni" in annotations["schedule_rule"] + assert sch_rule_tag in annotations["schedule_rule"] mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128) - _test(mod, params, target="llvm -mcpu=cascadelake") + _test(mod, params, target=target, sch_rule_tag=sch_rule_tag) + + +@pytest.mark.skip("Too slow on CI") +def extract_task_qbert_vnni(): + extract_task_qbert("llvm -mcpu=cascadelake", "vnni") + + +@pytest.mark.skip("Too slow on CI") +def extract_task_qbert_avx512(): + extract_task_qbert("llvm -mcpu=skylake-avx512", "avx512") @tvm.testing.skip_if_32bit(reason="Apparently the LLVM version on i386 image is too old") diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py index 54f342c3a5d8..4667626f1706 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -26,9 +26,10 @@ from tvm.target import Target from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN -def test_vnni_conv2d_nchwc(): +def test_x86_conv2d_nchwc(intrin=VNNI_INTRIN, target="llvm -mcpu=cascadelake -num-cores=4"): @T.prim_func def 
conv2d_nchwc( placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], @@ -68,7 +69,7 @@ def conv2d_nchwc( # fmt: off @T.prim_func - def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1): @@ -86,7 +87,7 @@ def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -113,7 +114,7 @@ def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] @T.prim_func - def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") for i0_0, i1_0, i2_0, i3_0, i4_0_0 in T.grid(1, 8, 28, 56, 1): @@ -131,7 +132,7 @@ def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -158,7 +159,7 @@ def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] @T.prim_func - def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, 
i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): with T.block("conv2d_NCHWc_int8_o"): @@ -174,7 +175,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -228,7 +229,6 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ] mod = conv2d_nchwc - target = Target("llvm -mcpu=cascadelake -num-cores=4") actual = generate_design_space( kind="llvm", mod=mod, @@ -236,7 +236,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac types=None, sch_rules=[ ms.schedule_rule.MultiLevelTilingWithIntrin( - VNNI_INTRIN, + intrin, structure="SSRSRS", tile_binds=None, max_innermost_factor=64, @@ -249,7 +249,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac check_sketches( mod, sketches=actual, - expected_mods=[vnni_conv2d_nchwc_0, vnni_conv2d_nchwc_1, vnni_conv2d_nchwc_2], + expected_mods=[x86_conv2d_nchwc_0, x86_conv2d_nchwc_1, x86_conv2d_nchwc_2], expected_decisions=[decision_0, decision_1, decision_2], ) @@ -417,7 +417,8 @@ def test_dp4a_dense_no_tensorize_2(): if __name__ == "__main__": - test_vnni_conv2d_nchwc() + test_x86_conv2d_nchwc() + test_x86_conv2d_nchwc(AVX512_INTRIN, "llvm -mcpu=skylake-avx512 -num-cores=4") test_dp4a_dense() test_dp4a_dense_no_tensorize_1() test_dp4a_dense_no_tensorize_2() diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index 9a62207fa261..43b9eb8bbb19 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -25,6 +25,8 @@ from tvm.target import Target from tvm.target.codegen import llvm_lookup_intrinsic_id +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN + # fmt: off @tvm.script.ir_module @@ -2553,9 +2555,7 @@ def apply_trace(sch): l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1) sch.reorder(l42, l43, l44, l45, l46, l35, l33) b48 = sch.blockize(loop=l35) - sch.annotate( - block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni" - ) + sch.annotate(block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val=VNNI_INTRIN) l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48) v59, v60, v61, v62 = sch.sample_perfect_tile( loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] @@ -2729,7 +2729,7 @@ def apply_trace(sch): sch.vectorize(loop=l193) b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main") sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize") - sch.tensorize(block_or_loop=b194, tensor_intrin="dot_16x4_vnni") + sch.tensorize(block_or_loop=b194, tensor_intrin=VNNI_INTRIN) vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") verify( diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index 
e0667da6fe92..38bd4bba1418 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -146,7 +146,7 @@ def test_suggest_index_map_winograd(): @tvm.script.ir_module -class DenseVNNIModule: +class DenseTIRModule: @T.prim_func def main( placeholder: T.Buffer[(1024, 1024), "uint8"], @@ -170,7 +170,7 @@ def main( @tvm.script.ir_module -class Conv2dNCHWcVNNIModule: +class Conv2dNCHWcTIRModule: @T.prim_func def main( placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], @@ -202,7 +202,8 @@ def main( conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ n, oc_chunk, oh, ow, oc_block ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", ) * T.cast( placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", @@ -222,8 +223,8 @@ def callback(node): return loops -def test_get_tensorize_loop_mapping_dense_vnni(): - s = Schedule(DenseVNNIModule) +def test_get_tensorize_loop_mapping_dense_16x4(): + s = Schedule(DenseTIRModule) block = s.get_block("compute") info = get_tensorize_loop_mapping(s, block, dot_product_16x4_u8i8i32_desc) @@ -240,8 +241,8 @@ def test_get_tensorize_loop_mapping_dense_vnni(): assert s.get(desc_loop_to_sref[desc_loops[1]]) == s.get(loop_k) -def test_get_tensorize_loop_mapping_conv2d_nchwc_vnni(): - s = Schedule(Conv2dNCHWcVNNIModule) +def test_get_tensorize_loop_mapping_conv2d_nchwc_16x4(): + s = Schedule(Conv2dNCHWcTIRModule) block = s.get_block("conv2d_NCHWc_int8") info = get_tensorize_loop_mapping(s, block, dot_product_16x4_u8i8i32_desc) diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py index fc0bdc146c88..4847f261a32c 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize.py +++ b/tests/python/unittest/test_tir_schedule_tensorize.py @@ -29,7 +29,7 @@ ARM_DOT_4x4_i8_SDOT_INTRIN, ) from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN, AVX512_DOT_16x4_INTRIN from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN, VDMPY_i16i16i32_INTRIN # fmt: off @@ -557,7 +557,7 @@ def get_matmul_packed(m, n, k, lhs_type, rhs_dtype="int8"): return te.create_prim_func([X, W, matmul]) -def test_tensorize_vnni(): +def tensorize_16x4_test(intrin=VNNI_DOT_16x4_INTRIN): m, n, k = 128, 128, 128 func = get_matmul_packed(m, n, k, "uint8") @@ -572,11 +572,19 @@ def test_tensorize_vnni(): sch.reorder(ko, ji, ki) sch.decompose_reduction(block, ko) - sch.tensorize(ji, VNNI_DOT_16x4_INTRIN) + sch.tensorize(ji, intrin) verify_trace_roundtrip(sch=sch, mod=func) +def test_tensorize_vnni(): + tensorize_16x4_test() + + +def test_tensorize_avx512(): + tensorize_16x4_test(AVX512_DOT_16x4_INTRIN) + + def test_tensorize_arm_dot(): m, n, k = 128, 128, 128 diff --git a/tests/python/unittest/test_tir_schedule_transform.py b/tests/python/unittest/test_tir_schedule_transform.py index e812587e6676..c068385f0a46 100644 --- a/tests/python/unittest/test_tir_schedule_transform.py +++ b/tests/python/unittest/test_tir_schedule_transform.py @@ -18,11 +18,11 @@ from tvm.script import tir as T from tvm.tir import Schedule from tvm.tir.schedule.transform import tile_with_tensor_intrin -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN +from tvm.tir.tensor_intrin.x86 import 
VNNI_DOT_16x4_INTRIN, AVX512_DOT_16x4_INTRIN @tvm.script.ir_module -class DenseVNNIModule: +class DenseTIRModule: @T.prim_func def main( placeholder: T.Buffer[(1024, 1024), "uint8"], @@ -46,7 +46,7 @@ def main( @tvm.script.ir_module -class DenseVNNIModuleTiled: +class DenseTIRModuleTiled: @T.prim_func def main( placeholder: T.Buffer[(1024, 1024), "uint8"], @@ -72,7 +72,7 @@ def main( @tvm.script.ir_module -class Conv2dNCHWcVNNIModule: +class Conv2dNCHWcTIRModule: @T.prim_func def main( placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], @@ -104,7 +104,8 @@ def main( conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ n, oc_chunk, oh, ow, oc_block ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", ) * T.cast( placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", @@ -112,7 +113,7 @@ def main( @tvm.script.ir_module -class Conv2dNCHWcVNNIModuleTiled: +class Conv2dNCHWcTIRModuleTiled: @T.prim_func def main( placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], @@ -141,35 +142,38 @@ def main( conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ n, oc_chunk, oh, ow, oc_block ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", ) * T.cast( placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", ) -def test_tile_with_tensor_intrin_dense_vnni(): - s = Schedule(DenseVNNIModule) +def test_tile_with_tensor_intrin_dense(intrin=VNNI_DOT_16x4_INTRIN): + s = Schedule(DenseTIRModule) block = s.get_block("compute") - tiled_loop = tile_with_tensor_intrin(s, block, VNNI_DOT_16x4_INTRIN) + tiled_loop = tile_with_tensor_intrin(s, block, intrin) _, _, _, i1_1, _ = s.get_loops(block) assert s.get(tiled_loop) == s.get(i1_1) - tvm.ir.assert_structural_equal(s.mod, DenseVNNIModuleTiled) + tvm.ir.assert_structural_equal(s.mod, DenseTIRModuleTiled) -def test_tile_with_tensor_intrin_conv2d_nchwc_vnni(): - s = Schedule(Conv2dNCHWcVNNIModule) +def test_tile_with_tensor_intrin_conv2d_nchwc(intrin=VNNI_DOT_16x4_INTRIN): + s = Schedule(Conv2dNCHWcTIRModule) block = s.get_block("conv2d_NCHWc_int8") - tiled_loop = tile_with_tensor_intrin(s, block, VNNI_DOT_16x4_INTRIN) + tiled_loop = tile_with_tensor_intrin(s, block, intrin) tiled_loops = s.get_loops(block) assert len(tiled_loops) == 12 assert s.get(tiled_loop) == s.get(tiled_loops[-2]) - tvm.ir.assert_structural_equal(s.mod, Conv2dNCHWcVNNIModuleTiled) + tvm.ir.assert_structural_equal(s.mod, Conv2dNCHWcTIRModuleTiled) if __name__ == "__main__": - test_tile_with_tensor_intrin_dense_vnni() - test_tile_with_tensor_intrin_conv2d_nchwc_vnni() + test_tile_with_tensor_intrin_dense() + test_tile_with_tensor_intrin_dense(AVX512_DOT_16x4_INTRIN) + test_tile_with_tensor_intrin_conv2d_nchwc() + test_tile_with_tensor_intrin_conv2d_nchwc(AVX512_DOT_16x4_INTRIN) From 03e15016775205b0bf86f213e7be11ac6febdb81 Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Tue, 17 Jan 2023 22:08:24 +0000 Subject: [PATCH 186/286] [CI] Cross-compile libtvm_runtime to Aarch64 and run tests (#13714) ~~This is a PR to test the CI~~ This PR is part of #13526. It adds cross-compilation of libtvm, libtvm_runtime and libtvm_allvisible to aarch64. It then executes the cpp and python tests on aarch64. 
A successful run of the CI using the updated minimal image can be found [here](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-minimal-cross-isa/detail/PR-13714/13/pipeline/60). --- .../minimal_cross_isa_jenkinsfile.groovy | 632 ++++++++++++++++++ .../minimal_cross_isa_jenkinsfile.groovy.j2 | 56 ++ tests/python/unittest/test_micro_ms_tuning.py | 9 +- .../task_config_build_minimal_cross_isa.sh | 49 ++ 4 files changed, 743 insertions(+), 3 deletions(-) create mode 100644 ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy create mode 100644 ci/jenkins/templates/minimal_cross_isa_jenkinsfile.groovy.j2 create mode 100755 tests/scripts/task_config_build_minimal_cross_isa.sh diff --git a/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy b/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy new file mode 100644 index 000000000000..992a1e307523 --- /dev/null +++ b/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy @@ -0,0 +1,632 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// -*- mode: groovy -*- + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Jenkins pipeline +// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ + +// Docker env used for testing +// Different images may have different version tags +// because some of them are more stable than others. +// +// Docker images are maintained by the PMC, cached in dockerhub, +// and remain relatively stable over time. +// Flow for upgrading the docker env (requires a committer): +// +// - Send a PR to upgrade the build script in the repo +// - Build the new docker image +// - Tag the docker image with a new version and push to a binary cache.
+// - Update the version in the Jenkinsfile, send a PR +// - Fix any issues with the new image version in the PR +// - Merge the PR; now we are on the new version +// - Tag the new version as the latest +// - Periodically clean up the old versions on local workers +// + +// ============================= IMPORTANT NOTE ============================= +// This file is generated by 'jenkins/generate.py'. Do not edit this file directly! +// Make edits to 'jenkins/Jenkinsfile.j2' and regenerate this with +// 'python3 jenkins/generate.py' +// Note: This timestamp is here to ensure that updates to the Jenkinsfile are +// always rebased on main before merging: +// Generated at 2023-01-12T09:59:14.593960 + +import org.jenkinsci.plugins.pipeline.modeldefinition.Utils +// These are set at runtime from data in ci/jenkins/docker-images.yml, update +// image tags in that file +ci_lint = '' +ci_gpu = '' +ci_cpu = '' +ci_minimal = '' +ci_wasm = '' +ci_i386 = '' +ci_cortexm = '' +ci_arm = '' +ci_hexagon = '' +ci_riscv = '' + +// Parameters to allow overriding (in the Jenkins UI) the images +// to be used by a given build. When provided, they take precedence +// over the default values above. +properties([ + parameters([ + string(name: 'ci_arm_param', defaultValue: ''), + string(name: 'ci_cortexm_param', defaultValue: ''), + string(name: 'ci_cpu_param', defaultValue: ''), + string(name: 'ci_gpu_param', defaultValue: ''), + string(name: 'ci_hexagon_param', defaultValue: ''), + string(name: 'ci_i386_param', defaultValue: ''), + string(name: 'ci_lint_param', defaultValue: ''), + string(name: 'ci_minimal_param', defaultValue: ''), + string(name: 'ci_riscv_param', defaultValue: ''), + string(name: 'ci_wasm_param', defaultValue: ''), + ]) +]) + +// Placeholders for newly built Docker image names (if rebuild_docker_images +// is used) + built_ci_arm = null; + built_ci_cortexm = null; + built_ci_cpu = null; + built_ci_gpu = null; + built_ci_hexagon = null; + built_ci_i386 = null; + built_ci_lint = null; + built_ci_minimal = null; + built_ci_riscv = null; + built_ci_wasm = null; + +// Global variable assigned during Sanity Check that holds the sha1 which should be +// merged into the PR in all branches. +upstream_revision = null + +// command to start a docker container +docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME' +docker_build = 'docker/build.sh' +// timeout in minutes +max_time = 180 +rebuild_docker_images = false + +s3_bucket = 'tvm-jenkins-artifacts-prod' +s3_prefix = "tvm/${env.BRANCH_NAME}/${env.BUILD_NUMBER}" + +// Jenkins script root directory +jenkins_scripts_root = "ci/scripts/jenkins" + + +// General note: Jenkins has limits on the size of a method (or top-level code) +// that are pretty strict, so most usage of groovy methods in these templates +// is purely to satisfy the JVM +def per_exec_ws(folder) { + return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder +} + +// Initialize source code +def init_git() { + retry(5) { + checkout scm + } + + // Add more info about the job node + sh ( + script: './tests/scripts/task_show_node_info.sh', + label: 'Show executor node info', + ) + + // Determine the merge commit to use for all stages + if (env.BRANCH_NAME == 'main') { + // Only set upstream_revision to HEAD and skip merging to avoid a race with another commit merged to main. + update_upstream_revision("HEAD") + } else { + // This is a PR branch, so merge with the latest main.
+ merge_with_main() + } + + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 3 timeout 5m git submodule update --init -f --jobs 0 + """, + label: 'Update git submodules', + ) + checkout_trusted_files() +} + +def update_upstream_revision(git_ref) { + if (upstream_revision == null) { + upstream_revision = sh( + script: "git log -1 ${git_ref} --format=\'%H\'", + label: 'Determine upstream revision', + returnStdout: true, + ).trim() + } +} + +def merge_with_main() { + sh ( + script: 'git fetch origin main', + label: 'Fetch upstream', + ) + update_upstream_revision("FETCH_HEAD") + sh ( + script: "git -c user.name=TVM-Jenkins -c user.email=jenkins@tvm.apache.org merge ${upstream_revision}", + label: 'Merge to origin/main' + ) +} + +def docker_init(image) { + // Clear out all Docker images that aren't going to be used + sh( + script: """ + set -eux + docker image ls --all + IMAGES=\$(docker image ls --all --format '{{.Repository}}:{{.Tag}} {{.ID}}') + + echo -e "Found images:\\n\$IMAGES" + echo "\$IMAGES" | { grep -vE '${image}' || test \$? = 1; } | { xargs docker rmi || test \$? = 123; } + + docker image ls --all + """, + label: 'Clean old Docker images', + ) + + if (image.contains("amazonaws.com")) { + // If this string is in the image name it's from ECR and needs to be pulled + // with the right credentials + ecr_pull(image) + } else { + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${image} + """, + label: 'Pull docker image', + ) + } +} + +def ecr_pull(full_name) { + aws_account_id = sh( + returnStdout: true, + script: 'aws sts get-caller-identity | grep Account | cut -f4 -d\\"', + label: 'Get AWS ID' + ).trim() + + try { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: ''' + set -eux + aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ECR_REPO + ''', + label: 'Log in to ECR' + ) + sh( + script: """ + set -eux + . ${jenkins_scripts_root}/retry.sh + retry 5 docker pull ${full_name} + """, + label: 'Pull image from ECR' + ) + } + } finally { + withEnv([ + "AWS_ACCOUNT_ID=${aws_account_id}", + 'AWS_DEFAULT_REGION=us-west-2', + "AWS_ECR_REPO=${aws_account_id}.dkr.ecr.us-west-2.amazonaws.com"]) { + sh( + script: 'docker logout $AWS_ECR_REPO', + label: 'Clean up login credentials' + ) + } + } +} + +def should_skip_slow_tests(pr_number) { + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests + result = sh ( + returnStatus: true, + script: "./${jenkins_scripts_root}/should_run_slow_tests.py --pr '${pr_number}'", + label: 'Check if CI should run slow tests', + ) + } + return result == 0 +} + +def cancel_previous_build() { + // cancel previous build if it is not on main. 
+ if (env.BRANCH_NAME != 'main') { + def buildNumber = env.BUILD_NUMBER as int + // The Milestone API allows us to cancel the previous build + // with the same milestone number + if (buildNumber > 1) milestone(buildNumber - 1) + milestone(buildNumber) + } +} + +def checkout_trusted_files() { + // trust everything from branch builds + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + return; + } + + // trust people listed in CONTRIBUTORS.md + grep_code = sh( + returnStatus: true, + script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'", + label: 'Check if change is from a contributor', + ) + + if (grep_code == 1) { + // Any scripts that run on the bare host and not inside a Docker container + // (especially those that access secrets) should be checked out here so + // only trusted versions are used in CI + sh( + script: "git checkout ${upstream_revision} ${jenkins_scripts_root}/.", + label: 'Check out trusted files', + ) + } +} + +def should_skip_ci(pr_number) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + // never skip CI on builds sourced from a branch + return false + } + glob_skip_ci_code = sh ( + returnStatus: true, + script: "./${jenkins_scripts_root}/git_skip_ci_globs.py", + label: 'Check if CI should be skipped due to changed files', + ) + if (glob_skip_ci_code == 0) { + return true + } + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + // Exit code of 1 means run full CI (or the script had an error, so run + // full CI just in case). Exit code of 0 means skip CI. + git_skip_ci_code = sh ( + returnStatus: true, + script: "./${jenkins_scripts_root}/git_skip_ci.py --pr '${pr_number}'", + label: 'Check if CI should be skipped', + ) + } + return git_skip_ci_code == 0 +} + +def check_pr(pr_number) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + // never skip CI on builds sourced from a branch + return false + } + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + sh ( + script: "python3 ${jenkins_scripts_root}/check_pr.py --pr ${pr_number}", + label: 'Check PR title and body', + ) + } + +} + +def prepare() { + stage('Prepare') { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") { + init_git() + + check_pr(env.CHANGE_ID) + + if (env.DETERMINE_DOCKER_IMAGES == 'yes') { + sh( + script: "./${jenkins_scripts_root}/determine_docker_images.py ci_arm ci_cortexm ci_cpu ci_gpu ci_hexagon ci_i386 ci_lint ci_minimal ci_riscv ci_wasm ", + label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', + ) + // Pull image names from the results of determine_docker_images.py + ci_arm = sh( + script: "cat .docker-image-names/ci_arm", + label: "Find docker image name for ci_arm", + returnStdout: true, + ).trim() + ci_cortexm = sh( + script: "cat .docker-image-names/ci_cortexm", + label: "Find docker image name for ci_cortexm", + returnStdout: true, + ).trim() + ci_cpu = sh( + script: "cat .docker-image-names/ci_cpu", + label: "Find docker image name for ci_cpu", + returnStdout: true, + ).trim() + ci_gpu = sh( + script: "cat .docker-image-names/ci_gpu", + label: "Find docker image name for ci_gpu", + returnStdout: true, + ).trim() + ci_hexagon = sh( + script: "cat .docker-image-names/ci_hexagon", + label: "Find docker image name for ci_hexagon", + returnStdout: true, + ).trim() + ci_i386 = sh( + script: "cat .docker-image-names/ci_i386", + label: "Find
docker image name for ci_i386", + returnStdout: true, + ).trim() + ci_lint = sh( + script: "cat .docker-image-names/ci_lint", + label: "Find docker image name for ci_lint", + returnStdout: true, + ).trim() + ci_minimal = sh( + script: "cat .docker-image-names/ci_minimal", + label: "Find docker image name for ci_minimal", + returnStdout: true, + ).trim() + ci_riscv = sh( + script: "cat .docker-image-names/ci_riscv", + label: "Find docker image name for ci_riscv", + returnStdout: true, + ).trim() + ci_wasm = sh( + script: "cat .docker-image-names/ci_wasm", + label: "Find docker image name for ci_wasm", + returnStdout: true, + ).trim() + } + + ci_arm = params.ci_arm_param ?: ci_arm + ci_cortexm = params.ci_cortexm_param ?: ci_cortexm + ci_cpu = params.ci_cpu_param ?: ci_cpu + ci_gpu = params.ci_gpu_param ?: ci_gpu + ci_hexagon = params.ci_hexagon_param ?: ci_hexagon + ci_i386 = params.ci_i386_param ?: ci_i386 + ci_lint = params.ci_lint_param ?: ci_lint + ci_minimal = params.ci_minimal_param ?: ci_minimal + ci_riscv = params.ci_riscv_param ?: ci_riscv + ci_wasm = params.ci_wasm_param ?: ci_wasm + + sh (script: """ + echo "Docker images being used in this build:" + echo " ci_arm = ${ci_arm}" + echo " ci_cortexm = ${ci_cortexm}" + echo " ci_cpu = ${ci_cpu}" + echo " ci_gpu = ${ci_gpu}" + echo " ci_hexagon = ${ci_hexagon}" + echo " ci_i386 = ${ci_i386}" + echo " ci_lint = ${ci_lint}" + echo " ci_minimal = ${ci_minimal}" + echo " ci_riscv = ${ci_riscv}" + echo " ci_wasm = ${ci_wasm}" + """, label: 'Docker image names') + + is_docs_only_build = sh ( + returnStatus: true, + script: "./${jenkins_scripts_root}/git_change_docs.sh", + label: 'Check for docs only changes', + ) + skip_ci = should_skip_ci(env.CHANGE_ID) + skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) + rebuild_docker_images = sh ( + returnStatus: true, + script: "./${jenkins_scripts_root}/git_change_docker.sh", + label: 'Check for any docker changes', + ) + + if (skip_ci) { + // Don't rebuild when skipping CI + rebuild_docker_images = false + } + } + } + } +} +def ci_setup(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_clear_pytest.sh", + label: 'Clean up old workspace', + ) +} + +def python_unittest(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_python_unittest.sh", + label: 'Run Python unit tests', + ) +} + +def fsim_test(image) { + sh ( + script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", + label: 'Run VTA tests in FSIM', + ) +} + +def make_standalone_crt(image, build_dir) { + sh ( + script: """ + set -eux + ${docker_run} ${image} python3 ./tests/scripts/task_build.py \ + --sccache-bucket tvm-sccache-prod \ + --cmake-target standalone_crt \ + --build-dir build + ${docker_run} ${image} python3 ./tests/scripts/task_build.py \ + --sccache-bucket tvm-sccache-prod \ + --cmake-target crttest \ + --build-dir build + """, + label: 'Make standalone CRT', + ) +} + +def make_cpp_tests(image, build_dir) { + sh ( + script: """ + set -eux + ${docker_run} ${image} python3 ./tests/scripts/task_build.py \ + --sccache-bucket tvm-sccache-prod \ + --cmake-target cpptest \ + --build-dir ${build_dir} + """, + label: 'Make C++ tests', + ) +} + +def cmake_build(image, path, make_flag) { + sh ( + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod", + label: 'Run cmake build', + ) +} +def cpp_unittest(image) { + sh ( + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh", + 
label: 'Run C++ tests', + ) +} + +def micro_cpp_unittest(image) { + sh ( + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_microtvm_cpp_tests.sh build", + label: 'Run microTVM C++ tests', + ) +} + +cancel_previous_build() + +prepare() +def build() { + stage('Build') { + if (!skip_ci && is_docs_only_build != 1) { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu-minimal-cross-isa") { + init_git() + docker_init(ci_minimal) + timeout(time: max_time, unit: 'MINUTES') { + sh ( + script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal_cross_isa.sh build", + label: 'Create CPU minimal cmake config', + ) + cmake_build(ci_minimal, 'build', '-j2') + sh( + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal-cross-isa --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so", + label: 'Upload artifacts to S3', + ) + } + } + } + } else { + Utils.markStageSkippedForConditional('BUILD: CPU MINIMAL CROSS ISA') + } + } +} +build() + + + + +def shard_run_unittest_CPU_MINIMAL_CROSS_ISA_1_of_1() { + if (!skip_ci && is_docs_only_build != 1) { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-cpp-arm-cross-isa") { + try { + init_git() + docker_init(ci_arm) + timeout(time: max_time, unit: 'MINUTES') { + withEnv([ + 'PLATFORM=arm', + 'TEST_STEP_NAME=unittest: CPU MINIMAL CROSS ISA', + 'TVM_NUM_SHARDS=1', + 'TVM_SHARD_INDEX=0', + "SKIP_SLOW_TESTS=${skip_slow_tests}"], { + sh( + script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal-cross-isa", + label: 'Download artifacts from S3', + ) + + ci_setup(ci_arm) + sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_minimal_cross_isa.sh build" + make_cpp_tests(ci_arm, 'build') + cpp_unittest(ci_arm) + python_unittest(ci_arm) + }) + } + } finally { + try { + sh( + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU_MINIMAL_CROSS_ISA --items build/pytest-results", + label: 'Upload JUnits to S3', + ) + + junit 'build/pytest-results/*.xml' + } catch (Exception e) { + echo 'Exception during JUnit upload: ' + e.toString() + } + } + } + } + } else { + Utils.markStageSkippedForConditional('unittest: CPU MINIMAL CROSS ISA 1 of 1') + } +} + + + +def test() { + stage('Test') { + environment { + SKIP_SLOW_TESTS = "${skip_slow_tests}" + } + parallel( + 'unittest: CPU MINIMAL CROSS ISA 1 of 1': { + shard_run_unittest_CPU_MINIMAL_CROSS_ISA_1_of_1() + }, + ) + } +} +test() diff --git a/ci/jenkins/templates/minimal_cross_isa_jenkinsfile.groovy.j2 b/ci/jenkins/templates/minimal_cross_isa_jenkinsfile.groovy.j2 new file mode 100644 index 000000000000..f418b2a08ec4 --- /dev/null +++ b/ci/jenkins/templates/minimal_cross_isa_jenkinsfile.groovy.j2 @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +{% include "utils/base.groovy.j2" with context %} +{% import 'utils/macros.j2' as m with context -%} + +{% call m.invoke_build( + name='BUILD: CPU MINIMAL CROSS ISA', + node='CPU-SMALL', + condition='!skip_ci && is_docs_only_build != 1', + ws='tvm/build-cpu-minimal-cross-isa', + docker_image='ci_minimal', +) %} + sh ( + script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal_cross_isa.sh build", + label: 'Create CPU minimal cmake config', + ) + cmake_build(ci_minimal, 'build', '-j2') + {{ m.upload_artifacts(tag='cpu-minimal-cross-isa', filenames=tvm_lib + tvm_allvisible) }} +{% endcall %} + + +{% set test_method_names = [] %} + +{% call(shard_index, num_shards) m.sharded_test_step( + name="unittest: CPU MINIMAL CROSS ISA", + node="ARM-SMALL", + num_shards=1, + ws="tvm/ut-cpp-arm-cross-isa", + platform="arm", + docker_image="ci_arm", + test_method_names=test_method_names, +) %} + {{ m.download_artifacts(tag='cpu-minimal-cross-isa') }} + ci_setup(ci_arm) + sh "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_minimal_cross_isa.sh build" + make_cpp_tests(ci_arm, 'build') + cpp_unittest(ci_arm) + python_unittest(ci_arm) +{% endcall %} + + +{{ m.invoke_tests(test_method_names) -}} diff --git a/tests/python/unittest/test_micro_ms_tuning.py b/tests/python/unittest/test_micro_ms_tuning.py index 81b412fd9c88..edb27396e324 100644 --- a/tests/python/unittest/test_micro_ms_tuning.py +++ b/tests/python/unittest/test_micro_ms_tuning.py @@ -19,17 +19,20 @@ from types import MappingProxyType import pathlib import json -from tests.micro.zephyr.test_ms_tuning import create_relay_module import tvm +import tvm.testing from tvm import relay from tvm.relay.backend import Executor from tvm.contrib import graph_executor, utils from tvm import meta_schedule as ms -from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro -from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro +@tvm.testing.requires_micro def test_micro_tuning_with_meta_schedule(): + from tests.micro.zephyr.test_ms_tuning import create_relay_module + from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro + from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro + platform = "crt" target = tvm.target.target.micro(model="host") options = {} diff --git a/tests/scripts/task_config_build_minimal_cross_isa.sh b/tests/scripts/task_config_build_minimal_cross_isa.sh new file mode 100755 index 000000000000..ac556d48ed2c --- /dev/null +++ b/tests/scripts/task_config_build_minimal_cross_isa.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euxo pipefail + +BUILD_DIR=$1 +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" +cp ../cmake/config.cmake . + +echo set\(USE_SORT ON\) >> config.cmake +echo set\(USE_RELAY_DEBUG ON\) >> config.cmake +echo set\(CMAKE_BUILD_TYPE=Debug\) >> config.cmake +echo set\(CMAKE_CXX_FLAGS \"-Werror -Wp,-D_GLIBCXX_ASSERTIONS\"\) >> config.cmake +echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake +echo set\(USE_LIBBACKTRACE OFF\) >> config.cmake +echo set\(USE_CCACHE OFF\) >> config.cmake +echo set\(SUMMARIZE ON\) >> config.cmake + +architecture_type=$(uname -i) +if [ "$architecture_type" != "aarch64" ]; then + echo set\(USE_LLVM \"/usr/llvm-aarch64/bin/llvm-config --link-static\"\) >> config.cmake + + # Cross compile to aarch64 + echo set\(CMAKE_C_COMPILER aarch64-linux-gnu-gcc\) >> config.cmake + echo set\(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++\) >> config.cmake + + echo set\(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu\) >> config.cmake + echo set\(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER\) >> config.cmake + echo set\(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY\) >> config.cmake +else + # This usually runs in the ci_arm docker image. + echo set\(USE_LLVM llvm-config-8\) >> config.cmake +fi From 1feb73edc39673437588a8a9f093023a96a93c31 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 17 Jan 2023 14:12:38 -0800 Subject: [PATCH 187/286] [TVMScript] Migrate More to TVMScript Printer (#13785) This PR gradually migrates more pieces of the default printing to the TVMScript printer for TIR. Details: - Introduced a method `AsLegacyRepr`, which preserves the existing `AsRepr` behavior provided by `ReprPrinter`, so that the legacy behavior is 100% preserved. - Introduced a `Script` method on `IRModule`, `PrimFunc`, `tir.Stmt`, and `tir.PrimExpr`. The `script` method already existed on the Python side; this PR introduces it to C++ to be consistent. - Replaced TIR's `PrettyPrint` with the `operator <<` provided by the new `ReprPrinter`, which outputs the TVMScript format by default. `PrettyPrint` on Relay is fully preserved for backward compatibility.
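For illustration, here is a minimal sketch of how the new Python-side entry point can be used (the `MyModule` definition is a hypothetical example; the keyword-only `script` signature matches the `python/tvm/ir/module.py` change in this patch):

    import tvm
    from tvm.script import tir as T

    @tvm.script.ir_module
    class MyModule:  # hypothetical example module
        @T.prim_func
        def main(a: T.Buffer[(8,), "float32"], b: T.Buffer[(8,), "float32"]):
            T.func_attr({"global_symbol": "main", "tir.noalias": True})
            for i in T.serial(8):
                with T.block("copy"):
                    vi = T.axis.spatial(8, i)
                    b[vi] = a[vi]

    # Render the module as TVMScript through the new keyword-only API.
    print(MyModule.script(indent_spaces=4, print_line_numbers=True))

The C++ side gains the matching `Script` method on `IRModuleNode`, `PrimFuncNode`, `StmtNode`, and `PrimExprNode`, as shown in the headers below.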
--- include/tvm/ir/expr.h | 11 + include/tvm/ir/module.h | 55 +- include/tvm/ir/type.h | 19 + include/tvm/ir/type_functor.h | 4 +- include/tvm/node/repr_printer.h | 32 + .../tvm/script/printer/ir_docsifier_functor.h | 3 + include/tvm/script/printer/printer.h | 15 - include/tvm/tir/expr.h | 3 - include/tvm/tir/function.h | 11 + include/tvm/tir/stmt.h | 11 + python/tvm/ir/__init__.py | 56 +- python/tvm/ir/affine_type.py | 7 +- python/tvm/ir/base.py | 12 +- python/tvm/ir/module.py | 126 ++- python/tvm/ir/tensor_type.py | 7 +- python/tvm/ir/type.py | 3 +- python/tvm/relay/dataflow_pattern/__init__.py | 5 + python/tvm/relay/expr.py | 24 +- python/tvm/relay/frontend/tensorflow_ops.py | 2 +- python/tvm/relay/function.py | 9 +- python/tvm/relay/op/contrib/cutlass.py | 8 +- python/tvm/relay/op/contrib/dnnl.py | 23 +- python/tvm/relay/op/contrib/ethosu.py | 64 +- python/tvm/relay/op/contrib/tensorrt.py | 13 +- python/tvm/runtime/_ffi_node_api.py | 5 + python/tvm/runtime/object.py | 19 +- python/tvm/tir/expr.py | 86 +- python/tvm/tir/function.py | 81 +- python/tvm/tir/schedule/schedule.py | 25 +- python/tvm/tir/stmt.py | 75 ++ src/arith/iter_affine_map.cc | 12 +- src/auto_scheduler/compute_dag.cc | 22 +- src/ir/adt.cc | 5 +- src/ir/attrs.cc | 6 - src/ir/error.cc | 7 - src/ir/expr.cc | 11 - src/ir/function.cc | 16 +- src/ir/module.cc | 157 +-- src/ir/transform.cc | 3 - src/ir/type.cc | 25 - src/meta_schedule/arg_info.cc | 5 +- src/meta_schedule/database/json_database.cc | 2 +- .../task_scheduler/task_scheduler.cc | 6 +- src/meta_schedule/utils.h | 1 - src/node/repr_printer.cc | 25 + src/node/structural_equal.cc | 4 +- src/printer/model_library_format_printer.cc | 6 +- src/printer/text_printer.h | 2 - src/printer/tvmscript_printer.cc | 10 - src/relay/backend/te_compiler_cache.cc | 3 +- src/relay/ir/function.cc | 130 +++ src/relay/transforms/defunctionalization.cc | 2 +- src/script/printer/ir/ir.cc | 70 +- src/script/printer/ir/script_method.cc | 34 + src/script/printer/ir/utils.h | 16 +- src/script/printer/legacy_repr.cc | 1008 +++++++++++++++++ src/script/printer/tir/block.cc | 4 +- src/script/printer/tir/buffer.cc | 16 +- src/script/printer/tir/expr.cc | 75 +- src/script/printer/tir/for_loop.cc | 4 +- src/script/printer/tir/function.cc | 20 +- src/script/printer/tir/ir.cc | 18 +- src/script/printer/tir/script_method.cc | 59 + src/script/printer/tir/stmt.cc | 26 +- src/script/printer/tir/utils.h | 31 +- src/script/printer/utils.h | 73 ++ src/target/source/interface_c.cc | 3 +- src/target/source/source_module.cc | 3 +- src/tir/analysis/control_flow_graph.cc | 5 +- src/tir/analysis/oob_checker.cc | 1 - src/tir/analysis/verify_memory.cc | 2 +- src/tir/ir/legacy_printer.cc | 270 ----- src/tir/schedule/analysis/verify.cc | 2 +- src/tir/schedule/error.cc | 1 + src/tir/schedule/primitive/compute_inline.cc | 10 +- .../primitive/layout_transformation.cc | 4 +- src/tir/schedule/utils.h | 2 +- src/tir/transforms/common_subexpr_elim.cc | 4 +- .../transforms/common_subexpr_elim_tools.cc | 4 +- src/tir/transforms/install_debug_spans.cc | 2 +- src/tir/transforms/narrow_datatype.cc | 1 - src/tir/usmp/transform/assign_pool_info.cc | 4 +- .../test_ethosu/test_encode_constants.py | 24 +- .../test_outline_compiler_functions.py | 4 +- .../test_ethosu/test_remove_concatenates.py | 7 +- .../test_ethosu/test_replace_conv2d.py | 11 +- .../contrib/test_ethosu/test_replace_copy.py | 13 +- tests/python/contrib/test_tensorrt.py | 12 +- .../python/contrib/test_uma/test_partition.py | 7 +- tests/python/frontend/pytorch/qnn_test.py 
| 39 +- .../unittest/test_arith_deduce_bound.py | 20 +- .../test_meta_schedule_schedule_rule_mlt.py | 112 +- tests/python/unittest/test_te_schedule.py | 7 +- tests/python/unittest/test_tir_nodes.py | 54 +- ...est_tir_transform_inject_ptx_async_copy.py | 6 +- ...est_tir_transform_inject_rolling_buffer.py | 12 +- ..._tir_transform_inject_software_pipeline.py | 4 +- .../test_tir_transform_make_packed_api.py | 16 +- .../test_tir_transform_thread_sync.py | 4 +- .../unittest/test_tvmscript_complete.py | 2 +- tests/python/unittest/test_tvmscript_ops.py | 8 +- .../unittest/test_tvmscript_printer_ir.py | 49 + .../unittest/test_tvmscript_printer_tir.py | 5 +- .../unittest/test_tvmscript_regression.py | 4 +- .../unittest/test_tvmscript_roundtrip.py | 23 +- 105 files changed, 2484 insertions(+), 1005 deletions(-) create mode 100644 src/script/printer/ir/script_method.cc create mode 100644 src/script/printer/legacy_repr.cc create mode 100644 src/script/printer/tir/script_method.cc create mode 100644 src/script/printer/utils.h delete mode 100644 src/tir/ir/legacy_printer.cc create mode 100644 tests/python/unittest/test_tvmscript_printer_ir.py diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index bb4c468f452f..bfbaa7cddd4f 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -100,6 +100,17 @@ class PrimExprNode : public BaseExprNode { */ DataType dtype; + /*! + * \brief Returns the TVMScript format + * \param indent_spaces Number of spaces used for indentation + * \param print_line_numbers Whether to print line numbers + * \param num_context_lines Number of context lines to print around the underlined text + * \param path_to_underline Object path to be underlined + */ + TVM_DLL std::string Script(int indent_spaces = 4, bool print_line_numbers = false, + int num_context_lines = -1, + Optional path_to_underline = NullOpt) const; + static constexpr const char* _type_key = "PrimExpr"; static constexpr const uint32_t _type_child_slots = 38; TVM_DECLARE_BASE_OBJECT_INFO(PrimExprNode, BaseExprNode); diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index 7313b4f78349..f26e640f6c22 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -63,6 +63,26 @@ class IRModuleNode : public Object { parser::SourceMap source_map; /* \brief Additional attributes storing meta-data about the module. */ DictAttrs attrs; + /*! + * \brief A map from string names to global variables that + * ensures global uniqueness. + */ + Map global_var_map_; + + /*! \brief A map from string names to global type variables (ADT names) + * that ensures global uniqueness. + */ + Map global_type_var_map_; + + /*! \brief A map from constructor tags to constructor objects + * for convenient access + */ + std::unordered_map constructor_tag_map_; + + /*! \brief The files previously imported, required to ensure + importing is idempotent for each module. + */ + std::unordered_set import_set_; /*! * \brief Get a module attribute. @@ -304,15 +324,20 @@ class IRModuleNode : public Object { TVM_DLL void ImportFromStd(const String& path); /*! - * \brief Should Link Parameters into the module - * \return Whether the Executor is configured to execute with linked parameters (Default: false) + * \brief The set of imported files. */ - TVM_DLL Bool ShouldLinkParameters() const; + TVM_DLL std::unordered_set Imports() const; /*! - * \brief The set of imported files. 
+ * \brief Returns the TVMScript format + * \param indent_spaces Number of spaces used for indentation + * \param print_line_numbers Whether to print line numbers + * \param num_context_lines Number of context lines to print around the underlined text + * \param path_to_underline Object path to be underlined */ - TVM_DLL std::unordered_set Imports() const; + TVM_DLL std::string Script(int indent_spaces = 4, bool print_line_numbers = false, + int num_context_lines = -1, + Optional path_to_underline = NullOpt) const; static constexpr const char* _type_key = "IRModule"; static constexpr const bool _type_has_method_sequal_reduce = true; @@ -322,26 +347,6 @@ class IRModuleNode : public Object { private: /*! \brief Helper function for registering a typedef's constructors */ void RegisterConstructors(const GlobalTypeVar& var, const TypeData& type); - - /*! \brief A map from string names to global variables that - * ensures global uniqueness. - */ - Map global_var_map_; - - /*! \brief A map from string names to global type variables (ADT names) - * that ensures global uniqueness. - */ - Map global_type_var_map_; - - /*! \brief A map from constructor tags to constructor objects - * for convenient access - */ - std::unordered_map constructor_tag_map_; - - /*! \brief The files previously imported, required to ensure - importing is idempotent for each module. - */ - std::unordered_set import_set_; friend class IRModule; }; diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index 579061e02eb6..62328f6a074a 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -207,6 +207,25 @@ enum TypeKind : int { kTypeData = 6 }; +/*! \brief Converts a TypeKind to a string. */ +inline String TypeKind2String(TypeKind kind) { + switch (kind) { + case TypeKind::kType: + return "Type"; + case TypeKind::kShapeVar: + return "ShapeVar"; + case TypeKind::kBaseType: + return "BaseType"; + case TypeKind::kConstraint: + return "Constraint"; + case TypeKind::kAdtHandle: + return "AdtHandle"; + case TypeKind::kTypeData: + return "TypeData"; + } + LOG(FATAL) << "ValueError: Unknown TypeKind: " << static_cast(kind); +} + /*! * \brief Type parameter in functions. * diff --git a/include/tvm/ir/type_functor.h b/include/tvm/ir/type_functor.h index 11bf7d4740d0..334a35d052e1 100644 --- a/include/tvm/ir/type_functor.h +++ b/include/tvm/ir/type_functor.h @@ -24,9 +24,9 @@ #ifndef TVM_IR_TYPE_FUNCTOR_H_ #define TVM_IR_TYPE_FUNCTOR_H_ +#include +#include #include -#include -#include #include #include diff --git a/include/tvm/node/repr_printer.h b/include/tvm/node/repr_printer.h index 532425a51b3e..e3f59fcc14a1 100644 --- a/include/tvm/node/repr_printer.h +++ b/include/tvm/node/repr_printer.h @@ -26,6 +26,7 @@ #include #include +#include namespace tvm { /*! \brief A printer class to print the AST/IR nodes. */ @@ -48,6 +49,30 @@ class ReprPrinter { TVM_DLL static FType& vtable(); }; +/*! \brief Legacy behavior of ReprPrinter. */ +class ReprLegacyPrinter { + public: + /*! \brief The indentation level. */ + int indent{0}; + + explicit ReprLegacyPrinter(std::ostream& stream) // NOLINT(*) + : stream(stream) {} + + /*! \brief The node to be printed. */ + TVM_DLL void Print(const ObjectRef& node); + /*! \brief Print indent to the stream */ + TVM_DLL void PrintIndent(); + /*! \brief Return the ostream it maintains */ + TVM_DLL std::ostream& Stream() const; + // Allow registration to be printer. + using FType = NodeFunctor; + TVM_DLL static FType& vtable(); + + private: + /*! 
\brief The output stream */ + std::ostream& stream; +}; + /*! * \brief Dump the node to stderr, used for debug purposes. * \param node The input node @@ -70,6 +95,13 @@ inline std::ostream& operator<<(std::ostream& os, const ObjectRef& n) { // NOLI ReprPrinter(os).Print(n); return os; } + +inline std::string AsLegacyRepr(const ObjectRef& n) { + std::ostringstream os; + ReprLegacyPrinter(os).Print(n); + return os.str(); +} } // namespace runtime +using runtime::AsLegacyRepr; } // namespace tvm #endif // TVM_NODE_REPR_PRINTER_H_ diff --git a/include/tvm/script/printer/ir_docsifier_functor.h b/include/tvm/script/printer/ir_docsifier_functor.h index d04d8c4d028a..54810fd55a43 100644 --- a/include/tvm/script/printer/ir_docsifier_functor.h +++ b/include/tvm/script/printer/ir_docsifier_functor.h @@ -69,6 +69,9 @@ class IRDocsifierFunctor { if ((pf = LookupDispatchTable("", type_index)) != nullptr) { return (*pf)(obj, args...); } + LOG(WARNING) << "ObjectFunctor calls un-registered function on type: " + << runtime::Object::TypeIndex2Key(type_index) << " (token: " << token << ")" + << ". ObjectType: " << obj->GetTypeKey() << ". Object: " << obj; ICHECK(false) << "ObjectFunctor calls un-registered function on type: " << runtime::Object::TypeIndex2Key(type_index) << " (token: " << token << ")" << ". ObjectType: " << obj->GetTypeKey() << ". Object: " << obj; diff --git a/include/tvm/script/printer/printer.h b/include/tvm/script/printer/printer.h index 289e838b52a8..b373a2be73fb 100644 --- a/include/tvm/script/printer/printer.h +++ b/include/tvm/script/printer/printer.h @@ -55,21 +55,6 @@ struct Default { static bool& VerboseExpr() { return Instance()->verbose_expr; } }; -/*! - * \brief The entry method for TVMScript printing - * \param obj The object to be printed - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - * \return The TVMScript text format - */ -String Script(ObjectRef obj, // - int indent_spaces = 4, // - bool print_line_numbers = false, // - int num_context_lines = -1, // - Optional path_to_underline = NullOpt); - /*! * \brief Convert Doc into Python script. * \param doc Doc to be converted diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index 1d5e8f317a2e..689b1c0a17ad 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -1191,9 +1191,6 @@ class Any : public PrimExpr { TVM_DEFINE_OBJECT_REF_COW_METHOD(AnyNode); }; -/*! \brief Legacy ReprPrint format for TIR */ -std::string LegacyTIRPrint(const ObjectRef& obj); - /* * \brief Template function to convert Map to unordered_map * Sometimes useful for API gluing when internal uses unordered_map diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 9f7c0fa16b06..17e7de930260 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -132,6 +132,17 @@ class PrimFuncNode : public BaseFuncNode { */ TVM_DLL FuncType func_type_annotation() const; + /*! 
+ * \brief Returns the TVMScript format + * \param indent_spaces Number of spaces used for indentation + * \param print_line_numbers Whether to print line numbers + * \param num_context_lines Number of context lines to print around the underlined text + * \param path_to_underline Object path to be underlined + */ + std::string Script(int indent_spaces = 4, bool print_line_numbers = false, + int num_context_lines = -1, + Optional path_to_underline = NullOpt) const; + static constexpr const char* _type_key = "tir.PrimFunc"; TVM_DECLARE_FINAL_OBJECT_INFO(PrimFuncNode, BaseFuncNode); }; diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 96e03477a141..e0b7bcc868b3 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -46,6 +46,17 @@ class StmtNode : public Object { StmtNode() = default; explicit StmtNode(Span span) : span(span) {} + /*! + * \brief Returns the TVMScript format + * \param indent_spaces Number of spaces used for indentation + * \param print_line_numbers Whether to print line numbers + * \param num_context_lines Number of context lines to print around the underlined text + * \param path_to_underline Object path to be underlined + */ + std::string Script(int indent_spaces = 4, bool print_line_numbers = false, + int num_context_lines = -1, + Optional path_to_underline = NullOpt) const; + static constexpr const char* _type_key = "tir.Stmt"; static constexpr const bool _type_has_method_sequal_reduce = true; static constexpr const bool _type_has_method_shash_reduce = true; diff --git a/python/tvm/ir/__init__.py b/python/tvm/ir/__init__.py index 4e847c0310a4..9e81dd5519e1 100644 --- a/python/tvm/ir/__init__.py +++ b/python/tvm/ir/__init__.py @@ -16,29 +16,47 @@ # under the License. # pylint: disable=unused-import """Common data structures across all IR variants.""" -from .base import SourceName, Span, Node, EnvFunc, load_json, save_json -from .base import structural_equal, assert_structural_equal, structural_hash -from .type import Type, TypeKind, PrimType, PointerType, TypeVar, GlobalTypeVar, TupleType -from .type import TypeConstraint, FuncType, IncompleteType, RelayRefType -from .tensor_type import TensorType -from .affine_type import TensorAffineType, TupleAffineType -from .type_relation import TypeCall, TypeRelation -from .expr import BaseExpr, PrimExpr, RelayExpr, GlobalVar, Range -from .op import Op, register_op_attr, register_intrin_lowering -from .function import CallingConv, BaseFunc +from . import diagnostics, instrument, transform from .adt import Constructor, TypeData -from .module import IRModule +from .affine_type import TensorAffineType, TupleAffineType from .attrs import Attrs, DictAttrs, make_node +from .base import ( + EnvFunc, + Node, + SourceName, + Span, + assert_structural_equal, + load_json, + pretty_print, + save_json, + structural_equal, + structural_hash, +) from .container import Array, Map +from .expr import BaseExpr, GlobalVar, PrimExpr, Range, RelayExpr +from .function import BaseFunc, CallingConv from .memory_pools import ( - PoolInfo, - WorkspacePoolInfo, - ConstantPoolInfo, - WorkspaceMemoryPools, ConstantMemoryPools, + ConstantPoolInfo, + PoolInfo, PoolInfoProperties, + WorkspaceMemoryPools, + WorkspacePoolInfo, ) - -from . import transform -from . import instrument -from . 
import diagnostics
+from .module import IRModule
+from .op import Op, register_intrin_lowering, register_op_attr
+from .tensor_type import TensorType
+from .type import (
+    FuncType,
+    GlobalTypeVar,
+    IncompleteType,
+    PointerType,
+    PrimType,
+    RelayRefType,
+    TupleType,
+    Type,
+    TypeConstraint,
+    TypeKind,
+    TypeVar,
+)
+from .type_relation import TypeCall, TypeRelation
diff --git a/python/tvm/ir/affine_type.py b/python/tvm/ir/affine_type.py
index bd77c187af40..8d185ae59a34 100644
--- a/python/tvm/ir/affine_type.py
+++ b/python/tvm/ir/affine_type.py
@@ -17,8 +17,8 @@
 """Types for quantized Tensors."""
 import tvm._ffi

-from .base import Node
 from . import _ffi_api
+from .base import Node


 class AffineType(Node):
@@ -31,6 +31,11 @@ def __eq__(self, other):
     def __ne__(self, other):
         return not self.__eq__(other)

+    def __str__(self):
+        from tvm.ir import pretty_print  # pylint: disable=import-outside-toplevel
+
+        return pretty_print(self)
+

 @tvm._ffi.register_object("TensorAffineType")
 class TensorAffineType(AffineType):
diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py
index d754ae567c5e..a1e1d20d8823 100644
--- a/python/tvm/ir/base.py
+++ b/python/tvm/ir/base.py
@@ -16,13 +16,16 @@
 # under the License.
 """Common base structures."""
 import tvm._ffi
-
 import tvm.error
 import tvm.runtime._ffi_node_api
 from tvm.runtime import Object

-from . import _ffi_api
-from . import json_compact
+from . import _ffi_api, json_compact
+
+
+def pretty_print(obj: Object) -> str:
+    """Pretty print the object."""
+    return _ffi_api.PrettyPrint(obj)  # type: ignore # pylint: disable=no-member


 class Node(Object):
@@ -54,9 +57,6 @@ def astext(self, show_meta_data=True, annotate=None):
         """
         return _ffi_api.AsText(self, show_meta_data, annotate)

-    def __str__(self):
-        return _ffi_api.PrettyPrint(self)
-

 @tvm._ffi.register_object("SourceName")
 class SourceName(Object):
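Node.__str__ no longer funnels every object through PrettyPrint; classes that still want the relay text format opt in per class, and callers can invoke the new helper directly. A minimal sketch of the resulting call pattern, assuming a build that carries this patch (relay.var is used here only as a convenient way to obtain a typed node):

    import tvm
    from tvm import relay
    from tvm.ir import pretty_print

    # Any IR node can be pretty-printed explicitly now that Node.__str__ is gone.
    x = relay.var("x", shape=(2,), dtype="float32")
    ty = x.type_annotation  # a TensorType node

    print(pretty_print(ty))  # explicit helper
    print(str(ty))           # TensorType.__str__ forwards to pretty_print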
diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py
index 3ed7e57cb758..b184c3b0c3cf 100644
--- a/python/tvm/ir/module.py
+++ b/python/tvm/ir/module.py
@@ -17,13 +17,13 @@
 """IRModule that holds the functions and type definitions."""
 from typing import Optional

-from tvm._ffi.base import string_types
 import tvm._ffi
+from tvm._ffi.base import string_types

-from .base import Node
+from . import _ffi_api
 from . import expr as _expr
 from . import type as _ty
-from . import _ffi_api
+from .base import Node


 @tvm._ffi.register_object("IRModule")
@@ -252,51 +252,6 @@ def import_from_std(self, file_to_import):
         _ffi_api.Module_ImportFromStd(self, file_to_import)
         return tvm.relay.transform.InferType()(self)

-    def __str__(self):
-        return _ffi_api.PrettyPrint(self)
-
-    def __repr__(self):
-        return self.astext()
-
-    def script(self, tir_prefix: str = "T", show_meta: bool = False) -> str:
-        """Print IRModule into TVMScript
-
-        Parameters
-        ----------
-        tir_prefix : str
-            The tir namespace prefix
-
-        show_meta : bool
-            Whether to show meta information
-
-        Returns
-        -------
-        script : str
-            The TVM Script of the IRModule
-        """
-        return tvm._ffi.get_global_func("script.AsTVMScript")(
-            self, tir_prefix, show_meta
-        )  # type: ignore
-
-    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
-        """A sugar for print highlighted TVM script.
-
-        Parameters
-        ----------
-        style : str, optional
-
-            Pygmentize printing style, auto-detected if None. See
-            `tvm.script.highlight.cprint` for more details.
-
-        black_format: bool
-
-            If true (default), use the formatter Black to format the TVMScript
-        """
-        from tvm.script.highlight import cprint  # pylint: disable=import-outside-toplevel
-
-        # Use deferred import to avoid circular import while keeping cprint under tvm/script
-        cprint(self, style=style, black_format=black_format)
-
     def get_attr(self, attr_key):
         """Get the IRModule attribute.

@@ -331,3 +286,78 @@ def with_attr(self, attr_key, attr_value):
         """
         return _ffi_api.Module_WithAttr(self, attr_key, attr_value)
+
+    def script(
+        self,
+        *,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> str:
+        """Print IRModule into TVMScript
+
+        Parameters
+        ----------
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+
+        Returns
+        -------
+        script : str
+            The TVM Script of the IRModule
+        """
+        if num_context_lines is None:
+            num_context_lines = -1
+        return _ffi_api.Module_Script(  # type: ignore # pylint: disable=no-member
+            self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline
+        )
+
+    def show(
+        self,
+        *,
+        style: Optional[str] = None,
+        black_format: bool = True,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> None:
+        """A sugar for printing highlighted TVM script.
+
+        Parameters
+        ----------
+        style : str, optional
+            Pygmentize printing style, auto-detected if None. See
+            `tvm.script.highlight.cprint` for more details.
+        black_format: bool
+            If true (default), use the formatter Black to format the TVMScript
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+        """
+        from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
+            cprint,
+        )
+
+        cprint(
+            self.script(
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+            style=style,
+            black_format=black_format,
+        )
diff --git a/python/tvm/ir/tensor_type.py b/python/tvm/ir/tensor_type.py
index 22b15a397e30..7313f3c2b42c 100644
--- a/python/tvm/ir/tensor_type.py
+++ b/python/tvm/ir/tensor_type.py
@@ -17,8 +17,8 @@
 """Type relation and function for type checking."""
 import tvm._ffi

-from .type import Type
 from . import _ffi_api
+from .type import Type


 @tvm._ffi.register_object("relay.TensorType")
@@ -54,3 +54,8 @@ def concrete_shape(self):
             TypeError : If the shape is symbolic
         """
         return tuple(int(x) for x in self.shape)
+
+    def __str__(self):
+        from tvm.ir import pretty_print  # pylint: disable=import-outside-toplevel
+
+        return pretty_print(self)
diff --git a/python/tvm/ir/type.py b/python/tvm/ir/type.py
index 4fe28f1d72e2..ea06aeda2030 100644
--- a/python/tvm/ir/type.py
+++ b/python/tvm/ir/type.py
@@ -16,11 +16,12 @@
 # under the License.
 """Unified type system in the project."""
 from enum import IntEnum
+
 import tvm
 import tvm._ffi

-from .base import Node
 from .
import _ffi_api +from .base import Node class Type(Node): diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py index 1f6d8bb9ab0b..6c29825bc04d 100644 --- a/python/tvm/relay/dataflow_pattern/__init__.py +++ b/python/tvm/relay/dataflow_pattern/__init__.py @@ -46,6 +46,11 @@ def register_df_node(type_key=None): class DFPattern(Node): """Base class of all Patterns.""" + def __str__(self): + from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + + return pretty_print(self) + def __call__(self, *args): args = list(args) if len(args) == 1 and args[0] is None: diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 88b84bbe7ebc..7d60e89b59b7 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -17,17 +17,20 @@ # pylint: disable=no-else-return, invalid-name, unused-import """The expression nodes of Relay.""" from __future__ import absolute_import + from numbers import Number as _Number import numpy as _np + import tvm._ffi from tvm._ffi import base as _base -from tvm.runtime import NDArray, ndarray as _nd -from tvm.ir import RelayExpr, GlobalVar, Node +from tvm.ir import GlobalVar, Node, RelayExpr +from tvm.runtime import NDArray +from tvm.runtime import ndarray as _nd -from .base import RelayNode from . import _ffi_api from . import ty as _ty +from .base import RelayNode # alias relay expr as Expr. Expr = RelayExpr @@ -58,6 +61,11 @@ def astype(self, dtype): """ return _ffi_api.cast(self, dtype) + def __str__(self): + from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + + return pretty_print(self) + def __neg__(self): return _op_make.negative(self) @@ -710,6 +718,11 @@ class StorageInfo(Node): def __init__(self, sids, dev_types, sizes): self.__init_handle_by_constructor__(_ffi_api.StorageInfo, sids, dev_types, sizes) + def __str__(self): + from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + + return pretty_print(self) + @property def storage_ids(self): return _ffi_api.StorageInfoStorageIds(self) @@ -735,3 +748,8 @@ class StaticMemoryPlan(Node): def __init__(self, expr_to_storage_info): self.__init_handle_by_constructor__(_ffi_api.StaticMemoryPlan, expr_to_storage_info) + + def __str__(self): + from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + + return pretty_print(self) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 66bb858edbf0..e9bb15e1d1c6 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -1847,7 +1847,7 @@ def _impl(inputs, attr, params, mod): shape_arg = tuple(params_new.numpy().astype("int32").flatten()) except Exception: # Deal with symbolic shape case. - if isinstance(pop_node, _expr.Call) and "shape_of" in str(pop_node.op): + if isinstance(pop_node, _expr.Call) and "shape_of" in str(pop_node.op.name): # shape_of is the direct ancestor. return _op.reshape_like(inputs[0], pop_node.args[0]) shape_arg = pop_node diff --git a/python/tvm/relay/function.py b/python/tvm/relay/function.py index 68d8953900cf..ef3356450085 100644 --- a/python/tvm/relay/function.py +++ b/python/tvm/relay/function.py @@ -19,11 +19,11 @@ from __future__ import absolute_import import tvm._ffi -from tvm.runtime import convert from tvm.ir import BaseFunc +from tvm.runtime import convert -from .expr import Call from . 
import _ffi_api +from .expr import Call @tvm._ffi.register_object("relay.Function") @@ -67,6 +67,11 @@ def __call__(self, *args): """ return Call(self, args, None, None) + def __str__(self): + from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + + return pretty_print(self) + @tvm._ffi.register_func("relay.FunctionWithFields") def FunctionWithFields( diff --git a/python/tvm/relay/op/contrib/cutlass.py b/python/tvm/relay/op/contrib/cutlass.py index 1a441a6f03c2..6fce020a6694 100644 --- a/python/tvm/relay/op/contrib/cutlass.py +++ b/python/tvm/relay/op/contrib/cutlass.py @@ -17,12 +17,14 @@ # pylint: disable=invalid-name """Patterns supported CUTLASS.""" from functools import partial + from tvm import relay -from tvm.ir.transform import Sequential, PassContext +from tvm.ir.transform import PassContext, Sequential from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.register import register_pattern_table # type: ignore -from ...dataflow_pattern import wildcard, is_op, is_constant + +from ...dataflow_pattern import is_constant, is_op, wildcard def make_gelu_pattern(bias_out, out_dtype="float16"): @@ -124,7 +126,7 @@ def check_dtype(lhs, rhs): def get_root_call(call, root_op_name): if not isinstance(call, relay.Call): return None - if str(call.op) == root_op_name: + if str(call.op.name) == root_op_name: return call return get_root_call(call.args[0], root_op_name) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index bdf910d704ce..7db8608d6d7c 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -36,22 +36,25 @@ from functools import reduce import tvm.ir -from tvm.ir import Op from tvm import relay +from tvm.ir import Op +from tvm.relay import expr as _expr from tvm.relay import transform -from tvm.relay.expr import GlobalVar -from tvm.relay.expr_functor import ExprMutator, ExprVisitor -from tvm.relay.expr import const - from tvm.relay.analysis import analysis as _analysis -from tvm.relay import expr as _expr +from tvm.relay.expr import Call, GlobalVar, TupleGetItem, const +from tvm.relay.expr_functor import ExprMutator, ExprVisitor -from tvm.relay.expr import Call, TupleGetItem from ... 
import _ffi_api -from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr, rewrite, DFPatternCallback +from ...dataflow_pattern import ( + DFPatternCallback, + is_constant, + is_expr, + is_op, + rewrite, + wildcard, +) from .register import register_pattern_table - logger = logging.getLogger("DNNL") supported_post_elts = ["nn.relu", "tanh", "sigmoid", "clip", "gelu", "swish", "mish", None] @@ -762,7 +765,7 @@ def visit_call(self, call): ] ) if isinstance(call.op, tvm.tir.op.Op): - if str(call.op) in compute_intensive_ops: + if str(call.op.name) in compute_intensive_ops: self.is_compute_intensive = True return super().visit_call(call) diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py index a86357db39fc..bd9a7d5ba0d1 100644 --- a/python/tvm/relay/op/contrib/ethosu.py +++ b/python/tvm/relay/op/contrib/ethosu.py @@ -17,16 +17,22 @@ # pylint: disable=ungrouped-imports, import-outside-toplevel """Arm(R) Ethos(TM)-U NPU supported operators.""" import functools -from typing import Dict, List, Tuple, Callable, Optional +from typing import Callable, Dict, List, Optional, Tuple import numpy as np # type: ignore import tvm # type: ignore from tvm import relay -from tvm.relay.expr import Constant, Call # type: ignore -from tvm.relay.op.contrib.register import register_pattern_table # type: ignore -from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant, is_tuple # type: ignore +from tvm.ir import Op from tvm.relay.build_module import bind_params_by_name # type: ignore +from tvm.relay.dataflow_pattern import ( # type: ignore + is_constant, + is_op, + is_tuple, + wildcard, +) +from tvm.relay.expr import Call, Constant # type: ignore +from tvm.relay.op.contrib.register import register_pattern_table # type: ignore try: # As ethos-u-vela package is an optional TVM dependency, we want to lazy load it @@ -197,20 +203,23 @@ class QnnConv2DParams: @requires_vela def __init__(self, func_body: tvm.relay.Function): from tvm.relay.backend.contrib.ethosu.util import QConv2DArgs # type: ignore - from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs - from tvm.relay.backend.contrib.ethosu.util import RequantArgs + from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs, RequantArgs activation = None separate_padding = None - if str(func_body.op) in self.activation_map.keys(): + if str(func_body.op.name) in self.activation_map.keys(): activation = func_body requantize_op = activation.args[0] else: requantize_op = func_body bias_add = requantize_op.args[0] qnn_conv2d = bias_add.args[0] - if isinstance(qnn_conv2d.args[0], relay.Call) and str(qnn_conv2d.args[0].op) == "nn.pad": + if ( + isinstance(qnn_conv2d.args[0], relay.Call) + and isinstance(qnn_conv2d.args[0].op, Op) + and str(qnn_conv2d.args[0].op.name) == "nn.pad" + ): separate_padding = qnn_conv2d.args[0] data_layout = qnn_conv2d.attrs.data_layout self.kernel_layout = qnn_conv2d.attrs.kernel_layout @@ -330,13 +339,14 @@ class QnnConv2DTransposeParams: @requires_vela def __init__(self, func_body: tvm.relay.Function): - from tvm.relay.backend.contrib.ethosu.util import QConv2DTransposeArgs # type: ignore - from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs - from tvm.relay.backend.contrib.ethosu.util import RequantArgs + from tvm.relay.backend.contrib.ethosu.util import ( + QConv2DTransposeArgs, # type: ignore + ) + from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs, RequantArgs requantize = func_body call = func_body.args[0] - if str(call.op) == "nn.bias_add": + if 
str(call.op.name) == "nn.bias_add": bias_add = call call = call.args[0] else: @@ -561,7 +571,7 @@ class MaxPool2DParams: def __init__(self, func_body: Call): clip = None - if str(func_body.op) == "clip": + if str(func_body.op.name) == "clip": clip = func_body pool_op = clip.args[0] else: @@ -617,7 +627,7 @@ class AvgPool2DParams: def __init__(self, func_body: Call): clip = None - if str(func_body.op) == "clip": + if str(func_body.op.name) == "clip": clip = func_body cast2 = clip.args[0] else: @@ -681,19 +691,21 @@ class BinaryElementwiseParams: """ def __init__(self, func_body: Call, operator_type: str, is_quantized_operation: bool): - from tvm.relay.backend.contrib.ethosu.util import BinaryElementwiseArgs - from tvm.relay.backend.contrib.ethosu.util import RequantArgs + from tvm.relay.backend.contrib.ethosu.util import ( + BinaryElementwiseArgs, + RequantArgs, + ) current_call = func_body clip = None requantize = None if is_quantized_operation: - if str(current_call.op) == "clip": + if str(current_call.op.name) == "clip": clip = current_call current_call = clip.args[0] else: - if str(current_call.op) == "qnn.requantize": + if str(current_call.op.name) == "qnn.requantize": requantize = current_call clip = current_call.args[0] current_call = clip.args[0] @@ -1101,8 +1113,7 @@ class AbsParams: composite_name = "ethos-u.abs" def __init__(self, func_body: Call): - from tvm.relay.backend.contrib.ethosu.util import QuantizeArgs - from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs + from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs, QuantizeArgs quantize = func_body abs_op = quantize.args[0] @@ -1157,8 +1168,7 @@ class LutActivationParams: """ def __init__(self, func_body: Call): - from tvm.relay.backend.contrib.ethosu.util import QuantizeArgs - from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs + from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs, QuantizeArgs layout = "NHWC" @@ -1631,18 +1641,17 @@ class FullyConnectedParams: @requires_vela def __init__(self, func_body): from tvm.relay.backend.contrib.ethosu.util import QDenseArgs # type: ignore - from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs - from tvm.relay.backend.contrib.ethosu.util import RequantArgs + from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs, RequantArgs self.activation = None - if str(func_body.op) == "clip": + if str(func_body.op.name) == "clip": self.activation = func_body requantize_op = self.activation.args[0] else: requantize_op = func_body call = requantize_op.args[0] - if str(requantize_op.args[0].op) == "nn.bias_add": + if str(requantize_op.args[0].op.name) == "nn.bias_add": bias_add = call qnn_dense = call.args[0] else: @@ -1733,8 +1742,7 @@ class HardSwishParams: composite_name = "ethos-u.hard_swish" def __init__(self, func_body): - from tvm.relay.backend.contrib.ethosu.util import QuantizeArgs - from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs + from tvm.relay.backend.contrib.ethosu.util import DequantizeArgs, QuantizeArgs quantize = func_body divide = quantize.args[0] diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 4008b0eb3f78..0971770e5726 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -17,15 +17,22 @@ # pylint: disable=invalid-name, unused-argument, logging-format-interpolation """TensorRT supported operators.""" import logging -from typing import Tuple, List, Dict, Union, Optional, Any, Callable +from typing 
import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np # type: ignore + import tvm from tvm import relay from tvm.ir import Op from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from tvm.relay.dataflow_pattern import is_op, wildcard, is_constant, is_tuple, is_tuple_get_item +from tvm.relay.dataflow_pattern import ( + is_constant, + is_op, + is_tuple, + is_tuple_get_item, + wildcard, +) from tvm.relay.expr import Call, Constant, TupleGetItem from tvm.relay.expr_functor import ExprMutator, ExprVisitor from tvm.relay.op.contrib.register import register_pattern_table @@ -1050,7 +1057,7 @@ def visit_call(self, call: relay.expr.Call) -> None: "mean", } if isinstance(call.op, tvm.tir.op.Op): - if str(call.op) in compute_intensive_ops: + if str(call.op.name) in compute_intensive_ops: self.is_compute_intensive = True return super().visit_call(call) diff --git a/python/tvm/runtime/_ffi_node_api.py b/python/tvm/runtime/_ffi_node_api.py index 11d317b657e6..703a12f45f4b 100644 --- a/python/tvm/runtime/_ffi_node_api.py +++ b/python/tvm/runtime/_ffi_node_api.py @@ -19,6 +19,7 @@ """FFI for tvm.node""" import tvm._ffi + # The implementations below are default ones when the corresponding # functions are not available in the runtime only mode. # They will be overriden via _init_api to the ones registered @@ -27,6 +28,10 @@ def AsRepr(obj): return obj.type_key() + "(" + obj.handle.value + ")" +def AsLegacyRepr(obj): + return obj.type_key() + "(" + obj.handle.value + ")" + + def NodeListAttrNames(obj): return lambda x: 0 diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py index e522fd539b4e..6a8dd6587643 100644 --- a/python/tvm/runtime/object.py +++ b/python/tvm/runtime/object.py @@ -18,22 +18,30 @@ """Runtime Object API""" import ctypes -from tvm._ffi.base import _FFI_MODE, _RUNTIME_ONLY, check_call, _LIB, c_str +from tvm._ffi.base import _FFI_MODE, _LIB, _RUNTIME_ONLY, c_str, check_call from tvm._ffi.runtime_ctypes import ObjectRValueRef + from . 
import _ffi_api, _ffi_node_api

 try:
     # pylint: disable=wrong-import-position,unused-import
     if _FFI_MODE == "ctypes":
         raise ImportError()
-    from tvm._ffi._cy3.core import _set_class_object, _set_class_object_generic
-    from tvm._ffi._cy3.core import ObjectBase, PyNativeObject
+    from tvm._ffi._cy3.core import (
+        ObjectBase,
+        PyNativeObject,
+        _set_class_object,
+        _set_class_object_generic,
+    )
 except (RuntimeError, ImportError) as error:
     # pylint: disable=wrong-import-position,unused-import
     if _FFI_MODE == "cython":
         raise error
-    from tvm._ffi._ctypes.packed_func import _set_class_object, _set_class_object_generic
     from tvm._ffi._ctypes.object import ObjectBase, PyNativeObject
+    from tvm._ffi._ctypes.packed_func import (
+        _set_class_object,
+        _set_class_object_generic,
+    )


 def _new_object(cls):
@@ -49,6 +57,9 @@ class Object(ObjectBase):
     def __repr__(self):
         return _ffi_node_api.AsRepr(self)

+    def legacy_repr(self):
+        return _ffi_node_api.AsLegacyRepr(self)
+
     def __dir__(self):
         class_names = dir(self.__class__)
         fnames = _ffi_node_api.NodeListAttrNames(self)
diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py
index d52fbb83c368..dab7a175185d 100644
--- a/python/tvm/tir/expr.py
+++ b/python/tvm/tir/expr.py
@@ -28,15 +28,16 @@
     assert(y.a == x)
 """
 from typing import Optional, Union
-from tvm import ir
+
 import tvm._ffi
+import tvm.ir._ffi_api
+from tvm import ir
+from tvm.ir import Op, PrimExpr
 from tvm.ir.base import Span
+from tvm.runtime import DataType, DataTypeCode, Object, ObjectGeneric, const

-from tvm.runtime import Object, ObjectGeneric, DataType, DataTypeCode, const
-from tvm.ir import PrimExpr, Op
-import tvm.ir._ffi_api
-from . import generic as _generic
 from . import _ffi_api
+from . import generic as _generic


 def div_ambiguity_error():
@@ -324,6 +325,81 @@ class PrimExprWithOp(ExprOp, PrimExpr):
     # https://docs.python.org/3.1/reference/datamodel.html#object.__hash__
     __hash__ = PrimExpr.__hash__

+    def script(
+        self,
+        *,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> str:
+        """Print PrimExpr into TVMScript
+
+        Parameters
+        ----------
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+
+        Returns
+        -------
+        script : str
+            The TVM Script of the PrimExpr
+        """
+        if num_context_lines is None:
+            num_context_lines = -1
+        return _ffi_api.PrimExprScript(  # type: ignore # pylint: disable=no-member
+            self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline
+        )
+
+    def show(
+        self,
+        *,
+        style: Optional[str] = None,
+        black_format: bool = True,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> None:
+        """A sugar for printing highlighted TVM script.
+
+        Parameters
+        ----------
+        style : str, optional
+            Pygmentize printing style, auto-detected if None. See
+            `tvm.script.highlight.cprint` for more details.
+        black_format: bool
+            If true (default), use the formatter Black to format the TVMScript
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+        """
+        from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
+            cprint,
+        )
+
+        cprint(
+            self.script(
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+            style=style,
+            black_format=black_format,
+        )
+

 class ConstExpr(PrimExprWithOp):
     pass
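With the keyword-only pair above, any PrimExpr can be printed or highlighted directly. A minimal sketch of the user-facing calls, assuming a build that carries this patch and therefore registers the PrimExprScript packed function:

    import tvm
    from tvm import tir

    x = tir.Var("x", "int32")
    expr = x + 1  # a PrimExprWithOp, so script()/show() are available

    # All knobs are keyword-only and forwarded to _ffi_api.PrimExprScript.
    print(expr.script(indent_spaces=2, print_line_numbers=True))

    # show() wraps the same text in pygments highlighting.
    expr.show(black_format=False)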
diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py
index 082faeb456d3..fb5a37c5dc17 100644
--- a/python/tvm/tir/function.py
+++ b/python/tvm/tir/function.py
@@ -18,17 +18,18 @@
 import collections
 import inspect
-from typing import Callable, List, Mapping, Optional, Union, Tuple
+from typing import Callable, List, Mapping, Optional, Tuple, Union

 import tvm
 import tvm._ffi
 import tvm.runtime
-from tvm.runtime import Object
 from tvm.ir import BaseFunc, Range
-from .buffer import Buffer
-from .expr import Var, PrimExpr
-from . import _ffi_api
+from tvm.runtime import Object
+
 from ..runtime.ndarray import NDArray
+from . import _ffi_api
+from .buffer import Buffer
+from .expr import PrimExpr, Var


 @tvm._ffi.register_object("tir.PrimFunc")
@@ -169,44 +170,80 @@ def mem_copy_16_16(a: T.handle, b: T.handle) -> None:
         """
         return _ffi_api.Specialize(self, param_map)  # type: ignore

-    def script(self, tir_prefix: str = "T", show_meta: bool = False) -> str:
+    def script(
+        self,
+        *,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> str:
-        """Print IRModule into TVMScript
+        """Print PrimFunc into TVMScript

         Parameters
         ----------
-        tir_prefix : str
-            The tir namespace prefix
-
-        show_meta : bool
-            Whether to show meta information
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined

         Returns
         -------
         script : str
             The TVM Script of the PrimFunc
         """
-        return tvm._ffi.get_global_func("script.AsTVMScript")(
-            self, tir_prefix, show_meta
-        )  # type: ignore
+        if num_context_lines is None:
+            num_context_lines = -1
+        return _ffi_api.PrimFuncScript(  # type: ignore # pylint: disable=no-member
+            self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline
+        )

-    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
+    def show(
+        self,
+        *,
+        style: Optional[str] = None,
+        black_format: bool = True,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> None:
-        """A sugar for print highlighted TVM script.
+        """A sugar for printing highlighted TVM script.

         Parameters
         ----------
         style : str, optional
-
             Pygmentize printing style, auto-detected if None. See
             `tvm.script.highlight.cprint` for more details.
         black_format: bool
             If true (default), use the formatter Black to format the TVMScript
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
         """
-        from tvm.script.highlight import cprint  # pylint: disable=import-outside-toplevel
+        from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
+            cprint,
+        )

-        # Use deferred import to avoid circular import while keeping cprint under tvm/script
-        cprint(self, style=style, black_format=black_format)
+        cprint(
+            self.script(
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+            style=style,
+            black_format=black_format,
+        )


 @tvm._ffi.register_object("tir.TensorIntrin")
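tir_prefix and show_meta are gone from PrimFunc.script; the formatting knobs are keyword-only and routed through the PrimFuncScript packed function. A small before/after sketch, again assuming a build that carries this patch:

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def zeros(a: T.handle) -> None:
        A = T.match_buffer(a, (8,), "float32")
        for i in T.serial(8):
            A[i] = T.float32(0)

    # Before: zeros.script("T", False)   (positional tir_prefix / show_meta)
    # After: keyword-only, forwarded to _ffi_api.PrimFuncScript.
    print(zeros.script(indent_spaces=4))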
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 48850012cbb7..64aba0e029fe 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -239,21 +239,26 @@ def fork_seed(self) -> int:
         """
         return _ffi_api.ScheduleForkSeed(self)  # type: ignore # pylint: disable=no-member

-    @type_checked
-    def show(self, rand_var: RAND_VAR_TYPE) -> str:
-        """Returns a string representation of the value that the random variable evaluates to
+    def show(self, style: Optional[str] = None, black_format: bool = True) -> None:
+        """A sugar for printing highlighted TVM script.

         Parameters
         ----------
-        rand_var : Union[ExprRV, BlockRV, LoopRV]
-            The random variable to be evaluated
+        style : str, optional

-        Returns
-        -------
-        str_repr : str
-            The string representation
+            Pygmentize printing style, auto-detected if None. See
+            `tvm.script.highlight.cprint` for more details.
+
+        black_format: bool
+
+            If true (default), use the formatter Black to format the TVMScript
         """
-        return str(self.get(rand_var))
+        mod = self.mod
+        if mod is not None:
+            mod.show(style=style, black_format=black_format)
+        trace = self.trace
+        if trace is not None:
+            trace.show(style=style, black_format=black_format)

     ########## Lookup ##########
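Schedule.show changes meaning here: the old spelling printed a single random variable, which remains available as str(sch.get(rv)), while the new one highlights the scheduled module together with its trace. A rough sketch of both spellings, assuming a patched build (the block and loop names are illustrative):

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def fill(a: T.handle) -> None:
        A = T.match_buffer(a, (128,), "float32")
        for i in T.serial(128):
            with T.block("fill"):
                vi = T.axis.spatial(128, i)
                A[vi] = T.float32(1)

    sch = tvm.tir.Schedule(fill)
    loop = sch.get_loops(sch.get_block("fill"))[0]
    outer, inner = sch.split(loop, factors=[8, 16])

    print(str(sch.get(outer)))    # replaces the old sch.show(outer)
    sch.show(black_format=False)  # now prints the module and its trace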
diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py
index 4847e377dec1..096c13653a94 100644
--- a/python/tvm/tir/stmt.py
+++ b/python/tvm/tir/stmt.py
@@ -41,6 +41,81 @@
 class Stmt(Object):
     """Base class of all the statements."""

+    def script(
+        self,
+        *,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> str:
+        """Print Stmt into TVMScript
+
+        Parameters
+        ----------
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+
+        Returns
+        -------
+        script : str
+            The TVM Script of the Stmt
+        """
+        if num_context_lines is None:
+            num_context_lines = -1
+        return _ffi_api.StmtScript(  # type: ignore # pylint: disable=no-member
+            self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline
+        )
+
+    def show(
+        self,
+        *,
+        style: Optional[str] = None,
+        black_format: bool = True,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline=None,
+    ) -> None:
+        """A sugar for printing highlighted TVM script.
+
+        Parameters
+        ----------
+        style : str, optional
+            Pygmentize printing style, auto-detected if None. See
+            `tvm.script.highlight.cprint` for more details.
+        black_format: bool
+            If true (default), use the formatter Black to format the TVMScript
+        indent_spaces : int
+            The number of indent spaces to use in the output
+        print_line_numbers: bool
+            Whether to print line numbers
+        num_context_lines : Optional[int]
+            Number of context lines to print around the underlined text
+        path_to_underline : Optional[ObjectPath]
+            Object path to be underlined
+        """
+        from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
+            cprint,
+        )
+
+        cprint(
+            self.script(
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+            style=style,
+            black_format=black_format,
+        )
+

 @tvm._ffi.register_object("tir.LetStmt")
 class LetStmt(Stmt):
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index 03a36e803be8..af6e47b7a066 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -1288,7 +1288,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
   if (a->IsInstance() && b->IsInstance()) {
     // cannot multiply two iterators, mark as unresolved.
ErrorLogger(this) << "Product of two iterators cannot be represented as an IterMap, " - << "occurs in " << tvm::PrettyPrint(GetRef(op)); + << "occurs in " << GetRef(op); return GetRef(op); } @@ -1321,7 +1321,7 @@ IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr o } auto opt_fused = TryFuseIters(sum, check_level_); if (!opt_fused) { - ErrorLogger(this) << "Dividend " << tvm::PrettyPrint(original_dividend) + ErrorLogger(this) << "Dividend " << original_dividend << ", can't be written as a single fused IterSum"; return IterSumExpr(); } @@ -1446,8 +1446,7 @@ std::pair IterMapRewriter::PadDividendToDivisor(IterSpl // since the extent covers the full padding range. left_pad = floordiv(mark_left_pad, split->lower_factor); } else { - ErrorLogger(this) << "Detect incompatible left padding on " - << tvm::PrettyPrint(NormalizeIterMapToExpr(split)) + ErrorLogger(this) << "Detect incompatible left padding on " << NormalizeIterMapToExpr(split) << ", the iter mark is left padded with " << mark_left_pad; return {IterSplitExpr(), PrimExpr()}; } @@ -1522,8 +1521,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P } else { // mark as unresolved. ErrorLogger(this) << "Cannot represent as IterMap: the numerator's scaling factor, " - << tvm::PrettyPrint(lhs->scale) << " and the divisor " - << tvm::PrettyPrint(rhs) + << lhs->scale << " and the divisor " << rhs << " cannot be simplified to remove the scaling factor."; return PrimExpr(); } @@ -1621,7 +1619,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P // mark as unresolved. ErrorLogger(this) << "Cannot represent as IterMap: the left-hand side of FloorMod has a scaling factor, " - << tvm::PrettyPrint(lhs->scale) << " and the right-hand " << tvm::PrettyPrint(rhs) + << lhs->scale << " and the right-hand " << rhs << " cannot be used to simplify out the scaling factor."; return PrimExpr(); } diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index 3a9224227680..e03d4302c89f 100644 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1274,28 +1274,28 @@ String ComputeDAG::PrintDAG(bool simple_mode) const { ICHECK_LT(k, p_reduce->combiner->result.size()); PrimExpr combiner = p_reduce->combiner->result[k]; if (combiner->IsInstance()) { - ss << " += " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; + ss << " += " << AsLegacyRepr(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { - ss << " max= " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; + ss << " max= " << AsLegacyRepr(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { - ss << " min= " << LegacyTIRPrint(p_reduce->source[0]) << "\n"; + ss << " min= " << AsLegacyRepr(p_reduce->source[0]) << "\n"; } else if (combiner->IsInstance()) { const auto& select = combiner.as(); - ss << " select(" << LegacyTIRPrint(select->condition) // - << ", " << LegacyTIRPrint(select->true_value) // - << ", " << LegacyTIRPrint(select->false_value) // - << ")= (" << LegacyTIRPrint(p_reduce->source[0]) // - << ',' << LegacyTIRPrint(p_reduce->source[1]) // + ss << " select(" << AsLegacyRepr(select->condition) // + << ", " << AsLegacyRepr(select->true_value) // + << ", " << AsLegacyRepr(select->false_value) // + << ")= (" << AsLegacyRepr(p_reduce->source[0]) // + << ',' << AsLegacyRepr(p_reduce->source[1]) // << ")\n"; } else { - ss << "reduce" << LegacyTIRPrint(combiner) << "\n"; + ss << "reduce" << AsLegacyRepr(combiner) << "\n"; } } else { auto 
call = pop->body[k].as(); if (simple_mode && call) { - ss << " = " << LegacyTIRPrint(call->op) << "\n"; + ss << " = " << AsLegacyRepr(call->op) << "\n"; } else { - ss << " = " << LegacyTIRPrint(pop->body[k]) << "\n"; + ss << " = " << AsLegacyRepr(pop->body[k]) << "\n"; } } } diff --git a/src/ir/adt.cc b/src/ir/adt.cc index f0ce859f3f87..3533c8c514cd 100644 --- a/src/ir/adt.cc +++ b/src/ir/adt.cc @@ -21,8 +21,9 @@ * \file src/ir/adt.cc * \brief ADT type definitions. */ -#include -#include +#include +#include +#include namespace tvm { diff --git a/src/ir/attrs.cc b/src/ir/attrs.cc index af46439cff7c..f197ac4416fa 100644 --- a/src/ir/attrs.cc +++ b/src/ir/attrs.cc @@ -53,12 +53,6 @@ DictAttrs::DictAttrs(Map dict) { data_ = std::move(n); } -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << op->dict; - }); - TVM_REGISTER_NODE_TYPE(DictAttrsNode); TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode); diff --git a/src/ir/error.cc b/src/ir/error.cc index f0e78b954a41..26448d04005c 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -21,15 +21,8 @@ * \file ir/error.cc * \brief Utilities for error tracking and reporting. */ - #include #include -// NOTE: reverse dependency on relay. -// These dependencies do not happen at the interface-level, -// and are only used in minimum cases where they are clearly marked. -// -// Rationale: use relay's printer for astext. -#include // clang-format off #include diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 7ba99e34d519..050d9b87a856 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -25,11 +25,6 @@ #include #include #include -// NOTE: reverse dependency on top/tir. -// These dependencies do not happen at the interface-level, -// and are only used in minimum cases where they are clearly marked. -// -// Rationale: convert from IterVar and top::Tensor #include #include @@ -168,12 +163,6 @@ TVM_REGISTER_GLOBAL("ir.GlobalVar").set_body_typed([](String name, Type type) { return GlobalVar(name, type); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "GlobalVar(" << node->name_hint << ")"; - }); - TVM_REGISTER_GLOBAL("ir.DebugPrint").set_body_typed([](ObjectRef ref) { std::stringstream ss; ss << ref; diff --git a/src/ir/function.cc b/src/ir/function.cc index dcfddd5f69d5..ce294708b2a9 100644 --- a/src/ir/function.cc +++ b/src/ir/function.cc @@ -23,12 +23,6 @@ */ #include #include -// NOTE: reverse dependency on relay, tir/ -// These dependencies do not happen at the interface-level, -// and are only used in minimum cases where they are clearly marked. 
-// -// Rationale: We calls into the type specific WithAttr function -#include #include namespace tvm { @@ -41,11 +35,13 @@ TVM_REGISTER_GLOBAL("ir.BaseFuncWithAttr") .set_body_typed([](BaseFunc func, String key, ObjectRef value) -> BaseFunc { if (func->IsInstance()) { return WithAttr(Downcast(std::move(func)), key, value); - } else if (func->IsInstance()) { - return WithAttr(Downcast(std::move(func)), key, value); - } else { - LOG(FATAL) << "Do not support function type " << func->GetTypeKey(); } + if (const auto* f = runtime::Registry::Get("relay.ir.FuncWithAttr")) { + if (Optional ret = (*f)(func, key, value)) { + return ret.value(); + } + } + LOG(FATAL) << "Do not support function type " << func->GetTypeKey(); }); } // namespace tvm diff --git a/src/ir/module.cc b/src/ir/module.cc index def94a046855..b6923cd1e60d 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -23,19 +23,10 @@ */ #include #include -#include -#include -// NOTE: reverse dependency on relay. -// These dependencies do not happen at the interface-level, -// and are only used in minimum cases where they are clearly marked. -// -// Rationale: We calls into relay's analysis module to verify correctness. #include +#include #include -#include -#include -#include -#include +#include #include #include @@ -182,26 +173,11 @@ tvm::Array IRModuleNode::GetGlobalTypeVars() const { return tvm::Array(global_type_vars); } -void WarnIfMalformed(const IRModule& mod, relay::Function func) { - func = Downcast(relay::DeDup(func)); - // Type check the item before we add it to the module. - auto fv = relay::FreeVars(func); - auto ftv = relay::FreeTypeVars(func, mod); - // TODO(@jroesch): refactor to use diagnostic context - ICHECK_EQ(fv.size(), 0) << "Function:" << std::endl - << PrettyPrint(func) << std::endl - << "contains free variables: " << fv; - ICHECK_EQ(ftv.size(), 0) << "Function:" << std::endl - << PrettyPrint(func) << std::endl - << "contains free type variables: " << fv; -} - void IRModuleNode::Add(const GlobalVar& var, const BaseFunc& f, bool update) { BaseFunc checked_func = f; - if (auto* ptr = f.as()) { - WarnIfMalformed(GetRef(this), GetRef(ptr)); + if (const auto* f = runtime::Registry::Get("relay.ir.WarnIfMalformed")) { + (*f)(GetRef(this), checked_func); } - AddUnchecked(var, checked_func); } @@ -212,8 +188,7 @@ void IRModuleNode::AddUnchecked(const GlobalVar& var, const BaseFunc& func) { if (it != global_var_map_.end()) { ICHECK_EQ((*it).second, var); } else { - ICHECK(global_var_map_.count(var->name_hint) == 0) - << "Duplicate global function name " << PrettyPrint(var); + ICHECK(global_var_map_.count(var->name_hint) == 0) << "Duplicate global function name " << var; } global_var_map_.Set(var->name_hint, var); @@ -243,7 +218,7 @@ void IRModuleNode::AddTypeDefUnchecked(const GlobalTypeVar& var, const TypeData& if (!update) { // set global type var map ICHECK(global_type_var_map_.count(var->name_hint) == 0) - << "Duplicate global type definition name " << PrettyPrint(var); + << "Duplicate global type definition name " << var; } global_type_var_map_.Set(var->name_hint, var); RegisterConstructors(var, type); @@ -266,7 +241,7 @@ void IRModuleNode::Remove(const GlobalVar& var) { BaseFunc IRModuleNode::Lookup(const GlobalVar& var) const { auto it = functions.find(var); - ICHECK(it != functions.end()) << "There is no definition of " << PrettyPrint(var); + ICHECK(it != functions.end()) << "There is no definition of " << var; return (*it).second; } @@ -277,7 +252,7 @@ BaseFunc IRModuleNode::Lookup(const String& name) const { 
TypeData IRModuleNode::LookupTypeDef(const GlobalTypeVar& var) const { auto it = type_definitions.find(var); - ICHECK(it != type_definitions.end()) << "There is no definition of " << PrettyPrint(var); + ICHECK(it != type_definitions.end()) << "There is no definition of " << var; return (*it).second; } @@ -292,70 +267,14 @@ Constructor IRModuleNode::LookupTag(const int32_t tag) { return (*it).second; } -/*! - * \brief Renames global type/term variables to prefer the GlobalTypeVar/GlobalVar in the lhs - * ('one') side above the rhs ('two'). - */ -struct Renamer : relay::ExprMutator, TypeMutator { - Map defs; - Map types; - std::unordered_map ctors; - - Renamer(Map defs_one, Map defs_two, - Map types_one, Map types_two, - std::unordered_map ctors_one, - std::unordered_map ctor_two) { - for (auto pair : defs_one) { - defs.Set(pair.first, pair.second); - } - - for (auto pair : defs_two) { - auto it = defs.find(pair.first); - if (it == defs.end()) { - defs.Set(pair.first, pair.second); - } - } - - for (auto pair : types_one) { - types.Set(pair.first, pair.second); - } - - for (auto pair : types_two) { - auto it = types.find(pair.first); - if (it == types.end()) { - types.Set(pair.first, pair.second); - } - } - } - - relay::Expr VisitExpr_(const GlobalVarNode* node) override { return defs.at(node->name_hint); } - - Type VisitType_(const GlobalTypeVarNode* node) override { return types.at(node->name_hint); } -}; - void IRModuleNode::Update(const IRModule& mod) { - Renamer renamer(this->global_var_map_, mod->global_var_map_, this->global_type_var_map_, - mod->global_type_var_map_, this->constructor_tag_map_, mod->constructor_tag_map_); - - this->global_var_map_ = renamer.defs; - this->global_type_var_map_ = renamer.types; - this->constructor_tag_map_ = renamer.ctors; - - for (auto pair : mod->type_definitions) { - auto tvar = renamer.types.at(pair.first->name_hint); - auto ty = renamer.ExprMutator::VisitType(pair.second); - this->AddTypeDefUnchecked(tvar, Downcast(ty), true); + if (const auto* f = runtime::Registry::Get("relay.ir.IRModuleUpdateWithRenamer")) { + (*f)(GetRef(this), mod); + return; } - for (auto pair : mod->functions) { - if (auto rfn = pair.second.as()) { - auto gvar = renamer.defs.at(pair.first->name_hint); - auto fn = renamer.VisitExpr(GetRef(rfn)); - this->AddUnchecked(gvar, Downcast(fn)); - } else { - // TODO(@jroesch): rename into IRModule. - this->AddUnchecked(pair.first, pair.second); - } + // TODO(@jroesch): rename into IRModule. + this->AddUnchecked(pair.first, pair.second); } } @@ -379,8 +298,10 @@ std::pair IRModule::FromExprInContext( // Function literal has been annotated with it's required global symbol. 
gv_name = opt.value(); } + } else if (const auto* f = runtime::Registry::Get("relay.ir.FunctionFromExprInContext")) { + func = (*f)(expr, mod); } else { - func = relay::Function(relay::FreeVars(expr), expr, Type(), relay::FreeTypeVars(expr, mod), {}); + LOG(FATAL) << "`relay.ir.FunctionFromExprInContext` is not registered"; } GlobalVar main_gv; @@ -418,14 +339,6 @@ void IRModuleNode::ImportFromStd(const String& path) { this->Import(std_path + "/" + path); } -Bool IRModuleNode::ShouldLinkParameters() const { - Optional executor = GetAttr(tvm::attr::kExecutor); - if (!executor.defined()) { - return Bool(false); - } - return executor.value()->ShouldLinkParameters(); -} - std::unordered_set IRModuleNode::Imports() const { return this->import_set_; } IRModule IRModule::FromText(const String& text, const String& source_path) { @@ -440,29 +353,15 @@ TVM_REGISTER_GLOBAL("ir.IRModule") return IRModule(funcs, types, {}); }); -TVM_REGISTER_GLOBAL("ir.Module_Add").set_body([](TVMArgs args, TVMRetValue* ret) { - IRModule mod = args[0]; - GlobalVar var = args[1]; - ObjectRef val = args[2]; - bool update = args[3]; - ICHECK(val->IsInstance()); - - if (val->IsInstance()) { - mod->Add(var, Downcast(val), update); - } else if (val->IsInstance()) { - GlobalVar gv = Downcast(val); - auto mod_copy = IRModule(make_object(*mod.operator->())); - mod_copy = relay::transform::EtaExpand( - /* expand_constructor */ false, - /* expand_global_var */ true)(mod_copy); - auto func = mod_copy->Lookup(gv->name_hint); - mod->Add(var, Downcast(func), update); - } else { - auto func = relay::Function({}, Downcast(val), Type(nullptr), {}); - mod->Add(var, func, update); - } - *ret = mod; -}); +TVM_REGISTER_GLOBAL("ir.Module_Add") + .set_body_typed([](IRModule mod, GlobalVar var, ObjectRef val, bool update) -> IRModule { + ICHECK(val->IsInstance()); + if (const auto* f = runtime::Registry::Get("relay.ir.IRModuleAdd")) { + return (*f)(mod, var, val, update); + } + mod->Add(var, Downcast(val), update); + return mod; + }); TVM_REGISTER_GLOBAL("ir.Module_AddDef").set_body_method(&IRModuleNode::AddTypeDef); @@ -530,10 +429,4 @@ TVM_REGISTER_GLOBAL("ir.Module_GetAttr").set_body_typed([](IRModule mod, String return mod->GetAttr(key); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "IRModule(" << node->functions << ")"; - }); - } // namespace tvm diff --git a/src/ir/transform.cc b/src/ir/transform.cc index e0f08d28fb18..bfd0a5917556 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -377,7 +377,6 @@ IRModule ModulePassNode::operator()(IRModule mod, const PassContext& pass_ctx) c VLOG_CONTEXT << pass_info->name; VLOG(0) << "Executing module pass with opt level: " << pass_info->opt_level; - VLOG(1) << "Input module:" << std::endl << PrettyPrint(mod); mod = pass_func(std::move(mod), pass_ctx); @@ -389,8 +388,6 @@ IRModule ModulePassNode::operator()(IRModule mod, const PassContext& pass_ctx) c pass_ctx->diag_ctx.value().Render(); pass_ctx->diag_ctx = previous; - VLOG(1) << "Result module:" << std::endl << PrettyPrint(mod); - return mod; } diff --git a/src/ir/type.cc b/src/ir/type.cc index ee05fd03596a..d965406e8bb0 100644 --- a/src/ir/type.cc +++ b/src/ir/type.cc @@ -65,12 +65,6 @@ TVM_REGISTER_GLOBAL("ir.TypeVar").set_body_typed([](String name, int kind) { return TypeVar(name, static_cast(kind)); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = 
static_cast(ref.get()); - p->stream << "TypeVar(" << node->name_hint << ", " << node->kind << ")"; - }); - GlobalTypeVar::GlobalTypeVar(String name, TypeKind kind, Span span) { ObjectPtr n = make_object(); n->name_hint = std::move(name); @@ -85,12 +79,6 @@ TVM_REGISTER_GLOBAL("ir.GlobalTypeVar").set_body_typed([](String name, int kind) return GlobalTypeVar(name, static_cast(kind)); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "GlobalTypeVar(" << node->name_hint << ", " << node->kind << ")"; - }); - FuncType::FuncType(tvm::Array arg_types, Type ret_type, tvm::Array type_params, tvm::Array type_constraints, Span span) { ObjectPtr n = make_object(); @@ -110,13 +98,6 @@ TVM_REGISTER_GLOBAL("ir.FuncType") return FuncType(arg_types, ret_type, type_params, type_constraints); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "FuncType(" << node->type_params << ", " << node->arg_types << ", " - << node->ret_type << ", " << node->type_constraints << ")"; - }); - TupleType::TupleType(Array fields, Span span) { ObjectPtr n = make_object(); n->fields = std::move(fields); @@ -158,10 +139,4 @@ TVM_REGISTER_GLOBAL("ir.RelayRefType").set_body_typed([](Type value) { TVM_REGISTER_NODE_TYPE(RelayRefTypeNode); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { - auto* node = static_cast(ref.get()); - p->stream << "RelayRefTypeNode(" << node->value << ")"; - }); - } // namespace tvm diff --git a/src/meta_schedule/arg_info.cc b/src/meta_schedule/arg_info.cc index 4663fd90762a..c90d92f83b39 100644 --- a/src/meta_schedule/arg_info.cc +++ b/src/meta_schedule/arg_info.cc @@ -52,13 +52,12 @@ inline tir::PrimFunc FindEntryFunc(const IRModule& mod) { } // Priority 3: The only PrimFunc in the IRModule if (num_prim_func == 0) { - LOG(FATAL) << "ValueError: Cannot find any PrimFunc in the given IRModule: " - << tir::AsTVMScript(mod); + LOG(FATAL) << "ValueError: Cannot find any PrimFunc in the given IRModule: " << mod; } if (num_prim_func > 1) { LOG(FATAL) << "ValueError: Multiple PrimFuncs exist in the IRModule, but none of them are " "annotated with `kIsEntryFunc`, i.e. `tir.is_entry_func`" - << tir::AsTVMScript(mod); + << mod; } return GetRef(last_func); } diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 22d6ec849c5f..b0fba5adb5c2 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -196,7 +196,7 @@ Database Database::JSONDatabase(String path_workload, String path_tuning_record, } catch (std::runtime_error& e) { LOG(FATAL) << "ValueError: Unable to parse TuningRecord, on line " << (task_id + 1) << " of file " << path_tuning_record << ". The workload is:\n" - << (workload.defined() ? tir::AsTVMScript(workload->mod) : "(null)") + << (workload.defined() ? 
workload->mod->Script() : "(null)") << "\nThe JSONObject of TuningRecord is:\n" << json_obj << "\nThe error message is:\n" << e.what(); diff --git a/src/meta_schedule/task_scheduler/task_scheduler.cc b/src/meta_schedule/task_scheduler/task_scheduler.cc index 9d859947e4fe..404ee01983c5 100644 --- a/src/meta_schedule/task_scheduler/task_scheduler.cc +++ b/src/meta_schedule/task_scheduler/task_scheduler.cc @@ -32,7 +32,7 @@ TaskRecord::TaskRecord(TuneContext ctx, double task_weight) { << "ValueError: Require `context.space_generator`, but it is not defined"; CHECK(ctx->search_strategy.defined()) << "ValueError: Require `context.search_strategy`, but it is not defined"; - TVM_PY_LOG(INFO, ctx->logger) << "\n" << tir::AsTVMScript(ctx->mod); + TVM_PY_LOG(INFO, ctx->logger) << "\n" << ctx->mod; ctx->Initialize(); n->flop = std::max(1.0, tir::EstimateTIRFlops(ctx->mod.value())); this->data_ = std::move(n); @@ -124,7 +124,7 @@ void TaskCleanUp(TaskRecordNode* self, int task_id, const Array& r << (builder_result->error_msg.defined() ? "building" : "running") << ":\n" << err << "\n" - << tir::AsTVMScript(sch->mod()) << "\n" + << sch->mod() << "\n" << Concat(sch->trace().value()->AsPython(false), "\n"); } else { double best_ms = *std::min_element(self->latency_ms.begin(), self->latency_ms.end()); @@ -168,7 +168,7 @@ void TaskSchedulerNode::Tune(Array ctxs, Array task_weigh tir::Trace trace = sch->trace().value(); trace = trace->Simplified(true); TVM_PY_LOG(INFO, ctx->logger) << "Design space #" << i << ":\n" - << tir::AsTVMScript(sch->mod()) << "\n" + << sch->mod() << "\n" << Concat(trace->AsPython(false), "\n"); } ctx->search_strategy.value()->PreTuning(max_trials_per_task, num_trials_per_iter, design_spaces, diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index 6039423844e8..9a372dde8f6d 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -48,7 +48,6 @@ #include #include -#include "../printer/text_printer.h" #include "../support/array.h" #include "../support/base64.h" #include "../support/nd_int_set.h" diff --git a/src/node/repr_printer.cc b/src/node/repr_printer.cc index ea263439023f..63bba67dd5f2 100644 --- a/src/node/repr_printer.cc +++ b/src/node/repr_printer.cc @@ -51,6 +51,28 @@ ReprPrinter::FType& ReprPrinter::vtable() { return inst; } +void ReprLegacyPrinter::Print(const ObjectRef& node) { + static const FType& f = vtable(); + if (!node.defined()) { + stream << "(nullptr)"; + } else if (f.can_dispatch(node)) { + f(node, this); + } else { + stream << node; // Use ReprPrinter + } +} + +void ReprLegacyPrinter::PrintIndent() { + for (int i = 0; i < indent; ++i) { + stream << ' '; + } +} + +ReprLegacyPrinter::FType& ReprLegacyPrinter::vtable() { + static FType inst; + return inst; +} + void Dump(const runtime::ObjectRef& n) { std::cerr << n << "\n"; } void Dump(const runtime::Object* n) { Dump(runtime::GetRef(n)); } @@ -60,4 +82,7 @@ TVM_REGISTER_GLOBAL("node.AsRepr").set_body_typed([](runtime::ObjectRef obj) { os << obj; return os.str(); }); + +TVM_REGISTER_GLOBAL("node.AsLegacyRepr").set_body_typed(runtime::AsLegacyRepr); + } // namespace tvm diff --git a/src/node/structural_equal.cc b/src/node/structural_equal.cc index 0290b7afe3fd..80e390d9b0ad 100644 --- a/src/node/structural_equal.cc +++ b/src/node/structural_equal.cc @@ -314,9 +314,9 @@ class SEqualHandlerDefault::Impl { } if (assert_mode_ && !result) { LOG(FATAL) << "ValueError: StructuralEqual check failed, caused by lhs:" << std::endl - << PrettyPrint(lhs) << std::endl + << lhs << std::endl << 
"and rhs:" << std::endl - << PrettyPrint(rhs); + << rhs; } return result; } diff --git a/src/printer/model_library_format_printer.cc b/src/printer/model_library_format_printer.cc index f6ac39ce79ff..4220aa00f5a4 100644 --- a/src/printer/model_library_format_printer.cc +++ b/src/printer/model_library_format_printer.cc @@ -38,9 +38,9 @@ class ModelLibraryFormatPrinter : public ::tvm::runtime::ModuleNode { const char* type_key() const final { return "model_library_format_printer"; } std::string Print(const ObjectRef& node) { - Doc doc; - doc << text_printer_.PrintFinal(node); - return doc.str(); + std::ostringstream oss; + oss << node; + return oss.str(); } TVMRetValue GetVarName(tir::Var var) { diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index afc76112879e..925c2ebf494e 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -409,8 +409,6 @@ class TIRTextPrinter : public StmtFunctor, Doc PrintBody(const Stmt& body, bool indent = true); }; -String AsTVMScript(const ObjectRef& mod, const String& tir_prefix = "T", bool show_meta = false); - String AsTVMScriptWithDiagnostic(const ObjectRef& mod, const String& tir_prefix, bool show_meta, runtime::TypedPackedFunc annotate); diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 274b9542cc92..c578bc53d3d3 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -2002,16 +2002,6 @@ Doc TVMScriptPrinterWithDiagnostic::PrintLoop(const For& loop) { return res; } -String AsTVMScript(const ObjectRef& mod, const String& tir_prefix, bool show_meta) { - ICHECK(mod->IsInstance() || mod->IsInstance()); - Doc doc; - doc << TVMScriptPrinter::PrintHeader(tir_prefix) - << TVMScriptPrinter(tir_prefix, show_meta).Print(mod); - return doc.str() + "\n"; -} - -TVM_REGISTER_GLOBAL("script.AsTVMScript").set_body_typed(AsTVMScript); - String AsTVMScriptWithDiagnostic(const ObjectRef& mod, const String& tir_prefix, bool show_meta, runtime::TypedPackedFunc annotate) { ICHECK(mod->IsInstance() || mod->IsInstance()); diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index d71cbcfc667d..154101fc94fe 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -51,7 +51,6 @@ #include #include -#include "../../printer/text_printer.h" #include "../../te/operation/create_primfunc.h" #include "../op/memory/memory.h" #include "../src/meta_schedule/module_equality.h" @@ -646,7 +645,7 @@ class ScheduleBuilder : public ExprVisitor { // (dispatch & 4): controls whether to raise fatal errors for missing TIR if (dispatch & 2) { LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint << "\n" - << tir::AsTVMScript(f.value()); + << f.value(); } else { LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint; } diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc index 07cfb27b1d35..3ff5eaa059c1 100644 --- a/src/relay/ir/function.cc +++ b/src/relay/ir/function.cc @@ -21,7 +21,11 @@ * \file src/relay/ir/function.cc * \brief Function in relay. 
 */
+#include <tvm/ir/module.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/expr_functor.h>
 #include <tvm/relay/function.h>
+#include <tvm/relay/transform.h>

 namespace tvm {
 namespace relay {
@@ -119,6 +123,132 @@ const FunctionNode* AsOptimizableFunctionNode(const BaseFunc& base_func) {
   }
   return nullptr;
 }
+TVM_REGISTER_GLOBAL("relay.ir.PrintRelayModule")
+    .set_body_typed([](IRModule mod) -> Optional<String> {
+      for (const auto& it : mod->functions) {
+        if (it.second->IsInstance<relay::FunctionNode>()) {
+          return PrettyPrint(mod);
+        }
+      }
+      return NullOpt;
+    });
+
+TVM_REGISTER_GLOBAL("relay.ir.WarnIfMalformed")
+    .set_body_typed([](const IRModule& mod, const BaseFunc& base_func) -> void {
+      if (const auto* relay_func = base_func.as<relay::FunctionNode>()) {
+        Function func = Downcast<Function>(relay::DeDup(GetRef<Function>(relay_func)));
+        // Type check the item before we add it to the module.
+        auto fv = relay::FreeVars(func);
+        auto ftv = relay::FreeTypeVars(func, mod);
+        // TODO(@jroesch): refactor to use diagnostic context
+        ICHECK_EQ(fv.size(), 0) << "Function:" << std::endl
+                                << PrettyPrint(func) << std::endl
+                                << "contains free variables: " << fv;
+        ICHECK_EQ(ftv.size(), 0) << "Function:" << std::endl
+                                 << PrettyPrint(func) << std::endl
+                                 << "contains free type variables: " << ftv;
+      }
+    });
+TVM_REGISTER_GLOBAL("relay.ir.IRModuleAdd")
+    .set_body_typed([](IRModule mod, GlobalVar var, ObjectRef val, bool update) -> IRModule {
+      if (val->IsInstance<BaseFuncNode>()) {
+        mod->Add(var, Downcast<BaseFunc>(val), update);
+      } else if (val->IsInstance<GlobalVarNode>()) {
+        GlobalVar gv = Downcast<GlobalVar>(val);
+        IRModule mod_copy(make_object<IRModuleNode>(*mod.operator->()));
+        mod_copy = relay::transform::EtaExpand(
+            /* expand_constructor */ false,
+            /* expand_global_var */ true)(mod_copy);
+        auto func = mod_copy->Lookup(gv->name_hint);
+        mod->Add(var, Downcast<BaseFunc>(func), update);
+      } else {
+        auto func = relay::Function({}, Downcast<RelayExpr>(val), Type(nullptr), {});
+        mod->Add(var, func, update);
+      }
+      return mod;
+    });
+
+TVM_REGISTER_GLOBAL("relay.ir.IRModuleUpdateWithRenamer")
+    .set_body_typed([](IRModule self, IRModule mod) -> void {
+      struct Renamer : relay::ExprMutator, TypeMutator {
+        Map<String, GlobalVar> defs;
+        Map<String, GlobalTypeVar> types;
+        std::unordered_map<int32_t, Constructor> ctors;
+
+        Renamer(Map<String, GlobalVar> defs_one, Map<String, GlobalVar> defs_two,
+                Map<String, GlobalTypeVar> types_one, Map<String, GlobalTypeVar> types_two,
+                std::unordered_map<int32_t, Constructor> ctors_one,
+                std::unordered_map<int32_t, Constructor> ctor_two) {
+          for (auto pair : defs_one) {
+            defs.Set(pair.first, pair.second);
+          }
+
+          for (auto pair : defs_two) {
+            auto it = defs.find(pair.first);
+            if (it == defs.end()) {
+              defs.Set(pair.first, pair.second);
+            }
+          }
+
+          for (auto pair : types_one) {
+            types.Set(pair.first, pair.second);
+          }
+
+          for (auto pair : types_two) {
+            auto it = types.find(pair.first);
+            if (it == types.end()) {
+              types.Set(pair.first, pair.second);
+            }
+          }
+        }
+
+        relay::Expr VisitExpr_(const GlobalVarNode* node) override {
+          return defs.at(node->name_hint);
+        }
+
+        Type VisitType_(const GlobalTypeVarNode* node) override {
+          return types.at(node->name_hint);
+        }
+      };
+
+      Renamer renamer(self->global_var_map_, mod->global_var_map_, self->global_type_var_map_,
+                      mod->global_type_var_map_, self->constructor_tag_map_,
+                      mod->constructor_tag_map_);
+
+      self->global_var_map_ = renamer.defs;
+      self->global_type_var_map_ = renamer.types;
+      self->constructor_tag_map_ = renamer.ctors;
+
+      for (auto pair : mod->type_definitions) {
+        auto tvar = renamer.types.at(pair.first->name_hint);
+        auto ty = renamer.ExprMutator::VisitType(pair.second);
+        self->AddTypeDefUnchecked(tvar, Downcast<TypeData>(ty), true);
+      }
+
+      for (auto pair : mod->functions) {
+        if (auto rfn = pair.second.as<relay::FunctionNode>()) {
+          auto gvar = renamer.defs.at(pair.first->name_hint);
+          auto fn = renamer.VisitExpr(GetRef<relay::Function>(rfn));
+          self->AddUnchecked(gvar,
Downcast(fn)); + } else { + // TODO(@jroesch): rename into IRModule. + self->AddUnchecked(pair.first, pair.second); + } + } + }); + +TVM_REGISTER_GLOBAL("relay.ir.FunctionFromExprInContext") + .set_body_typed([](RelayExpr expr, IRModule mod) -> Function { + return Function(relay::FreeVars(expr), expr, Type(), relay::FreeTypeVars(expr, mod), {}); + }); + +TVM_REGISTER_GLOBAL("relay.ir.FuncWithAttr") + .set_body_typed([](BaseFunc func, String key, ObjectRef value) -> Optional { + if (func->IsInstance()) { + return WithAttr(Downcast(std::move(func)), key, value); + } + return NullOpt; + }); TVM_REGISTER_NODE_TYPE(FunctionNode); diff --git a/src/relay/transforms/defunctionalization.cc b/src/relay/transforms/defunctionalization.cc index 5ee3bbcef48f..59f94e0cdd86 100644 --- a/src/relay/transforms/defunctionalization.cc +++ b/src/relay/transforms/defunctionalization.cc @@ -312,7 +312,7 @@ class DefuncMutator : public ExprMutator { */ std::string TypeToString(const Type& t) { std::ostringstream s; - s << t; + s << t->GetTypeKey(); return s.str(); } diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc index c4ecf92e9116..5cd459be6696 100644 --- a/src/script/printer/ir/ir.cc +++ b/src/script/printer/ir/ir.cc @@ -16,6 +16,8 @@ * specific language governing permissions and limitations * under the License. */ +#include + #include "./utils.h" namespace tvm { @@ -50,7 +52,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) BaseFunc func = kv.second; (*f)->stmts.push_back(d->AsDoc(func, p->Attr("functions")->MapValue(gv))); } - return ClassDoc(IdDoc("Module"), {IR(d)}, (*f)->stmts); + return ClassDoc(IdDoc("Module"), {IR("ir_module")}, (*f)->stmts); } }); @@ -61,14 +63,76 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](GlobalVar gv, ObjectPath p, IRDocsifier d) -> Doc { - return IdDoc("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint)}); + return IR("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint)}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Op op, ObjectPath p, IRDocsifier d) -> Doc { - return IdDoc("Op")->Call({LiteralDoc::Str(op->name)}); + return IR("Op")->Call({LiteralDoc::Str(op->name)}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](TypeVar type_var, ObjectPath p, IRDocsifier d) -> Doc { + return IR("TypeVar")->Call({LiteralDoc::Str(type_var->name_hint), // + LiteralDoc::Str(TypeKind2String(type_var->kind))}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch( // + "", [](GlobalTypeVar type_var, ObjectPath p, IRDocsifier d) -> Doc { + return IR("GlobalTypeVar") + ->Call({LiteralDoc::Str(type_var->name_hint), // + LiteralDoc::Str(TypeKind2String(type_var->kind))}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](RelayRefType ref, ObjectPath p, IRDocsifier d) -> Doc { + return IR("RelayRef")->Call({d->AsDoc(ref->value, p->Attr("value"))}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](TensorType type, ObjectPath p, IRDocsifier d) -> Doc { + return IR("TensorType") + ->Call({d->AsDoc(type->shape, p->Attr("shape")), + LiteralDoc::DataType(type->dtype)}); + }); + +TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](FuncType func_type, ObjectPath p, IRDocsifier d) -> Doc { + return IR("FuncType") + ->Call({ + d->AsDoc(func_type->type_params, p->Attr("type_params")), + d->AsDoc(func_type->arg_types, p->Attr("arg_types")), + d->AsDoc(func_type->ret_type, p->Attr("ret_type")), + }); }); 
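// Illustrative sketch only, not part of the patch: with the IRDocsifier
// dispatches above and the default "ir" prefix `I`, these nodes render roughly
// as follows (the exact prefix is whatever Default::Prefix("ir") resolves to
// at print time):
//
//   I.GlobalVar("main")
//   I.TypeVar("t", "Type")
//   I.TensorType([1, 16], "float32")
//   I.FuncType([], [I.TensorType([1, 16], "float32")], I.TensorType([1], "float32"))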
+TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) + .set_dispatch("", [](IncompleteType ty, ObjectPath p, IRDocsifier d) -> Doc { + return IR("IncompleteType")->Call({}); + }); + +void ReprPrintIRModule(const ObjectRef& mod, ReprPrinter* p) { + if (const auto* f = runtime::Registry::Get("relay.ir.PrintRelayModule")) { + if (Optional s = (*f)(mod)) { + p->stream << s.value(); + return; + } + } + std::string res = + DocToPythonScript(IRDocsifier()->AsDoc(Downcast(mod), ObjectPath::Root())); + p->stream << res; +} + +TVM_SCRIPT_REPR(TypeVarNode, ReprPrintIR); +TVM_SCRIPT_REPR(GlobalTypeVarNode, ReprPrintIR); +TVM_SCRIPT_REPR(GlobalVarNode, ReprPrintIR); +TVM_SCRIPT_REPR(DictAttrsNode, ReprPrintIR); +TVM_SCRIPT_REPR(RelayRefTypeNode, ReprPrintIR); +TVM_SCRIPT_REPR(FuncTypeNode, ReprPrintIR); +TVM_SCRIPT_REPR(IncompleteTypeNode, ReprPrintIR); +TVM_SCRIPT_REPR(IRModuleNode, ReprPrintIRModule); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/ir/script_method.cc b/src/script/printer/ir/script_method.cc new file mode 100644 index 000000000000..01d3ede7ea6c --- /dev/null +++ b/src/script/printer/ir/script_method.cc @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include + +#include "./utils.h" + +namespace tvm { + +std::string IRModuleNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, + Optional path_to_underline) const { + using namespace tvm::script::printer; + return DocToPythonScript(IRDocsifier()->AsDoc(GetRef(this), ObjectPath::Root()), + indent_spaces, print_line_numbers, num_context_lines, path_to_underline); +} + +TVM_REGISTER_GLOBAL("ir.Module_Script").set_body_method(&IRModuleNode::Script); + +} // namespace tvm diff --git a/src/script/printer/ir/utils.h b/src/script/printer/ir/utils.h index 4065b895c1bb..820fe13df3c6 100644 --- a/src/script/printer/ir/utils.h +++ b/src/script/printer/ir/utils.h @@ -28,11 +28,14 @@ #include +#include "../utils.h" + namespace tvm { namespace script { namespace printer { -inline ExprDoc IR(const IRDocsifier& d) { return IdDoc("tvm")->Attr("script"); } +/*! 
\brief Creates the IR common prefix, which is by default `I` */ +inline ExprDoc IR(const String& attr) { return IdDoc(Default::Prefix("ir"))->Attr(attr); } class IRFrameNode : public FrameNode { public: @@ -54,6 +57,17 @@ class IRFrame : public Frame { TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRFrame, Frame, IRFrameNode); }; +inline void ReprPrintIR(const ObjectRef& obj, ReprPrinter* p) { + IRDocsifier d; + With f(d); + (*f)->AddDispatchToken(d, "ir"); + try { + p->stream << DocToPythonScript(Docsify(obj, d, *f)); + } catch (const Error& e) { + HandleUnsupportedFallback(e, obj, p); + } +} + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/legacy_repr.cc b/src/script/printer/legacy_repr.cc new file mode 100644 index 000000000000..f264dfee8d50 --- /dev/null +++ b/src/script/printer/legacy_repr.cc @@ -0,0 +1,1008 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include + +#include + +#include "../../support/str_escape.h" + +namespace tvm { + +#define TVM_LEGACY_REPR_PRINTER_DEF_OP(Type) \ + ReprLegacyPrinter& operator<<(ReprLegacyPrinter& p, Type value) { \ + p.Stream() << value; \ + return p; \ + } + +TVM_LEGACY_REPR_PRINTER_DEF_OP(int); +TVM_LEGACY_REPR_PRINTER_DEF_OP(int64_t); +TVM_LEGACY_REPR_PRINTER_DEF_OP(float); +TVM_LEGACY_REPR_PRINTER_DEF_OP(double); +TVM_LEGACY_REPR_PRINTER_DEF_OP(char); +TVM_LEGACY_REPR_PRINTER_DEF_OP(const char*); +TVM_LEGACY_REPR_PRINTER_DEF_OP(const std::string&); +TVM_LEGACY_REPR_PRINTER_DEF_OP(runtime::DataType); +TVM_LEGACY_REPR_PRINTER_DEF_OP(const void*); +TVM_LEGACY_REPR_PRINTER_DEF_OP(const String&); + +std::ostream& ReprLegacyPrinter::Stream() const { return stream; } + +ReprLegacyPrinter& operator<<(ReprLegacyPrinter& p, const ObjectRef& value) { + p.Stream() << AsLegacyRepr(value); + return p; +} + +ReprLegacyPrinter& operator<<(ReprLegacyPrinter& out, tir::ForKind type) { // NOLINT(*) + using tvm::tir::ForKind; + switch (type) { + case ForKind::kSerial: + out << "for"; + break; + case ForKind::kParallel: + out << "parallel"; + break; + case ForKind::kUnrolled: + out << "unrolled"; + break; + case ForKind::kVectorized: + out << "vectorized"; + break; + case ForKind::kThreadBinding: + out << "launch_thread"; + break; + } + return out; +} + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '['; + for (size_t i = 0; i < op->size(); ++i) { + if (i != 0) { + (*p) << ", "; + } + p->Print(op->at(i)); + } + (*p) << ']'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '{'; + for 
(auto it = op->begin(); it != op->end(); ++it) { + if (it != op->begin()) { + (*p) << ", "; + } + if (it->first->IsInstance()) { + (*p) << '\"' << Downcast(it->first) << "\": "; + } else { + p->Print(it->first); + (*p) << ": "; + } + p->Print(it->second); + } + (*p) << '}'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '['; + for (size_t i = 0; i < op->size; ++i) { + if (i != 0) { + (*p) << ", "; + } + (*p) << op->data[i]; + } + (*p) << ']'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + if (op->dtype == DataType::Int(32)) { + (*p) << op->value; + } else { + (*p) << "(" << op->dtype << ")" << op->value; + } + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + switch (op->dtype.bits()) { + case 64: + (*p) << op->value; + break; + case 32: + (*p) << op->value << 'f'; + break; + case 16: + (*p) << op->value << 'h'; + break; + default: + LOG(FATAL) << "Unknown float type bits=" << op->dtype.bits(); + } + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "range(min=" << op->min << ", ext=" << op->extent << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << node->dtype; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + if (!node->storage_scope.empty()) { + (*p) << node->storage_scope << " "; + } + p->Print(node->element_type); + (*p) << '*'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "TupleTypeNode(" << node->fields << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "IncompleteTypeNode(" << node->kind << ", " << node << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->dict; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "GlobalVar(" << node->name_hint << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "IRModule(" << node->functions << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "TypeVar(" << node->name_hint << ", " << node->kind << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "GlobalTypeVar(" << node->name_hint << ", " << node->kind << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + 
.set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "FuncType(" << node->type_params << ", " << node->arg_types << ", " << node->ret_type + << ", " << node->type_constraints << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + auto* node = static_cast(ref.get()); + (*p) << "RelayRefTypeNode(" << node->value << ")"; + }); + +} // namespace tvm + +namespace tvm { +namespace tir { + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "buffer(" << op->name << ", " << op << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + // omit the type + // stream << op->name << "." << op->type; + (*p) << op->name_hint; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "{" << op->name_hint << "|" << op->name_hint << ">=0}"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "iter_var("; + if (op->var->name_hint.length() != 0) { + (*p) << op->var->name_hint << ", "; + } + if (op->dom.defined()) { + (*p) << op->dom; + } + if (op->thread_tag.length() != 0) { + (*p) << ", " << op->thread_tag; + } + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '\"' << support::StrEscape(op->value) << '\"'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->dtype << '('; + p->Print(op->value); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " + "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " - "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << "*"; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << "/"; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " % "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "floordiv(" << op->a << ", " << op->b << ")"; + }); + 
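// Illustrative sketch only, not part of the patch: every registration above
// follows the same vtable pattern, so a downstream node type could opt into
// the legacy repr identically. `MyCustomNode` and its `name` field are
// hypothetical placeholders.
TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable)
    .set_dispatch<MyCustomNode>([](const ObjectRef& node, ReprLegacyPrinter* p) {
      auto* op = static_cast<const MyCustomNode*>(node.get());
      // Free-form text output, exactly like the per-node cases above.
      (*p) << "my_custom(" << op->name << ")";
    });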
+TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "floormod(" << op->a << ", " << op->b << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "min("; + p->Print(op->a); + (*p) << ", "; + p->Print(op->b); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "max("; + p->Print(op->a); + (*p) << ", "; + p->Print(op->b); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " == "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " != "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " < "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " <= "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " > "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " >= "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " && "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '('; + p->Print(op->a); + (*p) << " || "; + p->Print(op->b); + (*p) << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << '!'; + p->Print(op->a); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "select("; + p->Print(op->condition); + (*p) << ", "; + p->Print(op->true_value); + (*p) << ", "; + p->Print(op->false_value); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->buffer_var << "["; + p->Print(op->index); + (*p) << "]"; + if (!is_one(op->predicate)) { + (*p) << " if "; + p->Print(op->predicate); + } + }); + 
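// Illustrative sketch only, not part of the patch: the scalar dispatches above
// compose recursively through ReprLegacyPrinter::Print, so nested expressions
// inherit each node's parenthesization. Assuming AsLegacyRepr is the public
// entry point registered in repr_printer.cc:
inline std::string DemoLegacyExprRepr() {
  tir::Var x("x", DataType::Int(32));
  // Built from the per-node fragments above; Add pads with spaces, Mul does
  // not, so the result is "max((x + 1), (x*2))".
  return AsLegacyRepr(max(x + 1, x * 2));
}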
+TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "ramp("; + p->Print(op->base); + (*p) << ", "; + p->Print(op->stride); + (*p) << ", " << op->lanes << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "x" << op->lanes << "("; + p->Print(op->value); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "(let " << op->var << " = "; + p->Print(op->value); + (*p) << " in "; + p->Print(op->body); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + if (auto* ptr_op = op->op.as()) { + (*p) << ptr_op->name << "("; + } else { + auto* ptr_gvar = op->op.as(); + ICHECK(ptr_gvar != nullptr); + (*p) << "@" << ptr_gvar->name_hint << "("; + } + for (size_t i = 0; i < op->args.size(); ++i) { + p->Print(op->args[i]); + if (i < op->args.size() - 1) { + (*p) << ", "; + } + } + (*p) << ")"; + }); + +template +void PrintList(const Array& exprs, ReprLegacyPrinter* p) { + for (size_t i = 0; i < exprs.size(); ++i) { + p->Print(exprs[i]); + if (i < exprs.size() - 1) { + (*p) << ", "; + } + } +} + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "shuffle("; + PrintList(op->vectors, p); + (*p) << ", "; + PrintList(op->indices, p); + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "comm_reducer(result=" << op->result << ", lhs=" << op->lhs << ", rhs=" << op->rhs + << ", identity_element=" << op->identity_element << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << "reduce(combiner=" << op->combiner; + (*p) << ", source=" << op->source; + (*p) << ", init=" << op->init; + (*p) << ", axis=" << op->axis; + (*p) << ", where=" << op->condition; + (*p) << ", value_index=" << op->value_index; + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { (*p) << "?"; }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->buffer->name << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + p->Print(op->indices[i]); + if (i < op->indices.size() - 1) { + (*p) << ", "; + } + } + (*p) << "]"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->producer->GetNameHint() << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + p->Print(op->indices[i]); + if (i < op->indices.size() - 1) { + (*p) << ", "; + } + } + (*p) << "]"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { + // TODO(tvm-team) redirect to Text printer once we have a good text format. 
+ auto* node = static_cast(ref.get()); + (*p) << "PrimFunc(" << node->params << ") "; + if (node->attrs.defined()) { + (*p) << "attrs=" << node->attrs; + } + (*p) << " {\n"; + p->indent += 2; + p->Print(node->body); + p->indent -= 2; + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "let " << op->var << " = "; + p->Print(op->value); + (*p) << '\n'; + p->Print(op->body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "// attr ["; + p->Print(op->node); + (*p) << "] " << op->attr_key << " = "; + p->Print(op->value); + (*p) << '\n'; + p->Print(op->body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "assert("; + p->Print(op->condition); + (*p) << ", "; + p->Print(op->message); + (*p) << ")\n"; + p->Print(op->body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << op->kind << " (" << op->loop_var << ", "; + p->Print(op->min); + (*p) << ", "; + p->Print(op->extent); + (*p) << ") {\n"; + + p->indent += 2; + p->Print(op->body); + p->indent -= 2; + + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "while(" << op->condition << ") {\n"; + p->indent += 2; + p->Print(op->body); + p->indent -= 2; + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << op->buffer_var << "["; + p->Print(op->index); + (*p) << "] = "; + p->Print(op->value); + if (!is_one(op->predicate)) { + (*p) << " if "; + p->Print(op->predicate); + } + (*p) << '\n'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << op->producer->GetNameHint() << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + p->Print(op->indices[i]); + if (i < op->indices.size() - 1) (*p) << ", "; + } + (*p) << "]"; + (*p) << " ="; + p->Print(op->value); + (*p) << '\n'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + const auto* ptr_type = op->buffer_var->type_annotation.as(); + ICHECK(ptr_type) << "The provided variable is not of pointer type"; + p->PrintIndent(); + (*p) << "allocate " << op->buffer_var << "[" << op->dtype; + for (size_t i = 0; i < op->extents.size(); ++i) { + (*p) << " * "; + p->Print(op->extents[i]); + } + (*p) << "], storage_scope = " << ptr_type->storage_scope; + if (!is_one(op->condition)) { + (*p) << " if "; + p->Print(op->condition); + } + (*p) << "\n"; + p->Print(op->body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + 
(*p) << "constant " << op->buffer_var << "[" << op->dtype; + for (size_t i = 0; i < op->extents.size(); ++i) { + (*p) << " * "; + p->Print(op->extents[i]); + } + (*p) << "]"; + (*p) << "\n"; + p->Print(op->body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "decl_buffer " << op->buffer << "\n"; + (*p) << op->body; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "producer_realize " << op->producer->GetNameHint() << "("; + for (size_t i = 0; i < op->bounds.size(); ++i) { + (*p) << "["; + p->Print(op->bounds[i]->min); + (*p) << ", "; + p->Print(op->bounds[i]->extent); + (*p) << "]"; + if (i < op->bounds.size() - 1) (*p) << ", "; + } + (*p) << ")"; + if (!is_one(op->condition)) { + (*p) << " if "; + p->Print(op->condition); + } + (*p) << " {\n"; + + p->indent += 2; + p->Print(op->body); + p->indent -= 2; + + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "prefetch " << op->buffer << "("; + for (size_t i = 0; i < op->bounds.size(); ++i) { + (*p) << "["; + p->Print(op->bounds[i]->min); + (*p) << ", "; + p->Print(op->bounds[i]->extent); + (*p) << "]"; + if (i < op->bounds.size() - 1) (*p) << ", "; + } + (*p) << ")"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + for (Stmt stmt : op->seq) { + p->Print(stmt); + } + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + while (true) { + (*p) << "if (" << op->condition << ") {\n"; + p->indent += 2; + p->Print(op->then_case); + p->indent -= 2; + + if (!op->else_case) { + break; + } + + if (const IfThenElseNode* nested_if = op->else_case.as()) { + p->PrintIndent(); + (*p) << "} else "; + op = nested_if; + } else { + p->PrintIndent(); + (*p) << "} else {\n"; + p->indent += 2; + p->Print(op->else_case); + p->indent -= 2; + break; + } + } + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + p->Print(op->value); + (*p) << "\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << op->buffer->name << "["; + for (size_t i = 0; i < op->indices.size(); ++i) { + p->Print(op->indices[i]); + if (i < op->indices.size() - 1) (*p) << ", "; + } + (*p) << "]"; + (*p) << " = "; + p->Print(op->value); + (*p) << '\n'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << "buffer_realize " << op->buffer->name << "("; + for (size_t i = 0; i < op->bounds.size(); ++i) { + (*p) << "["; + p->Print(op->bounds[i]->min); + (*p) << ", "; + p->Print(op->bounds[i]->extent); + (*p) << "]"; + if (i < op->bounds.size() - 1) (*p) << ", "; + } + (*p) 
<< ")"; + if (!is_one(op->condition)) { + (*p) << " if "; + p->Print(op->condition); + } + (*p) << " {\n"; + + p->indent += 2; + p->Print(op->body); + p->indent -= 2; + + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + (*p) << op->buffer->name; + (*p) << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + const auto& range = op->region[i]; + p->Print(range->min); + if (!is_one(range->extent)) { + (*p) << ":"; + p->Print(range->min + range->extent); + } + if (i != op->region.size() - 1) (*p) << ", "; + } + (*p) << "]"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + (*p) << op->buffer->name << " = match_buffer("; + p->Print(op->source); + (*p) << ")\n"; + }); + +void PrintBlockTitle(const BlockNode* op, ReprLegacyPrinter* p) { + (*p) << "block " << op->name_hint << "("; + for (size_t i = 0; i < op->iter_vars.size(); i++) { + p->Print(op->iter_vars[i]); + if (i < op->iter_vars.size() - 1) (*p) << ", "; + } + (*p) << ")"; +} + +void PrintBlockSignature(const BlockNode* op, ReprLegacyPrinter* p) { + // print read/write regions + p->PrintIndent(); + (*p) << "reads("; + p->Print(op->reads); + (*p) << ")\n"; + p->PrintIndent(); + (*p) << "writes("; + p->Print(op->writes); + (*p) << ")\n"; + // Print alloc_buffers + for (const auto& alloc_buf : op->alloc_buffers) { + p->PrintIndent(); + (*p) << alloc_buf->name << " = alloc_buffer(" << alloc_buf->dtype << "["; + for (size_t i = 0; i < alloc_buf->shape.size(); ++i) { + if (i > 0) (*p) << ", "; + p->Print(alloc_buf->shape[i]); + } + (*p) << "])\n"; + } + // Print match_buffer_regions + for (const auto& match_buf : op->match_buffers) { + p->Print(match_buf); + } + if (!op->annotations.empty()) { + p->PrintIndent(); + (*p) << "annotations(" << op->annotations << ")\n"; + } +} + +void PrintBlockBody(const BlockNode* op, ReprLegacyPrinter* p) { + // Print init + if (op->init.defined()) { + p->PrintIndent(); + (*p) << "with init() {\n"; + p->indent += 2; + p->Print(op->init.value()); + p->indent -= 2; + p->PrintIndent(); + (*p) << "}\n"; + } + // Print body + p->Print(op->body); +} + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + PrintBlockTitle(op, p); + (*p) << " {\n"; + p->indent += 2; + + // Print block elements (e.g. reads/writes, etc) + PrintBlockSignature(op, p); + // Print block init and body + PrintBlockBody(op, p); + + p->indent -= 2; + p->PrintIndent(); + (*p) << "}\n"; + }); + +TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprLegacyPrinter* p) { + auto* op = static_cast(node.get()); + auto* block_op = op->block.get(); + p->PrintIndent(); + PrintBlockTitle(block_op, p); + (*p) << " {\n"; + p->indent += 2; + + // Print binding iter_values + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) { + p->PrintIndent(); + (*p) << "bind("; + p->Print(block_op->iter_vars[i]->var); + (*p) << ", "; + p->Print(op->iter_values[i]); + (*p) << ")\n"; + } + // Print predicate + if (!is_one(op->predicate)) { + p->PrintIndent(); + (*p) << "where("; + p->Print(op->predicate); + (*p) << ")\n"; + } + // Print block elements (e.g. 
reads/writes, etc) + PrintBlockSignature(block_op, p); + // Print block init and body + PrintBlockBody(block_op, p); + + p->indent -= 2; + p->PrintIndent(); + (*p) << "}\n"; + }); + +} // namespace tir +} // namespace tvm diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc index 8f008375ff87..e7f733864cc5 100644 --- a/src/script/printer/tir/block.cc +++ b/src/script/printer/tir/block.cc @@ -140,8 +140,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return PrintBlock(d, block, p, NullOpt, NullOpt); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_SCRIPT_REPR(tir::BlockNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BlockRealizeNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc index b9eef12abc77..5400328fe219 100644 --- a/src/script/printer/tir/buffer.cc +++ b/src/script/printer/tir/buffer.cc @@ -247,14 +247,14 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return ScopeDoc(NullOpt, prefix, (*f)->stmts); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_SCRIPT_REPR(tir::BufferRegionNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BufferLoadNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BufferStoreNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BufferNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::MatchBufferRegionNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ProducerLoadNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ProducerStoreNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ProducerRealizeNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/expr.cc b/src/script/printer/tir/expr.cc index 317201fa3d74..1f2ba97700cb 100644 --- a/src/script/printer/tir/expr.cc +++ b/src/script/printer/tir/expr.cc @@ -134,10 +134,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) Array vars; vars.reserve(n_vars + n_vars); for (int i = 0; i < n_vars; ++i) { - vars.push_back(DefineVar(r->lhs[i], *f, d)); + vars.push_back(Downcast(DefineVar(r->lhs[i], *f, d))); } for (int i = 0; i < n_vars; ++i) { - vars.push_back(DefineVar(r->rhs[i], *f, d)); + vars.push_back(Downcast(DefineVar(r->rhs[i], *f, d))); } int n_results = r->result.size(); Array results; @@ -190,7 +190,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }; ExprDoc prefix{nullptr}; if (const auto* op = call->op.as()) { - String name = op_names[GetRef(op)]; + String name = op_names.get(GetRef(op), op->name); + if (op_names.count(GetRef(op)) == 0) { + LOG(WARNING) << "No TScriptPrinterName attribute for " << op->name; + } prefix = TIR(name); } else if (const auto* gv = call->op.as()) { prefix = LiteralDoc::Str(gv->name_hint); @@ -278,39 +281,39 @@ TVM_SCRIPT_PRINTER_DEF_BINARY(Max, "max"); #undef TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR #undef TVM_SCRIPT_PRINTER_DEF_BINARY -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); 
-TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_SCRIPT_REPR(tir::VarNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::SizeVarNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::IterVarNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::StringImmNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::CastNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AddNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::SubNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::MulNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::DivNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ModNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::FloorDivNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::FloorModNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::MinNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::MaxNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::LTNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::LENode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::EQNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::NENode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::GTNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::GENode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AndNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::OrNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::NotNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::SelectNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::RampNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BroadcastNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::LetNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::CallNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ShuffleNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::CommReducerNode, 
ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AnyNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::ReduceNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::LoadNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/for_loop.cc b/src/script/printer/tir/for_loop.cc index 239b8e565f35..c8e2580f9c6f 100644 --- a/src/script/printer/tir/for_loop.cc +++ b/src/script/printer/tir/for_loop.cc @@ -62,7 +62,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return ForDoc(TupleDoc(lhs), TIR("grid")->Call(rhs), (*f)->stmts); } // Step 3. If not `T.grid`, print loop kind accordingly - IdDoc lhs = DefineVar(loop->loop_var, *f, d); + ExprDoc lhs = DefineVar(loop->loop_var, *f, d); Optional min = NullOpt; Optional max = NullOpt; Optional annotations = NullOpt; @@ -117,7 +117,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return ForDoc(lhs, rhs, (*f)->stmts); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_SCRIPT_REPR(tir::ForNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index 55e8c075deb7..f0f84e81d57c 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -68,19 +68,27 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } // Step 4. Handle `func->body` AsDocBody(func->body, p->Attr("body"), frame->get(), d); + Optional ret_type = NullOpt; + if (func->ret_type.defined()) { + const auto* as_tuple = func->ret_type.as(); + if (!as_tuple || as_tuple->fields.size()) { + ret_type = d->AsDoc(func->ret_type, p->Attr("ret_type")); + } + } return FunctionDoc( /*name=*/IdDoc(FindFunctionName(d, func)), /*args=*/args, /*decorators=*/{TIR("prim_func")}, - /*return_type=*/d->AsDoc(func->ret_type, p->Attr("ret_type")), + /*return_type=*/ret_type, /*body=*/(*frame)->stmts); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& obj, ReprPrinter* p) { - std::string res = DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root())); - p->stream << res; - }); +void ReprPrintPrimFunc(const ObjectRef& obj, ReprPrinter* p) { + std::string res = DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root())); + p->stream << res; +} + +TVM_SCRIPT_REPR(tir::PrimFuncNode, ReprPrintPrimFunc); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc index 5fea278a4444..ad00c42119f6 100644 --- a/src/script/printer/tir/ir.cc +++ b/src/script/printer/tir/ir.cc @@ -89,24 +89,18 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return TIR("Tuple")->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); }); -TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](IncompleteType ty, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("IncompleteType")->Call({}); - }); - TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Target target, ObjectPath p, IRDocsifier d) -> Doc { Map config = target->Export(); return TIR("target")->Call({d->AsDoc(config, p)}); }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); 
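// Editorial illustration: TVM_SCRIPT_REPR below is the helper macro added in
// src/script/printer/utils.h; for one node type it expands to the same
// registration the deleted lines spelled out by hand, e.g.
//
//   TVM_SCRIPT_REPR(IntImmNode, ReprPrintTIR);
//   // expands to:
//   TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch<IntImmNode>(ReprPrintTIR);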
+TVM_SCRIPT_REPR(IntImmNode, ReprPrintTIR); +TVM_SCRIPT_REPR(FloatImmNode, ReprPrintTIR); +TVM_SCRIPT_REPR(RangeNode, ReprPrintTIR); +TVM_SCRIPT_REPR(PrimTypeNode, ReprPrintTIR); +TVM_SCRIPT_REPR(PointerTypeNode, ReprPrintTIR); +TVM_SCRIPT_REPR(TupleTypeNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/script_method.cc b/src/script/printer/tir/script_method.cc new file mode 100644 index 000000000000..5cda9a9626db --- /dev/null +++ b/src/script/printer/tir/script_method.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include + +#include "./utils.h" + +namespace tvm { + +std::string PrimExprNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, + Optional path_to_underline) const { + using namespace tvm::script::printer; + IRDocsifier d; + ObjectRef obj = GetRef(this); + With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); + return DocToPythonScript(Docsify(obj, d, *f), indent_spaces, print_line_numbers, + num_context_lines, path_to_underline); +} + +namespace tir { + +std::string StmtNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, + Optional path_to_underline) const { + using namespace tvm::script::printer; + IRDocsifier d; + ObjectRef obj = GetRef(this); + With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); + return DocToPythonScript(Docsify(obj, d, *f), indent_spaces, print_line_numbers, + num_context_lines, path_to_underline); +} + +std::string PrimFuncNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, + Optional path_to_underline) const { + using namespace tvm::script::printer; + return DocToPythonScript(IRDocsifier()->AsDoc(GetRef(this), ObjectPath::Root()), + indent_spaces, print_line_numbers, num_context_lines, path_to_underline); +} + +TVM_REGISTER_GLOBAL("tir.PrimFuncScript").set_body_method(&PrimFuncNode::Script); +TVM_REGISTER_GLOBAL("tir.StmtScript").set_body_method(&StmtNode::Script); +TVM_REGISTER_GLOBAL("tir.PrimExprScript").set_body_method(&PrimExprNode::Script); + +} // namespace tir +} // namespace tvm diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index 436f2b202d85..7344cb4d98d5 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -352,19 +352,19 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) LOG(FATAL) << "ValueError: Store has been deprecated for BufferStore: " << stmt; }); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, 
vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(ReprPrint); +TVM_SCRIPT_REPR(tir::LetStmtNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AttrStmtNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AssertStmtNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::WhileNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AllocateNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::AllocateConstNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::DeclBufferNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::PrefetchNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::SeqStmtNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::IfThenElseNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::EvaluateNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::BufferRealizeNode, ReprPrintTIR); +TVM_SCRIPT_REPR(tir::StoreNode, ReprPrintTIR); } // namespace printer } // namespace script diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h index 7f67c3a11c73..047513dcb316 100644 --- a/src/script/printer/tir/utils.h +++ b/src/script/printer/tir/utils.h @@ -33,6 +33,8 @@ #include #include +#include "../utils.h" + namespace tvm { namespace script { namespace printer { @@ -81,7 +83,10 @@ inline ExprDoc TIR(const String& attr) { return IdDoc(Default::Prefix("tir"))->A * \param frame The frame to define the variable in * \return The IdDoc corresponding to the variable */ -inline IdDoc DefineVar(const tir::Var& var, const Frame& frame, const IRDocsifier& d) { +inline ExprDoc DefineVar(const tir::Var& var, const Frame& frame, const IRDocsifier& d) { + if (Optional doc = d->GetVarDoc(var)) { + return doc.value(); + } return d->Define(var, frame, var->name_hint.empty() ? "v" : var->name_hint); } @@ -181,26 +186,14 @@ inline TIRFrame MakeDispatchFrame(const IRDocsifier& d, const ObjectRef& root, } /*! \brief Redirected method for the ReprPrinter */ -inline void ReprPrint(const ObjectRef& stmt, ReprPrinter* p) { +inline void ReprPrintTIR(const ObjectRef& obj, ReprPrinter* p) { IRDocsifier d; - With f(MakeDispatchFrame(d, stmt, ObjectRef(nullptr))); - Doc doc = d->AsDoc(stmt, ObjectPath::Root()); - if (const auto* expr_doc = doc.as()) { - if (!Default::VerboseExpr()) { - (*f)->stmts.clear(); - } - (*f)->stmts.push_back(ExprStmtDoc(GetRef(expr_doc))); - } else if (const auto* stmt_doc = doc.as()) { - (*f)->stmts.push_back(GetRef(stmt_doc)); - } else if (const auto* stmt_block = doc.as()) { - for (const StmtDoc& d : stmt_block->stmts) { - (*f)->stmts.push_back(d); - } - } else { - LOG(FATAL) << "TypeError: Unexpected doc type: " << doc->GetTypeKey(); + With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); + try { + p->stream << DocToPythonScript(Docsify(obj, d, *f)); + } catch (const tvm::Error& e) { + HandleUnsupportedFallback(e, obj, p); } - std::string res = DocToPythonScript(StmtBlockDoc((*f)->stmts)); - p->stream << res; } /*! 
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h new file mode 100644 index 000000000000..9f9a7d8299c4 --- /dev/null +++ b/src/script/printer/utils.h @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_PRINTER_UTILS_H_ +#define TVM_SCRIPT_PRINTER_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace tvm { +namespace script { +namespace printer { + +#define TVM_SCRIPT_REPR(ObjectType, Method) \ + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(Method); + +inline StmtBlockDoc Docsify(const ObjectRef& obj, const IRDocsifier& d, const Frame& f) { + Doc doc = d->AsDoc(obj, ObjectPath::Root()); + if (const auto* expr_doc = doc.as()) { + if (!Default::VerboseExpr()) { + f->stmts.clear(); + } + f->stmts.push_back(ExprStmtDoc(GetRef(expr_doc))); + } else if (const auto* stmt_doc = doc.as()) { + f->stmts.push_back(GetRef(stmt_doc)); + } else if (const auto* stmt_block = doc.as()) { + for (const StmtDoc& d : stmt_block->stmts) { + f->stmts.push_back(d); + } + } else { + LOG(FATAL) << "TypeError: Unexpected doc type: " << doc->GetTypeKey(); + } + return StmtBlockDoc(f->stmts); +} + +inline void HandleUnsupportedFallback(const tvm::Error& error, const ObjectRef& obj, + ReprPrinter* p) { + LOG(WARNING) << "TVMScript printer falls back to the legacy ReprPrinter with the error:\n" + << error.what(); + p->stream << AsLegacyRepr(obj); +} + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_UTILS_H_ diff --git a/src/target/source/interface_c.cc b/src/target/source/interface_c.cc index fe495b212ad8..d2d1d3f78d74 100644 --- a/src/target/source/interface_c.cc +++ b/src/target/source/interface_c.cc @@ -218,8 +218,7 @@ class InterfaceCNode : public runtime::ModuleNode { code_ << '\n'; } else { - LOG(FATAL) << "No constant data in constant pool found " - << PrettyPrint(GetRef(pool_info)); + LOG(FATAL) << "No constant data in constant pool found " << GetRef(pool_info); } } diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index ce5f5d5b5357..ccc15fc1ee49 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -329,8 +329,7 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { code_ << "};"; code_ << "// of total size " << allocated_size << " bytes\n"; } else { - LOG(FATAL) << "No constant data in constant pool found " - << PrettyPrint(GetRef(pool_info)); + LOG(FATAL) << "No constant data in constant pool found " << GetRef(pool_info); } } diff --git a/src/tir/analysis/control_flow_graph.cc b/src/tir/analysis/control_flow_graph.cc index 2e537450d232..de9da80140e4 100644 --- 
a/src/tir/analysis/control_flow_graph.cc +++ b/src/tir/analysis/control_flow_graph.cc @@ -25,6 +25,7 @@ #include "control_flow_graph.h" #include +#include #include #include #include @@ -1623,8 +1624,8 @@ bool ControlFlowGraph::IsOverwrittenWithoutEffect(const tir::BufferStore& store, } auto it = control_flow_lookup_.find(context.get()); - ICHECK(it != control_flow_lookup_.end()) - << "Context " << PrettyPrint(context) << " did not occur within analyzed statement"; + ICHECK(it != control_flow_lookup_.end()) << "Context did not occur within analyzed statement:\n" + << context; const auto& context_block = control_flow_[it->second]; auto [store_touch, free_params] = context_block.MakeBufferTouch( diff --git a/src/tir/analysis/oob_checker.cc b/src/tir/analysis/oob_checker.cc index a3d3501a9aae..dbe114df4973 100644 --- a/src/tir/analysis/oob_checker.cc +++ b/src/tir/analysis/oob_checker.cc @@ -24,7 +24,6 @@ #include #include "../../arith/ir_visitor_with_analyzer.h" -#include "../../printer/text_printer.h" #include "../schedule/error.h" namespace tvm { diff --git a/src/tir/analysis/verify_memory.cc b/src/tir/analysis/verify_memory.cc index 80d6897011d5..9d932d236355 100644 --- a/src/tir/analysis/verify_memory.cc +++ b/src/tir/analysis/verify_memory.cc @@ -182,7 +182,7 @@ std::vector VerifyMemory_(const PrimFunc& func) { VLOG(1) << "verifying memory for target '" << target.value()->str() << "' for primitive:" << std::endl - << PrettyPrint(func); + << func; if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == CallingConv::kDefault) { diff --git a/src/tir/ir/legacy_printer.cc b/src/tir/ir/legacy_printer.cc deleted file mode 100644 index 4c2fd5037b65..000000000000 --- a/src/tir/ir/legacy_printer.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include -#include -#include -#include - -#include - -#include "../../support/str_escape.h" - -namespace tvm { -namespace tir { - -std::string LegacyTIRPrint(const ObjectRef& obj) { - using namespace tvm::tir; - class LegacyTIRPrinter : private tir::ExprVisitor { - public: - explicit LegacyTIRPrinter(std::ostream& os) : stream(os) {} - - void Print(const ObjectRef& obj) { - if (const auto* op = obj.as()) { - Print_(op); - } else if (const auto* op = obj.as()) { - Print_(op); - } else if (const auto* op = obj.as()) { - Print_(op); - } else if (const auto* op = obj.as()) { - Print_(op); - } else { - VisitExpr(Downcast(obj)); - } - } - - private: - void VisitExpr_(const VarNode* op) final { stream << op->name_hint; } - - void VisitExpr_(const SizeVarNode* op) final { - stream << "{" << op->name_hint << "|" << op->name_hint << ">=0}"; - } - - void VisitExpr_(const IntImmNode* op) final { - if (op->dtype == DataType::Int(32)) { - stream << op->value; - } else { - stream << "(" << op->dtype << ")" << op->value; - } - } - - void VisitExpr_(const FloatImmNode* op) final { - switch (op->dtype.bits()) { - case 64: - stream << op->value; - break; - case 32: - stream << op->value << 'f'; - break; - case 16: - stream << op->value << 'h'; - break; - default: - LOG(FATAL) << "Unknown float type bits=" << op->dtype.bits(); - } - } - void VisitExpr_(const StringImmNode* op) final { - stream << '\"' << support::StrEscape(op->value) << '\"'; - } - void VisitExpr_(const CastNode* op) final { - stream << op->dtype << '('; - VisitExpr(op->value); - stream << ')'; - } - void VisitExpr_(const AddNode* op) final { PrintBinary(op->a, op->b, " + "); } - void VisitExpr_(const SubNode* op) final { PrintBinary(op->a, op->b, " - "); } - void VisitExpr_(const MulNode* op) final { PrintBinary(op->a, op->b, "*"); } - void VisitExpr_(const DivNode* op) final { PrintBinary(op->a, op->b, "/"); } - void VisitExpr_(const ModNode* op) final { PrintBinary(op->a, op->b, " % "); } - void VisitExpr_(const FloorDivNode* op) final { PrintCall("floordiv", op->a, op->b); } - void VisitExpr_(const FloorModNode* op) final { PrintCall("floormod", op->a, op->b); } - void VisitExpr_(const MinNode* op) final { PrintCall("min", op->a, op->b); } - void VisitExpr_(const MaxNode* op) final { PrintCall("max", op->a, op->b); } - void VisitExpr_(const EQNode* op) final { PrintBinary(op->a, op->b, " == "); } - void VisitExpr_(const NENode* op) final { PrintBinary(op->a, op->b, " != "); } - void VisitExpr_(const LTNode* op) final { PrintBinary(op->a, op->b, " < "); } - void VisitExpr_(const LENode* op) final { PrintBinary(op->a, op->b, " <= "); } - void VisitExpr_(const GTNode* op) final { PrintBinary(op->a, op->b, " > "); } - void VisitExpr_(const GENode* op) final { PrintBinary(op->a, op->b, " >= "); } - void VisitExpr_(const AndNode* op) final { PrintBinary(op->a, op->b, " && "); } - void VisitExpr_(const OrNode* op) final { PrintBinary(op->a, op->b, " || "); } - - void VisitExpr_(const NotNode* op) final { - stream << "!"; - VisitExpr(op->a); - } - - void VisitExpr_(const SelectNode* op) final { - stream << "select("; - VisitExpr(op->condition); - stream << ", "; - VisitExpr(op->true_value); - stream << ", "; - VisitExpr(op->false_value); - stream << ')'; - } - - void VisitExpr_(const RampNode* op) final { - stream << "ramp("; - VisitExpr(op->base); - stream << ", "; - VisitExpr(op->stride); - stream << ", " << op->lanes << ')'; - } - - void VisitExpr_(const BroadcastNode* op) final { - stream << "x" << op->lanes << "("; - 
VisitExpr(op->value); - stream << ")"; - } - - void VisitExpr_(const LetNode* op) final { - stream << "(let " << op->var << " = "; - VisitExpr(op->value); - stream << " in "; - VisitExpr(op->body); - stream << ")"; - } - - void VisitExpr_(const CallNode* op) final { - if (auto* ptr_op = op->op.as()) { - stream << ptr_op->name << "("; - } else { - auto* p = op->op.as(); - ICHECK(p != nullptr); - stream << "@" << p->name_hint << "("; - } - for (size_t i = 0; i < op->args.size(); ++i) { - VisitExpr(op->args[i]); - if (i < op->args.size() - 1) { - stream << ", "; - } - } - stream << ")"; - } - - void VisitExpr_(const ShuffleNode* op) final { - stream << "shuffle("; - PrintList(op->vectors.GetArrayNode()); - stream << ", "; - PrintList(op->indices.GetArrayNode()); - stream << ")"; - } - - void VisitExpr_(const ReduceNode* op) final { - stream << "reduce(combiner="; - Print_(op->combiner.get()); - stream << ", source="; - PrintList(op->source.GetArrayNode()); - stream << ", init="; - PrintList(op->init.GetArrayNode()); - stream << ", axis="; - PrintList(op->axis.GetArrayNode()); - stream << ", where="; - VisitExpr(op->condition); - stream << ", value_index=" << op->value_index; - stream << ")"; - } - - void VisitExpr_(const AnyNode* op) final { stream << "?"; } - - void VisitExpr_(const BufferLoadNode* op) final { - stream << op->buffer->name << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - VisitExpr(op->indices[i]); - if (i < op->indices.size() - 1) { - stream << ", "; - } - } - stream << "]"; - } - - void VisitExpr_(const ProducerLoadNode* op) final { - stream << op->producer->GetNameHint() << "["; - for (size_t i = 0; i < op->indices.size(); ++i) { - VisitExpr(op->indices[i]); - if (i < op->indices.size() - 1) { - stream << ", "; - } - } - stream << "]"; - } - - private: - void Print_(const CommReducerNode* op) { - stream << "comm_reducer(result="; - PrintList(op->result.GetArrayNode()); - stream << ", lhs="; - PrintList(op->lhs.GetArrayNode()); - stream << ", rhs="; - PrintList(op->rhs.GetArrayNode()); - stream << ", identity_element="; - PrintList(op->identity_element.GetArrayNode()); - stream << ")"; - } - - void Print_(const IterVarNode* op) { - stream << "{" << op->var->name_hint << "|" << op->var->name_hint << " in ["; - VisitExpr(op->dom->min); - stream << ", "; - VisitExpr(op->dom->extent); - stream << ")}"; - } - - void Print_(const RangeNode* op) { - stream << "range(min=" << op->min << ", ext=" << op->extent << ')'; - } - - void Print_(const OpNode* op) { stream << "Op(" << op->name << ")"; } - - private: - void PrintBinary(const PrimExpr& a, const PrimExpr& b, const std::string& sign) { - stream << '('; - VisitExpr(a); - stream << sign; - VisitExpr(b); - stream << ')'; - } - - void PrintCall(const std::string& call, const PrimExpr& a, const PrimExpr& b) { - stream << call << '('; - VisitExpr(a); - stream << ", "; - VisitExpr(b); - stream << ')'; - } - - void PrintList(const ArrayNode* exprs) { - int n = static_cast(exprs->size()); - for (int i = 0; i < n; ++i) { - VisitExpr(Downcast(exprs->at(i))); - if (i < n - 1) { - stream << ", "; - } - } - } - - std::ostream& stream; - }; - std::ostringstream os; - LegacyTIRPrinter(os).Print(obj); - return os.str(); -} - -} // namespace tir -} // namespace tvm diff --git a/src/tir/schedule/analysis/verify.cc b/src/tir/schedule/analysis/verify.cc index e9ee7227f6fb..ef45f7f8c701 100644 --- a/src/tir/schedule/analysis/verify.cc +++ b/src/tir/schedule/analysis/verify.cc @@ -234,7 +234,7 @@ void VerifyCachedFlags(const ScheduleState& 
self) { os << std::endl; } LOG(FATAL) << "Schedule verification failed. The IR is:\n" - << AsTVMScript(self->mod) << "\nThe errors are:\n" + << self->mod << "\nThe errors are:\n" << os.str(); throw; } diff --git a/src/tir/schedule/error.cc b/src/tir/schedule/error.cc index 32e5c2455a85..55d751c3311e 100644 --- a/src/tir/schedule/error.cc +++ b/src/tir/schedule/error.cc @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. */ +#include "../../printer/text_printer.h" #include "./utils.h" namespace tvm { diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc index e4771c8b19f6..d21149437f08 100644 --- a/src/tir/schedule/primitive/compute_inline.cc +++ b/src/tir/schedule/primitive/compute_inline.cc @@ -225,11 +225,11 @@ class ProducerHasNonTrivialPredicateError : public ScheduleError { } String DetailRenderTemplate() const final { - return "ScheduleError: The producer block {0} has a non-trivial predicate " + - PrettyPrint(producer_->predicate) + - " that cannot be implied " - "by the synthesized predicate " + - PrettyPrint(new_predicate_) + " of the new inlined block."; + std::ostringstream os; + os << "ScheduleError: The producer block {0} has a non-trivial predicate " + << producer_->predicate << " that cannot be implied by the synthesized predicate " + << new_predicate_ << " of the new inlined block."; + return os.str(); } IRModule mod() const final { return mod_; } diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index a9b367c4b7d9..6aff85da720d 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -17,6 +17,8 @@ * under the License. 
*/ +#include + #include #include @@ -1266,7 +1268,7 @@ class OpaqueNewIterTypeError : public ScheduleError { String DetailRenderTemplate() const final { std::ostringstream os; - os << "Cannot detect the block iter type for new iter value " << PrettyPrint(iter_value_) + os << "Cannot detect the block iter type for new iter value " << iter_value_ << " in {0} because it contains more than one type of original iter vars."; return os.str(); } diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h index bcc8b7facbc9..d40906209fb9 100644 --- a/src/tir/schedule/utils.h +++ b/src/tir/schedule/utils.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include "../../arith/pattern_match.h" #include "../../node/attr_registry.h" -#include "../../printer/text_printer.h" #include "../../runtime/thread_storage_scope.h" #include "../../support/array.h" #include "../../support/nd_int_set.h" diff --git a/src/tir/transforms/common_subexpr_elim.cc b/src/tir/transforms/common_subexpr_elim.cc index 5cf6f231dd80..acda9220b731 100644 --- a/src/tir/transforms/common_subexpr_elim.cc +++ b/src/tir/transforms/common_subexpr_elim.cc @@ -151,8 +151,8 @@ bool CommonSubexpressionEliminator::OrderOnExprAndFrequency(std::pair> SyntacticToSemanticComputations( [](std::pair a, std::pair b) { std::stringstream a_stream; std::stringstream b_stream; - a_stream << LegacyTIRPrint(a.first); - b_stream << LegacyTIRPrint(b.first); + a_stream << AsLegacyRepr(a.first); + b_stream << AsLegacyRepr(b.first); return a_stream.str().compare(b_stream.str()) < 0; }); diff --git a/src/tir/transforms/install_debug_spans.cc b/src/tir/transforms/install_debug_spans.cc index 4daa1aafe8cc..bc9002ee841f 100644 --- a/src/tir/transforms/install_debug_spans.cc +++ b/src/tir/transforms/install_debug_spans.cc @@ -23,7 +23,7 @@ the location to which the ops would be printed */ -#include "install_debug_spans.h" +#include "./install_debug_spans.h" #include diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc index e1dc2f5bf113..e9c57eb78e26 100644 --- a/src/tir/transforms/narrow_datatype.cc +++ b/src/tir/transforms/narrow_datatype.cc @@ -30,7 +30,6 @@ #include "../../arith/ir_mutator_with_analyzer.h" #include "../../arith/ir_visitor_with_analyzer.h" -#include "../../printer/text_printer.h" namespace tvm { namespace tir { diff --git a/src/tir/usmp/transform/assign_pool_info.cc b/src/tir/usmp/transform/assign_pool_info.cc index 2bded7b4877b..3acceab6e31b 100644 --- a/src/tir/usmp/transform/assign_pool_info.cc +++ b/src/tir/usmp/transform/assign_pool_info.cc @@ -99,7 +99,7 @@ class PoolInfoAssigner : public StmtExprMutator { }; WorkspacePoolInfo PoolInfoAssigner::CreateDefaultWorkspaceMemoryPool(const tvm::IRModule& module) { - VLOG(1) << "Creating default memory pool for:" << std::endl << PrettyPrint(module); + VLOG(1) << "Creating default memory pool for:" << std::endl << module; Map target_access; tir::PrimFunc tir_main_func = Downcast(module->Lookup(::tvm::runtime::symbol::tvm_module_main)); @@ -134,7 +134,7 @@ Stmt PoolInfoAssigner::VisitStmt_(const AllocateNode* op) { Map annotations = Map(op->annotations); if (op->annotations.find(kPoolCandidatesAllocateAttr) == op->annotations.end()) { ICHECK(target_pool_infos_.count(tgt.value()->str()) > 0) - << "Target " << PrettyPrint(tgt) << " not found among " << PrettyPrint(target_pool_infos_); + << "Target " << tgt << " not found among " << target_pool_infos_; annotations.Set(kPoolCandidatesAllocateAttr, 
target_pool_infos_[tgt.value()->str()]); } Stmt body = VisitStmt(op->body); diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index a70e091b2cee..0728840ee96b 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -14,20 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import pytest import numpy as np +import pytest pytest.importorskip("ethosu.vela") import tvm from tvm import relay -from tvm.script import tir as T -from tvm.relay.testing import run_opt_pass -from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir -from tvm.relay.backend.contrib.ethosu.tir.scheduler import OperatorCompute -from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator +from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir +from tvm.relay.backend.contrib.ethosu.tir.scheduler import ( + OperatorCompute, + copy_constants, +) +from tvm.relay.testing import run_opt_pass +from tvm.script import tir as T -from .infra import make_ethosu_conv2d, make_ethosu_binary_elementwise +from .infra import make_ethosu_binary_elementwise, make_ethosu_conv2d # fmt: off @@ -140,7 +142,7 @@ def _get_func(): with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}): func = _get_func() mod, consts = _lower_to_tir(func, cascader=_planner) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) @@ -242,7 +244,7 @@ def _get_func(): with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}): func = _get_func() mod, consts = _lower_to_tir(func, cascader=_cascader) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) @@ -340,7 +342,7 @@ def _get_func(): func = _get_func() mod, consts = _lower_to_tir(func) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) @@ -474,7 +476,7 @@ def _get_func(): func = _get_func() mod, consts = _lower_to_tir(func, cascader=_planner) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) diff --git a/tests/python/contrib/test_ethosu/test_outline_compiler_functions.py b/tests/python/contrib/test_ethosu/test_outline_compiler_functions.py index 91458f60e172..062637b3bb94 100644 --- a/tests/python/contrib/test_ethosu/test_outline_compiler_functions.py +++ b/tests/python/contrib/test_ethosu/test_outline_compiler_functions.py @@ -81,6 +81,6 @@ def expected(): exp = expected() global_vars = [str(gv) for gv in after.get_global_vars()] - assert "@ext_func" in global_vars - assert "@ext_func_2" not in global_vars + assert 'I.GlobalVar("ext_func")' in global_vars + assert 'I.GlobalVar("ext_func_2")' not in global_vars assert tvm.ir.structural_equal(after["ext_func"], exp["ext_func"]) diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py 
b/tests/python/contrib/test_ethosu/test_remove_concatenates.py index 4b4ba52b86f6..b8ce7f0d60c9 100644 --- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py +++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py @@ -19,10 +19,11 @@ pytest.importorskip("ethosu.vela") import tvm import tvm.script -from tvm.script import tir as T from tvm import relay -from tvm.relay.testing import run_opt_pass from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir +from tvm.relay.testing import run_opt_pass +from tvm.script import tir as T + from .infra import make_ethosu_conv2d @@ -73,7 +74,7 @@ def _get_func(): func = _get_func() mod, _ = _lower_to_tir(func) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) reference_mod = ReferenceModule diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py index 649f2a611d50..bdc0447bc718 100644 --- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -18,11 +18,12 @@ pytest.importorskip("ethosu.vela") import tvm -from tvm.script import tir as T from tvm import relay -from tvm.relay.testing import run_opt_pass from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir from tvm.relay.backend.contrib.ethosu.tir.scheduler import total_cascader +from tvm.relay.testing import run_opt_pass +from tvm.script import tir as T + from .infra import make_ethosu_conv2d @@ -634,7 +635,7 @@ def _get_func( params = trial[1:] func = _get_func(*params[:-1]) mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1])) - script = mod.script(show_meta=True) + script = mod.script() mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) @@ -693,7 +694,7 @@ def _get_func(ifm_shape, lower, upper, ofm_channels=16): params = trial[1:] func = _get_func(*params) mod, _ = _lower_to_tir(func) - script = mod.script(show_meta=True) + script = mod.script() mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) @@ -795,7 +796,7 @@ def _get_func(ifm_shape, reshaped, ifm_layout): params = trial[1:] func = _get_func(*params) mod, _ = _lower_to_tir(func, cascader=total_cascader((1, 4, 6, 16))) - script = mod.script(show_meta=True) + script = mod.script() mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True) diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 07124c62ae8b..e23954f4cb67 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -18,11 +18,14 @@ pytest.importorskip("ethosu.vela") import tvm -from tvm.script import tir as T from tvm import relay -from tvm.relay.testing import run_opt_pass from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir -from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants, OperatorCompute +from tvm.relay.backend.contrib.ethosu.tir.scheduler import ( + OperatorCompute, + copy_constants, +) +from tvm.relay.testing import run_opt_pass +from tvm.script import tir as T from .infra import make_ethosu_conv2d @@ -65,7 +68,7 @@ def _get_func(): func = _get_func() mod, _ = _lower_to_tir(func, cascader=copy_constants()) - script = mod.script(show_meta=True) + script = mod.script() 
test_mod = tvm.script.from_source(script) reference_mod = ReferenceModule tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) @@ -125,7 +128,7 @@ def _get_func(): func = _get_func() mod, _ = _lower_to_tir(func, cascader=_cascader) - script = mod.script(show_meta=True) + script = mod.script() test_mod = tvm.script.from_source(script) reference_mod = WeightStream tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 9e39821fd317..6b3da0fd06d5 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. -import numpy as np -import pytest import itertools import logging from typing import Tuple +import numpy as np +import pytest + try: # See issue #9362. import torch @@ -28,13 +29,12 @@ pass import tvm -import tvm.testing import tvm.relay.testing - +import tvm.testing from tvm import relay +from tvm.contrib.download import download from tvm.relay import Any, GlobalVar from tvm.relay.expr_functor import ExprVisitor -from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt SUPPORTED_DTYPES = ["float16", "float32"] @@ -615,7 +615,7 @@ def __init__(self, op_list): def visit_call(self, call): if isinstance(call.op, tvm.tir.op.Op): - if str(call.op) in self.op_list: + if str(call.op.name) in self.op_list: self.on_graph = True return super().visit_call(call) diff --git a/tests/python/contrib/test_uma/test_partition.py b/tests/python/contrib/test_uma/test_partition.py index ec2107f881bc..d02903610933 100644 --- a/tests/python/contrib/test_uma/test_partition.py +++ b/tests/python/contrib/test_uma/test_partition.py @@ -16,15 +16,12 @@ # under the License. 
import pytest - import tvm import tvm.relay as relay - +from tvm.relay.backend.contrib.uma import uma_available from tvm.relay.backend.contrib.uma.api import UMAPartitioner from tvm.relay.op.contrib.register import get_pattern_table -from tvm.relay.testing import resnet, mlp -from tvm.relay.backend.contrib.uma import uma_available - +from tvm.relay.testing import mlp, resnet pytestmark = pytest.mark.skipif(not uma_available(), reason="UMA not available") diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py index 1fae75f23eae..e9fbe12e9754 100644 --- a/tests/python/frontend/pytorch/qnn_test.py +++ b/tests/python/frontend/pytorch/qnn_test.py @@ -17,27 +17,24 @@ """ Tests on quantized torch model conversion """ import os -from PIL import Image - import numpy as np - import torch +import tvm +import tvm.testing +from PIL import Image from torch import nn from torch.quantization import ( - QuantStub, DeQuantStub, - fuse_modules, + QuantStub, QuantWrapper, - prepare_qat, + fuse_modules, get_default_qat_qconfig, + prepare_qat, ) - -import tvm -import tvm.testing from tvm import relay -from tvm.relay.frontend.pytorch_utils import is_version_greater_than from tvm.contrib.download import download_testdata -from tvm.relay.op.contrib.register import register_pattern_table, get_pattern_table +from tvm.relay.frontend.pytorch_utils import is_version_greater_than +from tvm.relay.op.contrib.register import get_pattern_table, register_pattern_table def torch_version_check(): @@ -66,8 +63,10 @@ def get_tvm_runtime(script_module, input_name, ishape, keep_quantized_weight=Fal def get_qconfig(per_channel): - from torch.quantization.observer import MovingAverageMinMaxObserver - from torch.quantization.observer import default_weight_observer + from torch.quantization.observer import ( + MovingAverageMinMaxObserver, + default_weight_observer, + ) if per_channel: return torch.quantization.get_default_qconfig("fbgemm") @@ -396,11 +395,13 @@ def get_imagenet_input(): pt_tensor = preprocess(im) return np.expand_dims(pt_tensor.numpy(), 0) - from torchvision.models.quantization import resnet as qresnet - from torchvision.models.quantization import mobilenet as qmobilenet - from torchvision.models.quantization import inception as qinception from torchvision.models.quantization import googlenet as qgooglenet - from torchvision.models.quantization import mobilenet_v3_large as qmobilenet_v3_large + from torchvision.models.quantization import inception as qinception + from torchvision.models.quantization import mobilenet as qmobilenet + from torchvision.models.quantization import ( + mobilenet_v3_large as qmobilenet_v3_large, + ) + from torchvision.models.quantization import resnet as qresnet per_channel = True qmodels = [ @@ -596,7 +597,7 @@ def forward(self, inp): def make_qnn_add_pattern(): - from tvm.relay.dataflow_pattern import wildcard, is_op + from tvm.relay.dataflow_pattern import is_op, wildcard lhs = wildcard() rhs = wildcard() @@ -782,7 +783,7 @@ def forward(self, input): assert isinstance(output, relay.Tuple) and len(output) == 2 dq1, dq2 = output - assert str(dq1.op) == "qnn.dequantize" and str(dq2.op) == "qnn.dequantize" + assert dq1.op.name == "qnn.dequantize" and dq2.op.name == "qnn.dequantize" scale1 = dq1.args[1].data.numpy().item() scale2 = dq2.args[1].data.numpy().item() assert scale1 != scale2 diff --git a/tests/python/unittest/test_arith_deduce_bound.py b/tests/python/unittest/test_arith_deduce_bound.py index 0915df3051db..d5e0303b05b2 100644 --- 
a/tests/python/unittest/test_arith_deduce_bound.py +++ b/tests/python/unittest/test_arith_deduce_bound.py @@ -64,14 +64,14 @@ def test_deduce(): e2 = tvm.te.max(5, a * 4) < 0 res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) - assert str(res2.max_value) == "neg_inf: handle" - assert str(res2.min_value) == "pos_inf: handle" + assert str(res2.max_value) == "neg_inf" + assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs e2 = zero < tvm.te.max(5, a * 4) res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) - assert str(res2.max_value) == "neg_inf: handle" - assert str(res2.min_value) == "pos_inf: handle" + assert str(res2.max_value) == "neg_inf" + assert str(res2.min_value) == "pos_inf" e3 = (-b) + a * c - d res3 = tvm.arith.deduce_bound(a, e3 >= 0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) @@ -88,8 +88,8 @@ def test_deduce(): # Unsatisfiable `EQ`, variable as one of the Operand res5 = tvm.arith.deduce_bound(a, (a == b), {b: b_s}, {b: b_s}) - assert str(res5.max_value) == "neg_inf: handle" - assert str(res5.min_value) == "pos_inf: handle" + assert str(res5.max_value) == "neg_inf" + assert str(res5.min_value) == "pos_inf" # variable `a` on the RHS side res6 = tvm.arith.deduce_bound(a, 10 == a, {}, {}) @@ -111,15 +111,15 @@ def test_deduce(): # Unsatisfiable Mul in `EQ` e5 = 4 * a == b res9 = tvm.arith.deduce_bound(a, e5, {b: b_s}, {}) - assert str(res9.max_value) == "neg_inf: handle" - assert str(res9.min_value) == "pos_inf: handle" + assert str(res9.max_value) == "neg_inf" + assert str(res9.min_value) == "pos_inf" # Unsatisfiable Mul in `EQ` res10 = tvm.arith.deduce_bound( a, (b * a == b), {b: b_s}, {} ) # simplifier is not able to prove that (b % b == 0) - assert str(res10.max_value) == "neg_inf: handle" - assert str(res10.min_value) == "pos_inf: handle" + assert str(res10.max_value) == "neg_inf" + assert str(res10.min_value) == "pos_inf" def test_check(): diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py index 6d4dcd996475..bb9602279404 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py @@ -22,6 +22,7 @@ from tvm.meta_schedule.testing.space_generation import ( check_sketches, generate_design_space, + print_sketches, ) from tvm.script import tir as T from tvm.target import Target @@ -625,6 +626,97 @@ def cpu_conv2d_nhwc( def test_cache_read_specify_consumer(): + @T.prim_func + def cache_read_specify_consumer_0( + A: T.Buffer((512, 512), "float32"), + B: T.Buffer((512, 512), "float32"), + T_add: T.Buffer((512, 512), "float32"), + ): + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + C = T.alloc_buffer((512, 512)) + C_local = T.alloc_buffer((512, 512), scope="local") + A_shared = T.alloc_buffer((512, 512), scope="shared") + B_shared = T.alloc_buffer((512, 512), scope="shared") + for i_0_j_0_fused in T.thread_binding(2, thread="blockIdx.x"): + for i_1_j_1_fused in T.thread_binding(512, thread="vthread.x"): + for i_2_j_2_fused in T.thread_binding(16, thread="threadIdx.x"): + for k_0 in range(2): + for ax0_ax1_fused in range(131072): + with T.block("A_shared"): + v0 = T.axis.spatial(512, ax0_ax1_fused // 256) + v1 = T.axis.spatial(512, k_0 * 256 + ax0_ax1_fused % 256) + T.reads(A[v0, v1]) + T.writes(A_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 2}) + A_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in range(65536): + 
with T.block("B_shared"): + v0 = T.axis.spatial(512, k_0 * 256 + ax0_ax1_fused // 256) + v1 = T.axis.spatial(512, i_0_j_0_fused * 256 + ax0_ax1_fused % 256) + T.reads(B[v0, v1]) + T.writes(B_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 3}) + B_shared[v0, v1] = B[v0, v1] + for k_1, i_3, j_3, k_2, i_4, j_4 in T.grid(64, 1, 1, 4, 1, 16): + with T.block("C"): + v_i = T.axis.spatial( + 512, + i_1_j_1_fused // 8 * 8 + i_2_j_2_fused // 2 + i_3 + i_4, + ) + v_j = T.axis.spatial( + 512, + i_0_j_0_fused * 256 + + i_1_j_1_fused % 8 * 32 + + i_2_j_2_fused % 2 * 16 + + j_3 * 16 + + j_4, + ) + v_k = T.axis.reduce(512, k_0 * 256 + k_1 * 4 + k_2) + T.reads(A_shared[v_i, v_k], B_shared[v_k, v_j]) + T.writes(C_local[v_i, v_j]) + T.block_attr( + { + "meta_schedule.thread_extent_high_inclusive": 1024, + "meta_schedule.thread_extent_low_inclusive": 32, + "meta_schedule.tiling_structure": "SSSRRSRS", + } + ) + with T.init(): + C_local[v_i, v_j] = T.float32(0) + C_local[v_i, v_j] = ( + C_local[v_i, v_j] + A_shared[v_i, v_k] * B_shared[v_k, v_j] + ) + for ax0, ax1 in T.grid(1, 16): + with T.block("C_local"): + v0 = T.axis.spatial( + 512, + i_1_j_1_fused // 8 * 8 + i_2_j_2_fused // 2 + ax0, + ) + v1 = T.axis.spatial( + 512, + i_0_j_0_fused * 256 + + i_1_j_1_fused % 8 * 32 + + i_2_j_2_fused % 2 * 16 + + ax1, + ) + T.reads(C_local[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_local[v0, v1] + for ax0, ax1 in T.grid(512, 512): + with T.block("T_add"): + v_ax0 = T.axis.spatial(512, ax0) + v_ax1 = T.axis.spatial(512, ax1) + T.reads(C[v_ax0, v_ax1], A[v_ax0, v_ax1]) + T.writes(T_add[v_ax0, v_ax1]) + T_add[v_ax0, v_ax1] = C[v_ax0, v_ax1] + A[v_ax0, v_ax1] + + decision_0 = [ + ("SamplePerfectTile", [1, 64, 8, 1, 1]), + ("SamplePerfectTile", [2, 8, 2, 1, 16]), + ("SamplePerfectTile", [2, 64, 4]), + ("SampleCategorical", 1), + ("SampleCategorical", 2), + ] A, B, C = te_workload.matmul(512, 512, 512) mod = te.create_prim_func([A, B, C + A]) @@ -634,17 +726,12 @@ def test_cache_read_specify_consumer(): target=Target("nvidia/geforce-rtx-3080"), types=ms.schedule_rule.MultiLevelTiling, ) - - residual_block = """ - for ax0, ax1 in T.grid(512, 512): - with T.block("T_add"): - v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1]) - T.reads(C[v_ax0, v_ax1], A[v_ax0, v_ax1]) - T.writes(T_add[v_ax0, v_ax1]) - T_add[v_ax0, v_ax1] = C[v_ax0, v_ax1] + A[v_ax0, v_ax1] - """ - - assert residual_block in space[0].mod.script() + check_sketches( + mod, + sketches=space, + expected_mods=[cache_read_specify_consumer_0], + expected_decisions=[decision_0], + ) def test_max_pool_blocked(): @@ -798,4 +885,5 @@ def max_pool_blocked_compute(height, width, channel): if __name__ == "__main__": - tvm.testing.main() + # tvm.testing.main() + test_cache_read_specify_consumer() diff --git a/tests/python/unittest/test_te_schedule.py b/tests/python/unittest/test_te_schedule.py index 8b504df120e0..69478b451893 100644 --- a/tests/python/unittest/test_te_schedule.py +++ b/tests/python/unittest/test_te_schedule.py @@ -14,10 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import pickle as pkl + import pytest import tvm from tvm import te -import pickle as pkl def test_schedule_create(): @@ -297,8 +298,8 @@ def intrin_func(ins, outs, sp): stmt = tvm.lower(s, [A, C])["main"].body assert isinstance(stmt.body.body, tvm.tir.Evaluate) assert len(stmt.body.body.value.args) == 5 - assert str(stmt.body.body.value.args[3]) == "(i: int32*i)" - assert str(stmt.body.body.value.args[4]) == "(i: int32 + j: int32)" + assert str(stmt.body.body.value.args[3]) == "i * i" + assert str(stmt.body.body.value.args[4]) == "i + j" def test_legalize_invalid_attach(): diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index 83cd64fa229b..d4ae84a556d7 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -14,10 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np import pytest import tvm -from tvm import te, ir -import numpy as np +from tvm import ir, te def test_const(): @@ -142,7 +142,7 @@ def test_basic(): a = te.var("a") b = te.var("b") c = a + b - assert str(c) == "(%s: int32 + %s: int32)" % (a.name, b.name) + assert str(c) == "%s + %s" % (a.name, b.name) def test_stmt(): @@ -176,8 +176,8 @@ def test_any(): assert False except ValueError: pass - assert str(tvm.tir.any(x < y)) == "(%s: int32 < %s: int32)" % (x.name, y.name) - assert str(tvm.tir.any(x < y, x > z)) == "((%s: int32 < %s: int32) || (%s > %s: int32))" % ( + assert str(tvm.tir.any(x < y)) == "%s < %s" % (x.name, y.name) + assert str(tvm.tir.any(x < y, x > z)) == "%s < %s or %s > %s" % ( x.name, y.name, x.name, @@ -185,7 +185,7 @@ def test_any(): ) assert str( tvm.tir.any(x < y, y > z + 1, x < z * 2) - ) == "(((%s: int32 < %s: int32) || (%s > (%s: int32 + 1))) || (%s < (%s*2)))" % ( + ) == "%s < %s or %s > %s + 1 or %s < %s * 2" % ( x.name, y.name, y.name, @@ -209,8 +209,8 @@ def test_all(): assert False except ValueError: pass - assert str(tvm.tir.all(x < y)) == "(%s: int32 < %s: int32)" % (x.name, y.name) - assert str(tvm.tir.all(x < y, x > z)) == "((%s: int32 < %s: int32) && (%s > %s: int32))" % ( + assert str(tvm.tir.all(x < y)) == "%s < %s" % (x.name, y.name) + assert str(tvm.tir.all(x < y, x > z)) == "%s < %s and %s > %s" % ( x.name, y.name, x.name, @@ -218,7 +218,7 @@ def test_all(): ) assert str( tvm.tir.all(x < y, y > z + 1, x < z * 2) - ) == "(((%s: int32 < %s: int32) && (%s > (%s: int32 + 1))) && (%s < (%s*2)))" % ( + ) == "%s < %s and %s > %s + 1 and %s < %s * 2" % ( x.name, y.name, y.name, @@ -231,19 +231,19 @@ def test_all(): def test_bitwise(): x = te.var("x") y = te.var("y") - assert str(x << y) == "@tir.shift_left(x: int32, y: int32, dtype=int32)" - assert str(x >> y) == "@tir.shift_right(x: int32, y: int32, dtype=int32)" - assert str(x & y) == "@tir.bitwise_and(x: int32, y: int32, dtype=int32)" - assert str(x | y) == "@tir.bitwise_or(x: int32, y: int32, dtype=int32)" - assert str(x ^ y) == "@tir.bitwise_xor(x: int32, y: int32, dtype=int32)" - assert str(10 & x) == "@tir.bitwise_and(10, x: int32, dtype=int32)" - assert str(10 | x) == "@tir.bitwise_or(10, x: int32, dtype=int32)" - assert str(10 ^ x) == "@tir.bitwise_xor(10, x: int32, dtype=int32)" - assert str(10 >> x) == "@tir.shift_right(10, x: int32, dtype=int32)" - assert str(10 << x) == "@tir.shift_left(10, x: int32, dtype=int32)" - assert str(10 % x) == "floormod(10, x: int32)" - - assert str(~x) == "@tir.bitwise_not(x: int32, dtype=int32)" + 
assert str(x << y) == "T.shift_left(x, y)" + assert str(x >> y) == "T.shift_right(x, y)" + assert str(x & y) == "T.bitwise_and(x, y)" + assert str(x | y) == "T.bitwise_or(x, y)" + assert str(x ^ y) == "T.bitwise_xor(x, y)" + assert str(10 & x) == "T.bitwise_and(10, x)" + assert str(10 | x) == "T.bitwise_or(10, x)" + assert str(10 ^ x) == "T.bitwise_xor(10, x)" + assert str(10 >> x) == "T.shift_right(10, x)" + assert str(10 << x) == "T.shift_left(10, x)" + assert str(10 % x) == "10 % x" + + assert str(~x) == "T.bitwise_not(x)" assert (tvm.tir.const(1, "int8x2") >> 1).dtype == "int8x2" assert (x >> tvm.tir.const(1, "int32x2")).dtype == "int32x2" assert (te.var("z", "int8x2") << tvm.tir.const(1, "int8x2")).dtype == "int8x2" @@ -302,17 +302,17 @@ def test_divide_by_zero(): def test_infinity(): - assert str(tvm.tir.infinity("float16")) == "inff16" - assert str(tvm.tir.infinity("float32")) == "inff32" - assert str(tvm.tir.infinity("float64")) == "inff64" + assert str(tvm.tir.infinity("float16")) == 'T.float16("inf")' + assert str(tvm.tir.infinity("float32")) == 'T.float32("inf")' + assert str(tvm.tir.infinity("float64")) == 'T.float64("inf")' def test_isnan(): x = te.var("x", "float32") - assert str(tvm.tir.isnan(x)) == "@tir.isnan(x: float32, dtype=bool)" + assert str(tvm.tir.isnan(x)) == "T.isnan(x)" assert str(tvm.tir.isnan(x).dtype) == "bool" y = te.var("y", "float16") - assert str(tvm.tir.isnan(y)) == "@tir.isnan(cast(float32, y: float16), dtype=bool)" + assert str(tvm.tir.isnan(y)) == 'T.isnan(T.Cast("float32", y))' z = te.var("z", "int32") assert str(tvm.tir.isnan(z)) == "False" k = te.var("k", "int8x2") diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py index 7062d5129713..adf3d9da05ce 100644 --- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py +++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py @@ -14,17 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import tvm -from tvm.script import tir as T import numpy as np +import tvm import tvm.testing +from tvm.script import tir as T def count_cp_async(stmt): num_alloc = [0] def verify(n): - if isinstance(n, tvm.tir.Call) and str(n.op) == "tir.ptx_cp_async": + if isinstance(n, tvm.tir.Call) and n.op.name == "tir.ptx_cp_async": num_alloc[0] += 1 tvm.tir.stmt_functor.post_order_visit(stmt, verify) diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py index 70c14b02f0eb..d75fb2b03e39 100644 --- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py +++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py @@ -14,15 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import numpy as np +import pytest import tvm import tvm.script -from tvm.script import tir as T -from tvm import te -from tvm import topi +from tvm import te, topi from tvm.driver.build_module import get_binds -import numpy as np - -import pytest +from tvm.script import tir as T def _tile_nd(s, tensor, tile): @@ -271,7 +269,7 @@ def main(A: T.handle, tensor: T.handle) -> None: def test_rolling_buffer_ir_transform(): mod = PreRollingBuffer mod = tvm.tir.transform.InjectRollingBuffer()(mod) - script = mod.script(show_meta=True) + script = mod.script() mod = tvm.script.from_source(script) tvm.ir.assert_structural_equal(mod["main"], PostRollingBuffer["main"], True) diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py index 006b67d62697..cf01d7700725 100644 --- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py +++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py @@ -1507,7 +1507,7 @@ def test_async_pipelined_mma_gemm_simple(): assert body.block.body.body[1].block.body.body.value == 3 assert epilogue.block.body.body.block.body.body.attr_key == "async_wait_inflight_count" - assert str(epilogue.block.body.body.block.body.body.value) == "(2 - k_0_0: int32)" + assert str(epilogue.block.body.body.block.body.body.value) == "2 - k_0_0" build_and_run(sch) @@ -1554,7 +1554,7 @@ def test_async_nested_pipeline_mma_gemm_ideal_annotation(): assert body.block.body.body[1].block.body.body.attr_key == "async_wait_inflight_count" assert body.block.body.body[1].block.body.body.value == 2 - assert str(epilogue.block.body.body[0].block.body.body.value) == "(1 - k_0_0: int32)" + assert str(epilogue.block.body.body[0].block.body.body.value) == "1 - k_0_0" build_and_run(sch) diff --git a/tests/python/unittest/test_tir_transform_make_packed_api.py b/tests/python/unittest/test_tir_transform_make_packed_api.py index e78ed98d8569..47bb7bf228d4 100644 --- a/tests/python/unittest/test_tir_transform_make_packed_api.py +++ b/tests/python/unittest/test_tir_transform_make_packed_api.py @@ -83,19 +83,17 @@ def test_variable_passed_from_args(): # Arguments unpacking assignment = _find_assignment(func.body, "arg.input_buffer") - assert str(assignment.value) == "@tir.tvm_struct_get(args: handle, 0, 12, dtype=handle)" + assert str(assignment.value) == 'T.tvm_struct_get(args, 0, 12, "handle")' assignment = _find_assignment(func.body, "arg.not_device_context") - assert str(assignment.value) == "@tir.tvm_struct_get(args: handle, 1, 12, dtype=handle)" + assert str(assignment.value) == 'T.tvm_struct_get(args, 1, 12, "handle")' assignment = _find_assignment(func.body, "input_buffer") - assert ( - str(assignment.value) == "@tir.tvm_struct_get(arg.input_buffer: handle, 0, 1, dtype=handle)" - ) + assert str(assignment.value) == 'T.tvm_struct_get(arg_input_buffer, 0, 1, "handle")' unpacked_input_buffer = assignment.var assignment = _find_assignment(func.body, "not_device_context") - assert str(assignment.value) == "arg.not_device_context: handle" + assert str(assignment.value) == "arg_not_device_context" unpacked_not_device_context = assignment.var seq_stmt = _find_next(assignment, tvm.tir.SeqStmt) @@ -131,12 +129,10 @@ def test_device_api_context_implicit_resource_handle(): # Arguments unpacking assignment = _find_assignment(func.body, "arg.input_buffer") - assert str(assignment.value) == "@tir.tvm_struct_get(args: handle, 0, 12, dtype=handle)" + assert str(assignment.value) == 
'T.tvm_struct_get(args, 0, 12, "handle")' assignment = _find_assignment(func.body, "input_buffer") - assert ( - str(assignment.value) == "@tir.tvm_struct_get(arg.input_buffer: handle, 0, 1, dtype=handle)" - ) + assert str(assignment.value) == 'T.tvm_struct_get(arg_input_buffer, 0, 1, "handle")' unpacked_input_buffer = assignment.var seq_stmt = _find_next(assignment, tvm.tir.SeqStmt) diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py index 0c5d77d02b91..b2a0581d6980 100644 --- a/tests/python/unittest/test_tir_transform_thread_sync.py +++ b/tests/python/unittest/test_tir_transform_thread_sync.py @@ -92,7 +92,7 @@ def ir(A, B): stmt = ir(A, B) func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) mod = run_passes(func) - assert "@tir.tvm_storage_sync" in str(mod) + assert "T.tvm_storage_sync" in str(mod) @tvm.testing.requires_cuda @@ -115,7 +115,7 @@ def func(p0_arg: T.Buffer[(1, 2, 1, 1), "float32"], p1: T.Buffer[2, "float32"]) result_local[0] = result_local[0] + temp_shared[0] * p1[1] mod = run_passes(func) - assert "@tir.tvm_storage_sync" in str(mod) + assert "T.tvm_storage_sync" in str(mod) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py index 29ac5dc5da0d..2f81b0302626 100644 --- a/tests/python/unittest/test_tvmscript_complete.py +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -315,7 +315,7 @@ def expect_alloc_buffer_func(a: T.handle, b: T.handle) -> None: def test_complete_alloc_buffer(): - rt_func = tvm.script.from_source(alloc_buffer_func.script(show_meta=True)) + rt_func = tvm.script.from_source(alloc_buffer_func.script()) tvm.ir.assert_structural_equal(alloc_buffer_func, expect_alloc_buffer_func) diff --git a/tests/python/unittest/test_tvmscript_ops.py b/tests/python/unittest/test_tvmscript_ops.py index 3f30c6ddb0bc..e10681338727 100644 --- a/tests/python/unittest/test_tvmscript_ops.py +++ b/tests/python/unittest/test_tvmscript_ops.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -import tvm -from tvm.script import tir as T import numpy as np +import tvm import tvm.testing +from tvm.script import tir as T @T.prim_func @@ -152,8 +152,8 @@ def _check_alloc_zero_dim_buffer(f): def test_alloc_zero_dim_buffer_round_trip(): func = alloc_zero_dim_buffer func_with_block = alloc_zero_dim_buffer_block - rt_func = tvm.script.from_source(func.script(show_meta=True)) - rt_func_with_block = tvm.script.from_source(func_with_block.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) + rt_func_with_block = tvm.script.from_source(func_with_block.script()) rt_mod = tvm.build(rt_func, "llvm") rt_mod_with_block = tvm.build(rt_func_with_block, "llvm") tvm.ir.assert_structural_equal(func, func_with_block) diff --git a/tests/python/unittest/test_tvmscript_printer_ir.py b/tests/python/unittest/test_tvmscript_printer_ir.py new file mode 100644 index 000000000000..c3da3d8c702b --- /dev/null +++ b/tests/python/unittest/test_tvmscript_printer_ir.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +from tvm import IRModule +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import ir as I +from tvm.script.ir_builder import tir as T + + +def _assert_print(obj, expected): + assert str(obj).strip() == expected.strip() + assert repr(obj).strip() == expected.strip() + if isinstance(obj, IRModule): + assert obj.script().strip() == expected.strip() + + +def test_ir_module(): + with IRBuilder() as ib: # pylint: disable=invalid-name + with I.ir_module(): + with T.prim_func(): + T.func_name("foo") + mod = ib.get() + _assert_print( + mod, + """ +@I.ir_module +class Module: + @T.prim_func + def foo(): + T.evaluate(0)""", + ) + + +if __name__ == "__main__": + test_ir_module() diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index fd3bb3788cfb..9c15fbc88949 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -35,6 +35,9 @@ def verbose_expr(): def _assert_print(obj, expected): with verbose_expr(): + if isinstance(obj, (tir.PrimFunc, tir.PrimExpr, tir.Stmt)): + assert obj.script().strip() == expected.strip() + assert str(obj).strip() == expected.strip() assert repr(obj).strip() == expected.strip() @@ -54,7 +57,7 @@ def test_prim_func(): func, expected=""" @T.prim_func -def main(a: T.handle, b: T.handle) -> None: +def main(a: T.handle, b: T.handle): A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (256, 256)) T.evaluate(0)""", diff --git a/tests/python/unittest/test_tvmscript_regression.py b/tests/python/unittest/test_tvmscript_regression.py index 44d3036596ba..6678c10acd7a 100644 --- a/tests/python/unittest/test_tvmscript_regression.py +++ b/tests/python/unittest/test_tvmscript_regression.py @@ -15,12 +15,10 @@ # specific language governing permissions and limitations # under the License. import numpy - import tvm import tvm.testing from tvm.script import tir as T - # This numpy array is used to test the comparison between the global objects and the # `tvm.script.tir` submodule. 
np_array = numpy.array([0, 1, 2, 3]) @@ -42,7 +40,7 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: def test_multi_element_array_in_outmost_namespace(): func = matmul - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 0e9be0463943..0a6a2a26380c 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2591,7 +2591,7 @@ def test_module_define(): def test_matmul_original(): func = matmul_original() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body.block, tir.stmt.Block) @@ -2605,7 +2605,7 @@ def test_matmul_original(): def test_element_wise(): func = element_wise() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body.block, tir.stmt.Block) @@ -2621,7 +2621,7 @@ def test_element_wise(): def test_predicate(): func = predicate() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body.block, tir.stmt.Block) @@ -2648,7 +2648,7 @@ def for_thread_binding(a: T.handle, b: T.handle) -> None: def test_for_thread_binding(): func = for_thread_binding() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body, tir.stmt.For) @@ -2682,7 +2682,7 @@ def match_buffer_region(a: T.handle, b: T.handle) -> None: def test_match_buffer_region(): func = match_buffer_region() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body, tir.stmt.BlockRealize) @@ -2727,7 +2727,7 @@ def block_elements(a: T.handle, b: T.handle) -> None: def test_block_elements(): func = block_elements() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) assert isinstance(rt_func.body.block, tir.stmt.Block) @@ -2763,7 +2763,7 @@ def opaque_block(a: T.handle, b: T.handle) -> None: def test_opaque_block(): func = opaque_block() - rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func = tvm.script.from_source(func.script()) tvm.ir.assert_structural_equal(func, rt_func) root_block = rt_func.body.block @@ -2945,14 +2945,9 @@ def var_with_same_name(a: T.handle) -> None: def test_same_name_var(): func = var_with_same_name() - out_str = func.script(tir_prefix="T", show_meta=True) + out_str = func.script() rt_func = tvm.script.from_source(out_str) tvm.ir.assert_structural_equal(func, rt_func) - - assert out_str.count('vi, vj = T.axis.remap("SS", [i, j])') == 2 - assert out_str.find("vi_") == -1 - assert out_str.find("vj_") == -1 - assert out_str.count("for i, j in T.grid(16, 16)") == 2 assert out_str.find("i_") == -1 assert out_str.find("i_") == -1 @@ -3621,7 +3616,7 @@ def func(): def test_roundtrip(ir_generator): original = ir_generator() - 
after_roundtrip = tvm.script.from_source(original.script(show_meta=True)) + after_roundtrip = tvm.script.from_source(original.script()) tvm.ir.assert_structural_equal(original, after_roundtrip, True) From f560a4680fc0f3f63c9bf7669b4cd215d0593acd Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 17 Jan 2023 15:02:40 -0800 Subject: [PATCH 188/286] [skip ci][ci][docker] Add cross compilation libs (#13800) This updates ci_minimal to use the changes following on from #13714. Co-authored-by: driazati --- .asf.yaml | 1 + ci/jenkins/docker-images.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.asf.yaml b/.asf.yaml index 047a573e05c3..f9e1edea88af 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -68,6 +68,7 @@ github: - minimal/pr-head - riscv/pr-head - wasm/pr-head + - cross-isa-minimal/pr-head required_pull_request_reviews: required_approving_review_count: 1 diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index b4be9fdd109c..d1e34487bd61 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -24,6 +24,6 @@ ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea ci_i386: tlcpack/ci-i386:20221013-060115-61c9742ea ci_lint: tlcpack/ci-lint:20221013-060115-61c9742ea -ci_minimal: tlcpack/ci-minimal:20221013-060115-61c9742ea +ci_minimal: tlcpack/ci-minimal:20230117-070124-125886350 ci_riscv: tlcpack/ci-riscv:20221013-060115-61c9742ea ci_wasm: tlcpack/ci-wasm:20221013-060115-61c9742ea From 1a6dfda9d1404481c469ba1f7e12345a8a846e78 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Tue, 17 Jan 2023 21:14:27 -0800 Subject: [PATCH 189/286] [TVMScript] `T.axis.remap` syntax sugar for TVMScript printer (#13743) This PR implements the `T.axis.remap` syntax sugar for the new TVMScript printer. The printer synthesizes `T.axis.remap` whenever two or more consecutive block iteration variables have simple bindings, i.e. each binds directly to a loop variable whose domain matches the block var's domain. For example, it will change ```python for i, j, k in T.grid(128, 128, 128): with T.block("update"): vi = T.axis.spatial(128, i) vj = T.axis.spatial(128, j) vk = T.axis.reduce(128, k) ``` into ```python for i, j, k in T.grid(128, 128, 128): with T.block("update"): vi, vj, vk = T.axis.remap("SSR", [i, j, k]) ``` Co-authored-by: Junru Shao --- src/script/printer/tir/block.cc | 80 ++++++++++++++++++- .../test_tvmscript_printer_syntax_sugar.py | 69 ++++++++++++++++ 2 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 tests/python/unittest/test_tvmscript_printer_syntax_sugar.py diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc index e7f733864cc5..069ec7f3ea41 100644 --- a/src/script/printer/tir/block.cc +++ b/src/script/printer/tir/block.cc @@ -30,8 +30,42 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // opt_realize.defined() ? opt_realize.value().get() : nullptr; const ObjectPathNode* realize_p = opt_realize_p.defined() ? opt_realize_p.get() : nullptr; // Step 1. Handle block var and block bindings - int n_vars = block->iter_vars.size(); - for (int i = 0; i < n_vars; ++i) { + // Step 1.1.
Obtain all loop var defined along path + std::unordered_map loop_vars; + for (Frame f : d->frames) { + if (const auto* tir_f = f.as()) { + if (const auto* for_loop = tir_f->tir.as()) { + for (const tir::ForNode* l = for_loop; l != nullptr; l = l->body.as()) { + loop_vars.insert(std::make_pair(l->loop_var.get(), GetRef(l))); + } + } + } + } + + std::vector remap_vars_indices; + auto add_remapped_iter_var = [&](int i) -> bool { + if (realize) { + tir::ExprDeepEqual expr_equal; + tir::IterVar iter_var = block->iter_vars[i]; + PrimExpr value = realize->iter_values[i]; + if (iter_var->iter_type == tir::IterVarType::kDataPar || + iter_var->iter_type == tir::IterVarType::kCommReduce) { + if (const auto* var = value.as()) { + if (loop_vars.count(var)) { + tir::For for_loop = loop_vars.at(var); + if (expr_equal(for_loop->min, iter_var->dom->min) && + expr_equal(for_loop->extent, iter_var->dom->extent)) { + remap_vars_indices.push_back(i); + return true; + } + } + } + } + } + return false; + }; + + auto print_single_iter_var = [&](int i) { tir::IterVar iter_var = block->iter_vars[i]; ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); ExprDoc rhs = TIR("axis"); @@ -66,7 +100,49 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // rhs = rhs->Call({dom}); } (*frame)->stmts.push_back(AssignDoc(DefineVar(iter_var->var, *frame, d), rhs, NullOpt)); + }; + + auto print_remapped_iter_var = [&]() { + if (remap_vars_indices.size()) { + int m = remap_vars_indices.size(); + if (!m) { + return; + } + if (m == 1) { + print_single_iter_var(remap_vars_indices[0]); + remap_vars_indices.clear(); + return; + } + Array lhs; + Array loop_var_doc; + lhs.reserve(m); + loop_var_doc.reserve(m); + std::string binding_type = ""; + for (int i : remap_vars_indices) { + tir::IterVar iter_var = block->iter_vars[i]; + ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); + lhs.push_back(DefineVar(iter_var->var, *frame, d)); + loop_var_doc.push_back(d->AsDoc(realize->iter_values[i], + realize_p->Attr("iter_values")->ArrayIndex(i))); + binding_type += iter_var->iter_type == tir::IterVarType::kDataPar ? "S" : "R"; + } + ExprDoc rhs = TIR("axis")->Attr("remap"); + rhs = rhs->Call({LiteralDoc::Str(binding_type), ListDoc(loop_var_doc)}); + (*frame)->stmts.push_back(AssignDoc(TupleDoc(lhs), rhs, NullOpt)); + remap_vars_indices.clear(); + } + }; + + // Step 1.2. Construct all block var bindings + int n_vars = block->iter_vars.size(); + for (int i = 0; i < n_vars; ++i) { + if (!add_remapped_iter_var(i)) { + print_remapped_iter_var(); + print_single_iter_var(i); + } } + print_remapped_iter_var(); + // Step 2. Handle block predicate if (realize) { ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool()); diff --git a/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py b/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py new file mode 100644 index 000000000000..1bccb8188c9d --- /dev/null +++ b/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest +import tvm.testing +from tvm.script.parser import tir as T +from tvm.script import script + + +def _test(obj, expected: str): + assert script(obj).strip() == expected.strip() + + +def test_remap(): + @T.prim_func + def block_with_remap_implicitly(): + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1 = T.axis.spatial(128, i1) + v2 = T.axis.reduce(128, i2) + v3 = T.axis.spatial(128, i3 - 1) + v4 = T.axis.reduce(128, i4) + v5 = T.axis.spatial(128, i5) + pass + + @T.prim_func + def block_with_remap_explicitly(): + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1, v2 = T.axis.remap("SR", [i1, i2]) + v3 = T.axis.spatial(128, i3 - 1) + v4, v5 = T.axis.remap("RS", [i4, i5]) + pass + + expected_output = """@T.prim_func +def main(): + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1, v2 = T.axis.remap("SR", [i1, i2]) + v3 = T.axis.spatial(128, i3 - 1) + v4, v5 = T.axis.remap("RS", [i4, i5]) + T.reads() + T.writes() + T.evaluate(0)""" + _test(block_with_remap_implicitly, expected_output) + _test(block_with_remap_explicitly, expected_output) + + +if __name__ == "__main__": + tvm.testing.main() From fcc06f309df6697e64d4c320d05d7ef31e48e044 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 18 Jan 2023 05:42:24 -0800 Subject: [PATCH 190/286] [TVMScript] Use TVMScript for all TIR Printing (#13795) --- CMakeLists.txt | 2 +- include/tvm/ir/module.h | 28 ---- include/tvm/ir/transform.h | 1 - include/tvm/relay/base.h | 28 ++++ include/tvm/{ir => relay}/error.h | 13 +- include/tvm/relay/expr.h | 1 - include/tvm/relay/expr_functor.h | 2 +- include/tvm/relay/pattern_functor.h | 2 +- python/tvm/ir/__init__.py | 1 - python/tvm/ir/affine_type.py | 2 +- python/tvm/ir/base.py | 31 ----- python/tvm/ir/expr.py | 32 ++++- python/tvm/ir/module.py | 28 ++++ python/tvm/ir/op.py | 31 ++++- python/tvm/ir/tensor_type.py | 2 +- python/tvm/micro/model_library_format.py | 7 +- python/tvm/relay/__init__.py | 1 + python/tvm/relay/base.py | 39 +++++- python/tvm/relay/dataflow_pattern/__init__.py | 29 +++- python/tvm/relay/expr.py | 34 ++++- python/tvm/relay/function.py | 29 +++- python/tvm/script/__init__.py | 1 - python/tvm/script/printer/__init__.py | 1 - python/tvm/script/printer/printer.py | 54 -------- rust/tvm/src/ir/expr.rs | 2 +- src/ir/transform.cc | 6 +- src/relay/analysis/annotated_region_set.cc | 2 +- src/relay/analysis/annotated_region_set.h | 2 +- src/relay/analysis/kind_check.cc | 2 +- src/relay/analysis/match_exhaustion.cc | 2 +- src/relay/analysis/type_solver.h | 2 +- src/relay/backend/contrib/ethosu/codegen.cc | 2 +- .../backend/contrib/ethosu/compiler_attrs.cc | 2 +- .../backend/contrib/ethosu/preprocess.cc | 2 +- src/relay/backend/contrib/uma/relay_to_tir.cc | 2 +- src/relay/backend/vm/compiler.cc | 2 +- src/relay/backend/vm/compiler.h | 2 +- src/relay/collage/partition_rule.h | 2 +- 
src/relay/ir/base.cc | 5 + src/{ => relay}/ir/error.cc | 11 +- src/relay/op/tensor/transform.cc | 2 +- src/relay/op/tensor/transform.h | 2 +- src/relay/op/type_relations.h | 2 +- src/{ => relay}/printer/doc.cc | 4 +- src/{ => relay}/printer/doc.h | 9 +- src/{ => relay}/printer/meta_data.h | 13 +- .../printer/model_library_format_printer.cc | 6 +- src/{ => relay}/printer/relay_text_printer.cc | 13 +- src/{ => relay}/printer/text_printer.cc | 9 +- src/{ => relay}/printer/text_printer.h | 47 +++---- src/{ => relay}/printer/tir_text_printer.cc | 28 ++-- .../printer/tir_text_printer_debug.cc | 4 +- .../printer/tir_text_printer_debug.h | 10 +- src/{ => relay}/printer/tvmscript_printer.cc | 85 ++++++------ .../transforms/merge_compiler_regions.cc | 2 +- src/relay/transforms/partition_graph.cc | 2 +- src/script/printer/printer.cc | 7 - src/tir/schedule/error.cc | 6 +- src/tir/transforms/install_debug_spans.cc | 4 +- tests/python/relay/test_ir_parser.py | 10 +- .../test_meta_schedule_schedule_rule_mlt.py | 3 +- tests/python/unittest/test_tir_nodes.py | 126 ------------------ .../test_tir_transform_lower_warp_memory.py | 9 +- .../test_tvmscript_printer_syntax_sugar.py | 69 ---------- .../unittest/test_tvmscript_printer_tir.py | 42 ++++++ 65 files changed, 447 insertions(+), 514 deletions(-) rename include/tvm/{ir => relay}/error.h (97%) delete mode 100644 python/tvm/script/printer/printer.py rename src/{ => relay}/ir/error.cc (97%) rename src/{ => relay}/printer/doc.cc (98%) rename src/{ => relay}/printer/doc.h (97%) rename src/{ => relay}/printer/meta_data.h (95%) rename src/{ => relay}/printer/model_library_format_printer.cc (96%) rename src/{ => relay}/printer/relay_text_printer.cc (99%) rename src/{ => relay}/printer/text_printer.cc (95%) rename src/{ => relay}/printer/text_printer.h (95%) rename src/{ => relay}/printer/tir_text_printer.cc (97%) rename src/{ => relay}/printer/tir_text_printer_debug.cc (98%) rename src/{ => relay}/printer/tir_text_printer_debug.h (90%) rename src/{ => relay}/printer/tvmscript_printer.cc (96%) delete mode 100644 tests/python/unittest/test_tvmscript_printer_syntax_sugar.py diff --git a/CMakeLists.txt b/CMakeLists.txt index bd69c9d7f120..f51233d244e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,7 +288,6 @@ tvm_file_glob(GLOB_RECURSE COMPILER_SRCS src/topi/*.cc src/driver/*.cc src/parser/*.cc - src/printer/*.cc src/support/*.cc src/script/*.cc ) @@ -317,6 +316,7 @@ tvm_file_glob(GLOB RELAY_BACKEND_SRCS ) tvm_file_glob(GLOB_RECURSE RELAY_IR_SRCS src/relay/ir/*.cc + src/relay/printer/*.cc ) tvm_file_glob(GLOB_RECURSE RELAY_QNN_SRCS src/relay/qnn/*.cc diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index f26e640f6c22..4cd357d4180b 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -446,34 +446,6 @@ class IRModule : public ObjectRef { TVM_DEFINE_OBJECT_REF_COW_METHOD(IRModuleNode); }; -/*! - * \brief Pretty print a node for debug purposes. - * - * \param node The node to be printed. - * \return The text reperesentation. - * \note This function does not show version or meta-data. - * Use AsText if you want to store the text. - * \sa AsText. - */ -TVM_DLL String PrettyPrint(const ObjectRef& node); - -/*! - * \brief Render the node as a string in the text format. - * - * \param node The node to be rendered. - * \param show_meta_data Whether to print meta data section. - * \param annotate An optional callback function for attaching - * additional comment block to an expr. 
- * - * \note We support a limited set of IR nodes that are part of - * relay IR and - * - * \sa PrettyPrint. - * \return The text representation. - */ -TVM_DLL String AsText(const ObjectRef& node, bool show_meta_data = true, - runtime::TypedPackedFunc annotate = nullptr); - namespace attr { // Following are attributes for IRModule only. diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h index febcca5c0107..473e6291685d 100644 --- a/include/tvm/ir/transform.h +++ b/include/tvm/ir/transform.h @@ -57,7 +57,6 @@ #define TVM_IR_TRANSFORM_H_ #include -#include #include #include #include diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index e94bd2756e98..2825bcfc659a 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -120,6 +120,34 @@ class Id : public ObjectRef { TVM_DEFINE_OBJECT_REF_METHODS(Id, ObjectRef, IdNode); }; +/*! + * \brief Pretty print a node for debug purposes. + * + * \param node The node to be printed. + * \return The text reperesentation. + * \note This function does not show version or meta-data. + * Use AsText if you want to store the text. + * \sa AsText. + */ +TVM_DLL String PrettyPrint(const ObjectRef& node); + +/*! + * \brief Render the node as a string in the text format. + * + * \param node The node to be rendered. + * \param show_meta_data Whether to print meta data section. + * \param annotate An optional callback function for attaching + * additional comment block to an expr. + * + * \note We support a limited set of IR nodes that are part of + * relay IR and + * + * \sa PrettyPrint. + * \return The text representation. + */ +TVM_DLL String AsText(const ObjectRef& node, bool show_meta_data = true, + runtime::TypedPackedFunc annotate = nullptr); + } // namespace relay } // namespace tvm diff --git a/include/tvm/ir/error.h b/include/tvm/relay/error.h similarity index 97% rename from include/tvm/ir/error.h rename to include/tvm/relay/error.h index 6ff61781ac44..be34e2b8ae1a 100644 --- a/include/tvm/ir/error.h +++ b/include/tvm/relay/error.h @@ -16,13 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - -/*! - * \file tvm/ir/error.h - * \brief Utilities for error tracking and reporting. - */ -#ifndef TVM_IR_ERROR_H_ -#define TVM_IR_ERROR_H_ +#ifndef TVM_RELAY_ERROR_H_ +#define TVM_RELAY_ERROR_H_ #include #include @@ -33,6 +28,7 @@ #include namespace tvm { +namespace relay { /*! * \brief A wrapper around std::stringstream to build error. * @@ -181,5 +177,6 @@ class ErrorReporter { std::unordered_map node_to_gv_; }; +} // namespace relay } // namespace tvm -#endif // TVM_IR_ERROR_H_ +#endif // TVM_RELAY_ERROR_H_ diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index 6847a53caad4..854050464d4a 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -57,7 +57,6 @@ using BaseFunc = tvm::BaseFunc; using BaseFuncNode = tvm::BaseFuncNode; using GlobalVar = tvm::GlobalVar; using GlobalVarNode = tvm::GlobalVarNode; -using tvm::PrettyPrint; /*! * \brief Constant tensor, backed by an NDArray on the cpu(0) device. 
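For readers tracking the user-facing side of this move: after this patch the legacy text printer is reached through `tvm.relay` rather than `tvm.ir`. The sketch below is illustrative only and not part of the patch; it assumes a build with Relay enabled and uses the `pretty_print`/`astext` re-exports that `python/tvm/relay/__init__.py` adds later in this diff:

```python
import tvm
from tvm import relay

# A trivial Relay module: f(x) = x + x on a 2x2 float32 tensor.
x = relay.var("x", shape=(2, 2), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.add(x, x)))

# These helpers used to live in tvm.ir; they now live in tvm.relay and
# dispatch to the relay.ir.PrettyPrint / relay.ir.AsText packed functions.
print(relay.pretty_print(mod))                 # no meta data section
print(relay.astext(mod, show_meta_data=False))
```

`IRModule.astext()` itself keeps working: as the `python/tvm/ir/module.py` hunk later in this diff shows, it now forwards to `tvm.relay.astext` internally.
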
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index 280a1f8a6c29..2a295c9da7f9 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -25,9 +25,9 @@ #ifndef TVM_RELAY_EXPR_FUNCTOR_H_ #define TVM_RELAY_EXPR_FUNCTOR_H_ -#include #include #include +#include #include #include #include diff --git a/include/tvm/relay/pattern_functor.h b/include/tvm/relay/pattern_functor.h index 711d8323f158..9d2b6689b2c2 100644 --- a/include/tvm/relay/pattern_functor.h +++ b/include/tvm/relay/pattern_functor.h @@ -25,8 +25,8 @@ #ifndef TVM_RELAY_PATTERN_FUNCTOR_H_ #define TVM_RELAY_PATTERN_FUNCTOR_H_ -#include #include +#include #include #include diff --git a/python/tvm/ir/__init__.py b/python/tvm/ir/__init__.py index 9e81dd5519e1..4f63cbecd9d1 100644 --- a/python/tvm/ir/__init__.py +++ b/python/tvm/ir/__init__.py @@ -27,7 +27,6 @@ Span, assert_structural_equal, load_json, - pretty_print, save_json, structural_equal, structural_hash, diff --git a/python/tvm/ir/affine_type.py b/python/tvm/ir/affine_type.py index 8d185ae59a34..24126f94b9c4 100644 --- a/python/tvm/ir/affine_type.py +++ b/python/tvm/ir/affine_type.py @@ -32,7 +32,7 @@ def __ne__(self, other): return not self.__eq__(other) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + from tvm.relay import pretty_print # pylint: disable=import-outside-toplevel return pretty_print(self) diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py index a1e1d20d8823..b84a83d55843 100644 --- a/python/tvm/ir/base.py +++ b/python/tvm/ir/base.py @@ -23,40 +23,9 @@ from . import _ffi_api, json_compact -def pretty_print(obj: Object) -> None: - """Pretty print the object.""" - return _ffi_api.PrettyPrint(obj) # type: ignore # pylint: disable=no-member - - class Node(Object): """Base class of all IR Nodes, implements astext function.""" - def astext(self, show_meta_data=True, annotate=None): - """Get the text format of the expression. - - Parameters - ---------- - show_meta_data : bool - Whether to include meta data section in the text - if there is meta data. - - annotate: Optional[Object->str] - Optionally annotate function to provide additional - information in the comment block. - - Returns - ------- - text : str - The text format of the expression. - - Notes - ----- - The meta data section is necessary to fully parse the text format. - However, it can contain dumps that are big (e.g constant weights), - so it can be helpful to skip printing the meta data section. - """ - return _ffi_api.AsText(self, show_meta_data, annotate) - @tvm._ffi.register_object("SourceName") class SourceName(Object): diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py index e16cd5ea9e2f..52af8407b7a0 100644 --- a/python/tvm/ir/expr.py +++ b/python/tvm/ir/expr.py @@ -17,9 +17,9 @@ """Common expressions data structures in the IR.""" import tvm._ffi -from .base import Node -from . import _ffi_api from ..runtime import const, convert +from . import _ffi_api +from .base import Node class BaseExpr(Node): @@ -91,6 +91,34 @@ def __call__(self, *args): "Do not know how to handle GlobalVar.__call__ for types {}".format(arg_types) ) + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. 
+ + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. + """ + from tvm.relay import astext # pylint: disable=import-outside-toplevel + + return astext(self, show_meta_data, annotate) + @tvm._ffi.register_object class Range(Node): diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py index b184c3b0c3cf..51410049ec74 100644 --- a/python/tvm/ir/module.py +++ b/python/tvm/ir/module.py @@ -287,6 +287,34 @@ def with_attr(self, attr_key, attr_value): return _ffi_api.Module_WithAttr(self, attr_key, attr_value) + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. + + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. + """ + from tvm.relay import astext # pylint: disable=import-outside-toplevel + + return astext(self, show_meta_data, annotate) + def script( self, *, diff --git a/python/tvm/ir/op.py b/python/tvm/ir/op.py index 49ac72b887e6..70aba979518e 100644 --- a/python/tvm/ir/op.py +++ b/python/tvm/ir/op.py @@ -17,8 +17,9 @@ # pylint: disable=invalid-name """Primitive operators in the TVM IR.""" import tvm._ffi -from .expr import RelayExpr + from . import _ffi_api +from .expr import RelayExpr @tvm._ffi.register_object("Op") @@ -28,6 +29,34 @@ class Op(RelayExpr): def __init__(self): raise RuntimeError("Cannot create op, use get instead") + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. + + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. 
+ """ + from tvm.relay import astext # pylint: disable=import-outside-toplevel + + return astext(self, show_meta_data, annotate) + @staticmethod def get(op_name): """Get the Op for a given name diff --git a/python/tvm/ir/tensor_type.py b/python/tvm/ir/tensor_type.py index 7313f3c2b42c..495e0fe868e5 100644 --- a/python/tvm/ir/tensor_type.py +++ b/python/tvm/ir/tensor_type.py @@ -56,6 +56,6 @@ def concrete_shape(self): return tuple(int(x) for x in self.shape) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel + from tvm.relay import pretty_print # pylint: disable=import-outside-toplevel return pretty_print(self) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index df5170f0e025..cf205c414f8f 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -27,12 +27,13 @@ import tvm from tvm.micro import get_standalone_crt_dir + from .._ffi import get_global_func from ..contrib import utils from ..driver import build_module -from ..relay.backend import executor_factory -from ..relay.backend.name_transforms import to_c_variable_style, prefix_generated_name from ..relay import param_dict +from ..relay.backend import executor_factory +from ..relay.backend.name_transforms import prefix_generated_name, to_c_variable_style from ..tir import expr # This should be kept identical to runtime::symbol::tvm_module_main @@ -528,7 +529,7 @@ def _eval_shape(param_name, buffer_shape): # TODO(mbs): The device type is not unique, better would be to use target.kind.name target_device_type = target.get_target_device_type() ir_mod = ir_module_by_target[target] - printer = get_global_func("tir.ModelLibraryFormatPrinter")(False, None, False) + printer = get_global_func("relay.ir.ModelLibraryFormatPrinter")(False, None, False) with open(src_dir / f"tir-{target_device_type}.txt", "w") as f: f.write(printer["print"](ir_mod)) diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 97842738e5cd..5e5d1d5f18d8 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -29,6 +29,7 @@ from . import prelude from . import loops from . import scope_builder +from .base import pretty_print, astext from . import transform from . import analysis diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py index 323a8f6e5a01..8667bfb1dfdc 100644 --- a/python/tvm/relay/base.py +++ b/python/tvm/relay/base.py @@ -17,15 +17,50 @@ # pylint: disable=no-else-return, unidiomatic-typecheck, unused-import """The base node types for the Relay language.""" import os -import tvm._ffi +import tvm._ffi +from tvm.ir import Node as RelayNode +from tvm.ir import SourceName, Span from tvm.runtime import Object -from tvm.ir import SourceName, Span, Node as RelayNode +from . import _ffi_api __STD_PATH__ = os.path.join(os.path.dirname(os.path.realpath(__file__)), "std") +def pretty_print(obj: Object) -> None: + """Pretty print the object.""" + return _ffi_api.PrettyPrint(obj) # type: ignore # pylint: disable=no-member + + +def astext(obj: Object, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + obj : Object + The object to be printed. + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. 
+ + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. + """ + return _ffi_api.AsText(obj, show_meta_data, annotate) # type: ignore # pylint: disable=no-member + + @tvm._ffi.register_func("tvm.relay.std_path") def _std_path(): return __STD_PATH__ diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py index 6c29825bc04d..6e19cafa747d 100644 --- a/python/tvm/relay/dataflow_pattern/__init__.py +++ b/python/tvm/relay/dataflow_pattern/__init__.py @@ -26,6 +26,7 @@ from ...ir import make_node from ...ir.base import Node from ...runtime import Object +from ..base import astext, pretty_print from ..op import get from . import _ffi as ffi @@ -47,10 +48,34 @@ class DFPattern(Node): """Base class of all Patterns.""" def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel - return pretty_print(self) + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. + + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. + """ + return astext(self, show_meta_data, annotate) + def __call__(self, *args): args = list(args) if len(args) == 1 and args[0] is None: diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 7d60e89b59b7..cb14552ac16e 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -30,7 +30,7 @@ from . import _ffi_api from . import ty as _ty -from .base import RelayNode +from .base import RelayNode, astext, pretty_print # alias relay expr as Expr. Expr = RelayExpr @@ -62,10 +62,34 @@ def astype(self, dtype): return _ffi_api.cast(self, dtype) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel - return pretty_print(self) + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. + + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. 
+ """ + return astext(self, show_meta_data, annotate) + def __neg__(self): return _op_make.negative(self) @@ -719,8 +743,6 @@ def __init__(self, sids, dev_types, sizes): self.__init_handle_by_constructor__(_ffi_api.StorageInfo, sids, dev_types, sizes) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel - return pretty_print(self) @property @@ -750,6 +772,4 @@ def __init__(self, expr_to_storage_info): self.__init_handle_by_constructor__(_ffi_api.StaticMemoryPlan, expr_to_storage_info) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel - return pretty_print(self) diff --git a/python/tvm/relay/function.py b/python/tvm/relay/function.py index ef3356450085..dc0636a9b3f4 100644 --- a/python/tvm/relay/function.py +++ b/python/tvm/relay/function.py @@ -23,6 +23,7 @@ from tvm.runtime import convert from . import _ffi_api +from .base import astext, pretty_print from .expr import Call @@ -68,10 +69,34 @@ def __call__(self, *args): return Call(self, args, None, None) def __str__(self): - from tvm.ir import pretty_print # pylint: disable=import-outside-toplevel - return pretty_print(self) + def astext(self, show_meta_data=True, annotate=None): + """Get the text format of the expression. + + Parameters + ---------- + show_meta_data : bool + Whether to include meta data section in the text + if there is meta data. + + annotate: Optional[Object->str] + Optionally annotate function to provide additional + information in the comment block. + + Returns + ------- + text : str + The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. + """ + return astext(self, show_meta_data, annotate) + @tvm._ffi.register_func("relay.FunctionWithFields") def FunctionWithFields( diff --git a/python/tvm/script/__init__.py b/python/tvm/script/__init__.py index 82bb698f2773..9283727ad41a 100644 --- a/python/tvm/script/__init__.py +++ b/python/tvm/script/__init__.py @@ -18,4 +18,3 @@ from .parser import ir, ir_module from .parser import parse as from_source from .parser import tir -from .printer import script diff --git a/python/tvm/script/printer/__init__.py b/python/tvm/script/printer/__init__.py index dc37ea1ff6a6..01d89dacbf52 100644 --- a/python/tvm/script/printer/__init__.py +++ b/python/tvm/script/printer/__init__.py @@ -20,4 +20,3 @@ in a roundtrippable way. """ from . import default -from .printer import script diff --git a/python/tvm/script/printer/printer.py b/python/tvm/script/printer/printer.py deleted file mode 100644 index 2ce6329dca08..000000000000 --- a/python/tvm/script/printer/printer.py +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -"""The printer interface""" -from typing import Optional - -from tvm.runtime.object_path import ObjectPath - -from . import _ffi_api - - -def script( - obj, - indent_space: int = 4, - print_line_number: bool = False, - num_context_lines: int = -1, - path_to_underline: Optional[ObjectPath] = None, -): - """Print a TVM IR as a TVMScript text format. - - Parameters - ---------- - obj : object - An TVM object representing TVM IR - indent_space : int = 4 - The number of spaces to indent - print_line_number : bool = False - Whether to print line number - num_context_lines : int = -1 - The number of context lines to print. -1 means all lines. - path_to_underline : Optional[ObjectPath] - The path to underline in the script. - - Returns - ------- - script : str - The TVMScript text format - """ - return _ffi_api.Script( # type: ignore # pylint: disable=no-member - obj, indent_space, print_line_number, num_context_lines, path_to_underline - ) diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs index 03d8a4920718..1a0e7aea39c9 100644 --- a/rust/tvm/src/ir/expr.rs +++ b/rust/tvm/src/ir/expr.rs @@ -90,7 +90,7 @@ impl GlobalVar { // TODO: figure out how to type the last argument runtime::TypedPackedFunc annotate) external! { - #[name("ir.AsText")] + #[name("relay.ir.AsText")] fn _as_text(object: ObjectRef, show_meta_data: i32, annotate: runtime::Function) -> TString; } diff --git a/src/ir/transform.cc b/src/ir/transform.cc index bfd0a5917556..9a669493ccb7 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -587,7 +587,11 @@ TVM_REGISTER_GLOBAL("transform.OverrideInstruments") Pass PrintIR(String header, bool show_meta_data) { auto pass_func = [header, show_meta_data](IRModule mod, const PassContext& ctx) { - LOG(INFO) << "PrintIR(" << header << "):\n" << AsText(mod, show_meta_data); + if (const auto* f = runtime::Registry::Get("relay.PrintIR")) { + (*f)(mod, header, show_meta_data); + } else { + LOG(INFO) << "PrintIR(" << header << "):\n" << mod; + } return mod; }; return CreateModulePass(pass_func, 0, "PrintIR", {}); diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc index 53c680b722cd..ef21604d8a71 100644 --- a/src/relay/analysis/annotated_region_set.cc +++ b/src/relay/analysis/annotated_region_set.cc @@ -19,7 +19,7 @@ #include "annotated_region_set.h" -#include +#include #include #include diff --git a/src/relay/analysis/annotated_region_set.h b/src/relay/analysis/annotated_region_set.h index aca42397916c..443bd5ec1da3 100644 --- a/src/relay/analysis/annotated_region_set.h +++ b/src/relay/analysis/annotated_region_set.h @@ -27,9 +27,9 @@ #ifndef TVM_RELAY_ANALYSIS_ANNOTATED_REGION_SET_H_ #define TVM_RELAY_ANALYSIS_ANNOTATED_REGION_SET_H_ -#include #include #include +#include #include #include #include diff --git a/src/relay/analysis/kind_check.cc b/src/relay/analysis/kind_check.cc index 65b8516cb16c..f7a5e7bf2d12 100644 --- a/src/relay/analysis/kind_check.cc +++ b/src/relay/analysis/kind_check.cc @@ -31,9 +31,9 @@ * We check this by ensuring the `dtype` field of a Tensor always * contains a data type such as `int`, `float`, `uint`. 
*/ -#include #include #include +#include namespace tvm { namespace relay { diff --git a/src/relay/analysis/match_exhaustion.cc b/src/relay/analysis/match_exhaustion.cc index 2a90b911b676..05d5b36e3614 100644 --- a/src/relay/analysis/match_exhaustion.cc +++ b/src/relay/analysis/match_exhaustion.cc @@ -27,8 +27,8 @@ * code correctness, since hitting an unmatched case results in a * dynamic error unless exhaustiveness is checked in advance. */ -#include #include +#include #include #include diff --git a/src/relay/analysis/type_solver.h b/src/relay/analysis/type_solver.h index 3bde1a1e3746..7940e347b3ea 100644 --- a/src/relay/analysis/type_solver.h +++ b/src/relay/analysis/type_solver.h @@ -24,8 +24,8 @@ #ifndef TVM_RELAY_ANALYSIS_TYPE_SOLVER_H_ #define TVM_RELAY_ANALYSIS_TYPE_SOLVER_H_ -#include #include +#include #include #include diff --git a/src/relay/backend/contrib/ethosu/codegen.cc b/src/relay/backend/contrib/ethosu/codegen.cc index afa17750d8a8..a622f96c81da 100644 --- a/src/relay/backend/contrib/ethosu/codegen.cc +++ b/src/relay/backend/contrib/ethosu/codegen.cc @@ -24,9 +24,9 @@ * Codegen. */ -#include #include #include +#include #include #include #include diff --git a/src/relay/backend/contrib/ethosu/compiler_attrs.cc b/src/relay/backend/contrib/ethosu/compiler_attrs.cc index 42add45b013c..6c825a18901a 100644 --- a/src/relay/backend/contrib/ethosu/compiler_attrs.cc +++ b/src/relay/backend/contrib/ethosu/compiler_attrs.cc @@ -17,9 +17,9 @@ * under the License. */ -#include #include #include +#include #include #include #include diff --git a/src/relay/backend/contrib/ethosu/preprocess.cc b/src/relay/backend/contrib/ethosu/preprocess.cc index 571a56ad97c0..a0e0ac772fb0 100644 --- a/src/relay/backend/contrib/ethosu/preprocess.cc +++ b/src/relay/backend/contrib/ethosu/preprocess.cc @@ -16,9 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -#include #include #include +#include #include #include #include diff --git a/src/relay/backend/contrib/uma/relay_to_tir.cc b/src/relay/backend/contrib/uma/relay_to_tir.cc index 8aed69453158..ca3ae0ebec6b 100644 --- a/src/relay/backend/contrib/uma/relay_to_tir.cc +++ b/src/relay/backend/contrib/uma/relay_to_tir.cc @@ -23,9 +23,9 @@ * \brief this file contains the target hooks for the Universal Modular Accelerator Interface (UMA). 
*/ -#include #include #include +#include #include #include #include diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 9ba90b9f676d..fb23c4cc082a 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -25,11 +25,11 @@ #include "compiler.h" #include -#include #include #include #include #include +#include #include #include #include diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 163ec399013b..9160ce0e2e42 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -25,7 +25,7 @@ #ifndef TVM_RELAY_BACKEND_VM_COMPILER_H_ #define TVM_RELAY_BACKEND_VM_COMPILER_H_ -#include +#include #include #include #include diff --git a/src/relay/collage/partition_rule.h b/src/relay/collage/partition_rule.h index 19e7f3ccebfb..ca68c9b086b0 100644 --- a/src/relay/collage/partition_rule.h +++ b/src/relay/collage/partition_rule.h @@ -31,7 +31,7 @@ #include #include -#include "../../printer/doc.h" +#include "../printer/doc.h" #include "./candidate_partition.h" #include "./combiner_rule.h" #include "./sub_graph.h" diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc index 5f7b8747a751..5f913026080d 100644 --- a/src/relay/ir/base.cc +++ b/src/relay/ir/base.cc @@ -51,5 +51,10 @@ TVM_REGISTER_GLOBAL("ir.NodeSetSpan").set_body_typed([](ObjectRef node_ref, Span } }); +TVM_REGISTER_GLOBAL("relay.PrintIR") + .set_body_typed([](ObjectRef mod, String header, bool show_metadata) { + LOG(INFO) << "PrintIR(" << header << "):\n" << AsText(mod, show_metadata); + }); + } // namespace relay } // namespace tvm diff --git a/src/ir/error.cc b/src/relay/ir/error.cc similarity index 97% rename from src/ir/error.cc rename to src/relay/ir/error.cc index 26448d04005c..940efd91aa52 100644 --- a/src/ir/error.cc +++ b/src/relay/ir/error.cc @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ - -/*! - * \file ir/error.cc - * \brief Utilities for error tracking and reporting. 
- */ -#include #include +#include +#include // clang-format off #include @@ -31,6 +27,7 @@ // clang-format on namespace tvm { +namespace relay { template using NodeMap = std::unordered_map; @@ -137,5 +134,5 @@ void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, } this->node_to_gv_.insert({node, global}); } - +} // namespace relay } // namespace tvm diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index c41eb0f8ad99..5c5cd6f4b721 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -23,8 +23,8 @@ */ #include "transform.h" -#include #include +#include #include #include #include diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h index 3c638a59f46e..6c88aec8b957 100644 --- a/src/relay/op/tensor/transform.h +++ b/src/relay/op/tensor/transform.h @@ -24,8 +24,8 @@ #ifndef TVM_RELAY_OP_TENSOR_TRANSFORM_H_ #define TVM_RELAY_OP_TENSOR_TRANSFORM_H_ -#include #include +#include #include #include diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h index 6d6d5f70c0c2..740766172ddc 100644 --- a/src/relay/op/type_relations.h +++ b/src/relay/op/type_relations.h @@ -25,7 +25,7 @@ #ifndef TVM_RELAY_OP_TYPE_RELATIONS_H_ #define TVM_RELAY_OP_TYPE_RELATIONS_H_ -#include +#include #include #include diff --git a/src/printer/doc.cc b/src/relay/printer/doc.cc similarity index 98% rename from src/printer/doc.cc rename to src/relay/printer/doc.cc index b06995fb1286..79313c9a587f 100644 --- a/src/printer/doc.cc +++ b/src/relay/printer/doc.cc @@ -30,9 +30,10 @@ #include #include -#include "../support/str_escape.h" +#include "../../support/str_escape.h" namespace tvm { +namespace relay { /*! * \brief Represent a piece of text in the doc. @@ -157,4 +158,5 @@ Doc Doc::Concat(const std::vector& vec, const Doc& sep) { } return seq; } +} // namespace relay } // namespace tvm diff --git a/src/printer/doc.h b/src/relay/printer/doc.h similarity index 97% rename from src/printer/doc.h rename to src/relay/printer/doc.h index dc6ba8952f3e..36f26d9bd24b 100644 --- a/src/printer/doc.h +++ b/src/relay/printer/doc.h @@ -23,8 +23,8 @@ * * Reference: Philip Wadler. A Prettier Printer. Journal of Functional Programming'98 */ -#ifndef TVM_PRINTER_DOC_H_ -#define TVM_PRINTER_DOC_H_ +#ifndef TVM_RELAY_PRINTER_DOC_H_ +#define TVM_RELAY_PRINTER_DOC_H_ #include #include @@ -35,6 +35,7 @@ #include namespace tvm { +namespace relay { /*! * \brief Doc atom node for the ADT. @@ -162,6 +163,6 @@ class Doc { /*! \brief Internal doc stream. */ std::vector stream_; }; - +} // namespace relay } // namespace tvm -#endif // TVM_PRINTER_DOC_H_ +#endif // TVM_RELAY_PRINTER_DOC_H_ diff --git a/src/printer/meta_data.h b/src/relay/printer/meta_data.h similarity index 95% rename from src/printer/meta_data.h rename to src/relay/printer/meta_data.h index ddf0d78087ee..2dfd594de7eb 100644 --- a/src/printer/meta_data.h +++ b/src/relay/printer/meta_data.h @@ -16,13 +16,8 @@ * specific language governing permissions and limitations * under the License. */ - -/*! - * \file tvm/printer/meta_data.h - * \brief Meta data context for printers. - */ -#ifndef TVM_PRINTER_META_DATA_H_ -#define TVM_PRINTER_META_DATA_H_ +#ifndef TVM_RELAY_PRINTER_META_DATA_H_ +#define TVM_RELAY_PRINTER_META_DATA_H_ #include @@ -32,6 +27,7 @@ #include "doc.h" namespace tvm { +namespace relay { /*! * \brief Meta data context for Printers * @@ -140,5 +136,6 @@ class TextMetaDataContext { /*! 
\brief map from meta data into its string representation */ std::unordered_map meta_repr_; }; +} // namespace relay } // namespace tvm -#endif // TVM_PRINTER_META_DATA_H_ +#endif // TVM_RELAY_PRINTER_META_DATA_H_ diff --git a/src/printer/model_library_format_printer.cc b/src/relay/printer/model_library_format_printer.cc similarity index 96% rename from src/printer/model_library_format_printer.cc rename to src/relay/printer/model_library_format_printer.cc index 4220aa00f5a4..76d0f1423d4f 100644 --- a/src/printer/model_library_format_printer.cc +++ b/src/relay/printer/model_library_format_printer.cc @@ -26,7 +26,7 @@ #include "text_printer.h" namespace tvm { -namespace printer { +namespace relay { class ModelLibraryFormatPrinter : public ::tvm::runtime::ModuleNode { public: @@ -69,7 +69,7 @@ class ModelLibraryFormatPrinter : public ::tvm::runtime::ModuleNode { TextPrinter text_printer_; }; -TVM_REGISTER_GLOBAL("tir.ModelLibraryFormatPrinter") +TVM_REGISTER_GLOBAL("relay.ir.ModelLibraryFormatPrinter") .set_body_typed([](bool show_meta_data, const runtime::TypedPackedFunc& annotate, bool show_warning) { @@ -77,5 +77,5 @@ TVM_REGISTER_GLOBAL("tir.ModelLibraryFormatPrinter") make_object(show_meta_data, annotate, show_warning)); }); -} // namespace printer +} // namespace relay } // namespace tvm diff --git a/src/printer/relay_text_printer.cc b/src/relay/printer/relay_text_printer.cc similarity index 99% rename from src/printer/relay_text_printer.cc rename to src/relay/printer/relay_text_printer.cc index 76cac28b07f7..cc86f9b56435 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/relay/printer/relay_text_printer.cc @@ -40,10 +40,10 @@ #include #include -#include "../ir/attr_functor.h" -#include "../parser/meta_ref.h" -#include "../relay/analysis/dependency_graph.h" -#include "../support/scalars.h" +#include "../../ir/attr_functor.h" +#include "../../parser/meta_ref.h" +#include "../../support/scalars.h" +#include "../analysis/dependency_graph.h" #include "doc.h" #include "meta_data.h" #include "text_printer.h" @@ -970,10 +970,5 @@ Doc RelayTextPrinter::PrintSpan(const Span& span) { return doc; } -TVM_REGISTER_GLOBAL("ir.TextPrinter").set_body_typed([](ObjectRef node) { - auto text = AsText(node, false, nullptr); - return text; -}); - } // namespace relay } // namespace tvm diff --git a/src/printer/text_printer.cc b/src/relay/printer/text_printer.cc similarity index 95% rename from src/printer/text_printer.cc rename to src/relay/printer/text_printer.cc index 4d4113fef694..f51f7c3dfa57 100644 --- a/src/printer/text_printer.cc +++ b/src/relay/printer/text_printer.cc @@ -23,7 +23,7 @@ * that can be parsed by a parser. 
*/ -#include "text_printer.h" +#include "./text_printer.h" #include @@ -31,6 +31,7 @@ #include namespace tvm { +namespace relay { static const char* kSemVer = "0.0.5"; @@ -124,8 +125,8 @@ String AsText(const ObjectRef& node, bool show_meta_data, return doc.str(); } -TVM_REGISTER_GLOBAL("ir.PrettyPrint").set_body_typed(PrettyPrint); - -TVM_REGISTER_GLOBAL("ir.AsText").set_body_typed(AsText); +TVM_REGISTER_GLOBAL("relay.ir.PrettyPrint").set_body_typed(PrettyPrint); +TVM_REGISTER_GLOBAL("relay.ir.AsText").set_body_typed(AsText); +} // namespace relay } // namespace tvm diff --git a/src/printer/text_printer.h b/src/relay/printer/text_printer.h similarity index 95% rename from src/printer/text_printer.h rename to src/relay/printer/text_printer.h index 925c2ebf494e..707bbec5ad33 100644 --- a/src/printer/text_printer.h +++ b/src/relay/printer/text_printer.h @@ -23,8 +23,8 @@ * that can be parsed by a parser. */ -#ifndef TVM_PRINTER_TEXT_PRINTER_H_ -#define TVM_PRINTER_TEXT_PRINTER_H_ +#ifndef TVM_RELAY_PRINTER_TEXT_PRINTER_H_ +#define TVM_RELAY_PRINTER_TEXT_PRINTER_H_ #include #include @@ -41,19 +41,16 @@ #include #include -#include "../ir/attr_functor.h" -#include "../relay/analysis/dependency_graph.h" +#include "../../ir/attr_functor.h" +#include "../analysis/dependency_graph.h" #include "doc.h" #include "meta_data.h" -#include "text_printer.h" - -namespace tvm { -class TextPrinter; -} // namespace tvm namespace tvm { namespace relay { +class TextPrinter; + class RelayTextPrinter : public ExprFunctor, public PatternFunctor, public TypeFunctor, @@ -227,14 +224,10 @@ class RelayTextPrinter : public ExprFunctor, DependencyGraph dg_; class AttrPrinter; friend class AttrPrinter; - friend class tvm::TextPrinter; + friend class tvm::relay::TextPrinter; }; -} // namespace relay -} // namespace tvm - -namespace tvm { -namespace tir { +using namespace ::tvm::tir; /*! * \brief Meta node collector @@ -274,7 +267,7 @@ class MetaCollector : public StmtExprVisitor { }; class TIRTextPrinter : public StmtFunctor, - public ExprFunctor, + public tir::ExprFunctor, public TypeFunctor { public: explicit TIRTextPrinter(bool show_meta, TextMetaDataContext* meta) @@ -298,7 +291,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitExpr_(const FloatImmNode* op) override; Doc VisitExpr_(const StringImmNode* op) override; Doc VisitExpr_(const CastNode* op) override; - Doc VisitExpr_(const VarNode* op) override; + Doc VisitExpr_(const tir::VarNode* op) override; Doc VisitExpr_(const AddNode* op) override; Doc VisitExpr_(const SubNode* op) override; Doc VisitExpr_(const MulNode* op) override; @@ -323,8 +316,8 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitExpr_(const LoadNode* op) override; Doc VisitExpr_(const RampNode* op) override; Doc VisitExpr_(const BroadcastNode* op) override; - Doc VisitExpr_(const LetNode* op) override; - Doc VisitExpr_(const CallNode* op) override; + Doc VisitExpr_(const tir::LetNode* op) override; + Doc VisitExpr_(const tir::CallNode* op) override; Doc VisitExpr_(const ShuffleNode* op) override; Doc VisitExpr_(const ReduceNode* op) override; Doc VisitExprDefault_(const Object* op) override; @@ -357,7 +350,7 @@ class TIRTextPrinter : public StmtFunctor, /*! \brief meta collector */ MetaCollector meta_collector_; /*! \brief Map from Var to Doc */ - std::unordered_map memo_var_; + std::unordered_map memo_var_; /*! \brief Map from Buffer to Doc */ std::unordered_map memo_buf_; /*! \brief Map from Buffer to Doc */ @@ -365,7 +358,7 @@ class TIRTextPrinter : public StmtFunctor, /*! 
\brief name allocation map */ std::unordered_map name_alloc_map_; - friend class tvm::TextPrinter; + friend class TextPrinter; Doc VisitType_(const PrimTypeNode* node) override; Doc VisitType_(const PointerTypeNode* node) override; @@ -396,7 +389,7 @@ class TIRTextPrinter : public StmtFunctor, template static Doc PrintConstScalar(DataType dtype, const T& data); Doc GetUniqueName(std::string prefix); - Doc AllocVar(const Var& var); + Doc AllocVar(const tir::Var& var); Doc AllocConst(const AllocateConst& var); Doc AllocBuf(const Buffer& buffer); Doc AllocProducer(const DataProducer& buffer); @@ -412,11 +405,6 @@ class TIRTextPrinter : public StmtFunctor, String AsTVMScriptWithDiagnostic(const ObjectRef& mod, const String& tir_prefix, bool show_meta, runtime::TypedPackedFunc annotate); -} // namespace tir -} // namespace tvm - -namespace tvm { - class TextPrinter { public: explicit TextPrinter(bool show_meta_data, @@ -441,7 +429,7 @@ class TextPrinter { /*! \brief Relay Text Printer */ relay::RelayTextPrinter relay_text_printer_; /*! \brief TIR Text Printer */ - tir::TIRTextPrinter tir_text_printer_; + TIRTextPrinter tir_text_printer_; bool GetVarName(::tvm::tir::Var v, std::string* s) { return tir_text_printer_.GetVarName(v, s); } @@ -472,6 +460,7 @@ class TextPrinter { Doc PrintMod(const IRModule& mod); }; +} // namespace relay } // namespace tvm -#endif // TVM_PRINTER_TEXT_PRINTER_H_ +#endif // TVM_RELAY_PRINTER_TEXT_PRINTER_H_ diff --git a/src/printer/tir_text_printer.cc b/src/relay/printer/tir_text_printer.cc similarity index 97% rename from src/printer/tir_text_printer.cc rename to src/relay/printer/tir_text_printer.cc index 4d74cc6d5a48..eb089bd0d7ed 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/relay/printer/tir_text_printer.cc @@ -36,13 +36,13 @@ #include #include -#include "../tir/transforms/ir_utils.h" +#include "../../tir/transforms/ir_utils.h" #include "doc.h" #include "meta_data.h" #include "text_printer.h" namespace tvm { -namespace tir { +namespace relay { Doc TIRTextPrinter::Print(const ObjectRef& node) { if (!node.defined()) return Doc::Text("(nullptr)"); @@ -93,9 +93,9 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) { memo_buf_.clear(); // ordered vars associated with buffers, for consistent printing - std::vector buffer_vars_ordered; + std::vector buffer_vars_ordered; - for (Var v : op->params) { + for (tir::Var v : op->params) { auto buffer_map_find = op->buffer_map.find(v); if (buffer_map_find != op->buffer_map.end()) { auto map_data = *buffer_map_find; @@ -132,7 +132,7 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) { if (memo_buf_.size() != 0) { Doc buffer_doc; std::vector buffer_docs; - for (const Var& v : buffer_vars_ordered) { + for (const tir::Var& v : buffer_vars_ordered) { const Buffer buf = op->buffer_map[v]; buffer_docs.push_back(BufferNode2Doc(buf.get(), Print(buf))); } @@ -144,7 +144,7 @@ Doc TIRTextPrinter::PrintPrimFunc(const PrimFunc& prim_func) { if (op->buffer_map.size() != 0) { // print buffer_map std::vector buffer_map_doc; - for (const Var& v : buffer_vars_ordered) { + for (const tir::Var& v : buffer_vars_ordered) { const Buffer buf = op->buffer_map[v]; buffer_map_doc.push_back(Print(v) << ": " << Print(buf)); } @@ -302,9 +302,9 @@ Doc TIRTextPrinter::VisitExpr_(const CastNode* op) { return doc; } -Doc TIRTextPrinter::VisitExpr_(const VarNode* op) { - const Var& var = GetRef(op); - return meta_->InMeta(var) ? 
meta_->GetMetaNode(var) : AllocVar(GetRef(op)); +Doc TIRTextPrinter::VisitExpr_(const tir::VarNode* op) { + const tir::Var& var = GetRef(op); + return meta_->InMeta(var) ? meta_->GetMetaNode(var) : AllocVar(GetRef(op)); } #define TVM_DECLARE_TIR_TEXT_PRINTER_BINOP(OpName, OpString) \ @@ -401,13 +401,13 @@ Doc TIRTextPrinter::VisitExpr_(const BroadcastNode* op) { return doc; } -Doc TIRTextPrinter::VisitExpr_(const LetNode* op) { +Doc TIRTextPrinter::VisitExpr_(const tir::LetNode* op) { Doc doc; doc << "let " << Print(op->var) << " = " << Print(op->value) << " in " << Print(op->body); return doc; } -Doc TIRTextPrinter::VisitExpr_(const CallNode* op) { +Doc TIRTextPrinter::VisitExpr_(const tir::CallNode* op) { Doc doc; std::vector func_args; if (auto* ptr_op = op->op.as()) { @@ -771,7 +771,7 @@ Doc TIRTextPrinter::GetUniqueName(std::string prefix) { return Doc::Text(unique_prefix); } -Doc TIRTextPrinter::AllocVar(const Var& var) { +Doc TIRTextPrinter::AllocVar(const tir::Var& var) { const auto& it = memo_var_.find(var); if (it != memo_var_.end()) { return it->second; @@ -831,7 +831,7 @@ Doc TIRTextPrinter::PrintBody(const Stmt& body, bool indent) { return doc; } -bool TIRTextPrinter::GetVarName(Var v, std::string* s) { +bool TIRTextPrinter::GetVarName(tir::Var v, std::string* s) { auto it = memo_var_.find(v); if (it == memo_var_.end()) { return false; @@ -841,5 +841,5 @@ bool TIRTextPrinter::GetVarName(Var v, std::string* s) { return true; } -} // namespace tir +} // namespace relay } // namespace tvm diff --git a/src/printer/tir_text_printer_debug.cc b/src/relay/printer/tir_text_printer_debug.cc similarity index 98% rename from src/printer/tir_text_printer_debug.cc rename to src/relay/printer/tir_text_printer_debug.cc index 6c29558f722c..914d8877d2f7 100644 --- a/src/printer/tir_text_printer_debug.cc +++ b/src/relay/printer/tir_text_printer_debug.cc @@ -29,7 +29,7 @@ #include namespace tvm { -namespace tir { +namespace relay { std::optional span_text(const Span& span) { if (!span.defined()) { @@ -93,5 +93,5 @@ Doc TIRTextPrinterDebug::VisitExpr(const PrimExpr& e) { return TIRTextPrinter::VisitExpr(e); } -} // namespace tir +} // namespace relay } // namespace tvm diff --git a/src/printer/tir_text_printer_debug.h b/src/relay/printer/tir_text_printer_debug.h similarity index 90% rename from src/printer/tir_text_printer_debug.h rename to src/relay/printer/tir_text_printer_debug.h index d0046034cfbf..f7cb7a6554ec 100644 --- a/src/printer/tir_text_printer_debug.h +++ b/src/relay/printer/tir_text_printer_debug.h @@ -23,8 +23,8 @@ * that can be parsed by a parser. 
*/ -#ifndef TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ -#define TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ +#ifndef TVM_RELAY_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ +#define TVM_RELAY_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ #include #include @@ -32,7 +32,7 @@ #include "text_printer.h" namespace tvm { -namespace tir { +namespace relay { class TIRTextPrinterDebug : public TIRTextPrinter { public: @@ -64,7 +64,7 @@ class TIRTextPrinterDebug : public TIRTextPrinter { std::vector> exprs_by_line_; }; -} // namespace tir +} // namespace relay } // namespace tvm -#endif // TVM_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ +#endif // TVM_RELAY_PRINTER_TIR_TEXT_PRINTER_DEBUG_H_ diff --git a/src/printer/tvmscript_printer.cc b/src/relay/printer/tvmscript_printer.cc similarity index 96% rename from src/printer/tvmscript_printer.cc rename to src/relay/printer/tvmscript_printer.cc index c578bc53d3d3..096611095097 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/relay/printer/tvmscript_printer.cc @@ -39,13 +39,15 @@ #include #include -#include "../tir/transforms/ir_utils.h" +#include "../../tir/transforms/ir_utils.h" #include "doc.h" #include "meta_data.h" #include "text_printer.h" namespace tvm { -namespace tir { +namespace relay { + +using namespace tvm::tir; enum class ExprPrecedence : int { /*! \brief Identity(e.g., IntImm, Var) and function call(e.g., floordiv, min) */ @@ -77,14 +79,14 @@ enum class ExprPrecedence : int { */ class BufferUsageFinder : public StmtExprVisitor { public: - static Map> FindUsage(Map> usage, Stmt body) { + static Map> FindUsage(Map> usage, Stmt body) { BufferUsageFinder visitor(std::move(usage)); visitor.VisitStmt(body); return std::move(visitor.usage_); } - void VisitExpr_(const VarNode* op) final { - Var var = GetRef(op); + void VisitExpr_(const tir::VarNode* op) final { + tir::Var var = GetRef(op); if (!usage_.count(var)) { usage_.Set(var, {}); } @@ -107,7 +109,7 @@ class BufferUsageFinder : public StmtExprVisitor { } private: - explicit BufferUsageFinder(Map> usage) : usage_(usage) {} + explicit BufferUsageFinder(Map> usage) : usage_(usage) {} void VisitBuffer(const Buffer& buffer) { if (buffers_visited_.count(buffer.get())) { @@ -124,7 +126,7 @@ class BufferUsageFinder : public StmtExprVisitor { } // The search result. - Map> usage_; + Map> usage_; // The buffers that have been visited so far, to avoid duplicate // entries in the search result. std::unordered_set buffers_visited_; @@ -139,7 +141,7 @@ class BufferUsageFinder : public StmtExprVisitor { * subexpression to decide whether or not parentheses is needed. */ class TVMScriptPrinter : public StmtFunctor, - public ExprFunctor, + public tir::ExprFunctor, public TypeFunctor { public: explicit TVMScriptPrinter(const String& tir_prefix, bool show_meta, @@ -167,20 +169,20 @@ class TVMScriptPrinter : public StmtFunctor, /*! \brief meta data context */ TextMetaDataContext meta_; /*! \brief meta collector */ - MetaCollector meta_collector_; + relay::MetaCollector meta_collector_; /*! \brief map from Function to GlobalVar */ std::unordered_map func2var_; /*! \brief var collector (var defined by For/Loop/Block) */ - std::unordered_set var_not_in_headers_; + std::unordered_set var_not_in_headers_; /*! * \brief buffer collector * (buffer defined in BufferMap, BufferAllocation and MatchBufferRegion) */ std::unordered_set buf_not_in_headers_; /*! \brief Map from Var to thread env name */ - std::unordered_map var_env_map_; + std::unordered_map var_env_map_; /*! 
\brief Map from Var to Doc */ - std::unordered_map memo_var_; + std::unordered_map memo_var_; /*! \brief Map from Buffer to Doc */ std::unordered_map memo_buf_; /*! \brief Map from Buffer to Declaration Doc */ @@ -194,7 +196,7 @@ class TVMScriptPrinter : public StmtFunctor, /*! \brief loop stack without annotations */ std::vector simple_loop_stack_; /*! \brief the maps from loop_vars to the loops */ - std::unordered_map loop_var_map_; + std::unordered_map loop_var_map_; /*! * \brief simple block vars remap from loop vars * simple_remap requires: @@ -210,12 +212,12 @@ class TVMScriptPrinter : public StmtFunctor, * LetStmt or Allocate that generates their data pointer, rather * than in the header. */ - Map> buffer_var_usage_; + Map> buffer_var_usage_; /*! \brief Analyzer to simplify some expressions. */ arith::Analyzer ana_; Doc VisitExpr_(const CastNode* op, ExprPrecedence* out_precedence) override; - Doc VisitExpr_(const VarNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const tir::VarNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const AddNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const SubNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const MulNode* op, ExprPrecedence* out_precedence) override; @@ -243,8 +245,8 @@ class TVMScriptPrinter : public StmtFunctor, Doc VisitExpr_(const LoadNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const RampNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const BroadcastNode* op, ExprPrecedence* out_precedence) override; - Doc VisitExpr_(const LetNode* op, ExprPrecedence* out_precedence) override; - Doc VisitExpr_(const CallNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const tir::LetNode* op, ExprPrecedence* out_precedence) override; + Doc VisitExpr_(const tir::CallNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const ShuffleNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const ReduceNode* op, ExprPrecedence* out_precedence) override; Doc VisitExprDefault_(const Object* op, ExprPrecedence* out_precedence) override; @@ -297,9 +299,9 @@ class TVMScriptPrinter : public StmtFunctor, static Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } Doc GetUniqueName(std::string prefix); - Doc AllocVar(const Var& var); + Doc AllocVar(const tir::Var& var); Doc AllocBuf(const Buffer& buffer); - void TryDeallocVar(const Var& var); + void TryDeallocVar(const tir::Var& var); bool ContainsOptionalInfo(const Stmt& stmt); /*! 
* \brief Check if a buffer declaration satisfies: @@ -338,7 +340,9 @@ class TVMScriptPrinter : public StmtFunctor, * \return A boolean indicating whether the input loop depends on previous loops */ bool DependOnPrevLoops(const ForNode* for_op) { - auto f_check = [&var_map = this->loop_var_map_](const VarNode* v) { return var_map.count(v); }; + auto f_check = [&var_map = this->loop_var_map_](const tir::VarNode* v) { + return var_map.count(v); + }; return UsesVar(for_op->min, f_check) || UsesVar(for_op->extent, f_check); } @@ -494,7 +498,7 @@ Doc TVMScriptPrinter::GetUniqueName(std::string prefix) { return Doc::Text(unique_prefix); } -Doc TVMScriptPrinter::AllocVar(const Var& var) { +Doc TVMScriptPrinter::AllocVar(const tir::Var& var) { const auto& it = memo_var_.find(var); if (it != memo_var_.end()) { return it->second; @@ -522,8 +526,8 @@ Doc TVMScriptPrinter::AllocBufferDeclaration(const Buffer& buf) { if (!buf->strides.empty()) { doc << ", strides=" << Print(buf->strides); } - if (buf->elem_offset->IsInstance()) { - Var elem_offset = Downcast(buf->elem_offset); + if (buf->elem_offset->IsInstance()) { + tir::Var elem_offset = Downcast(buf->elem_offset); if (memo_var_.find(elem_offset) != memo_var_.end()) { doc << ", elem_offset=" << Print(buf->elem_offset); } else { @@ -585,7 +589,7 @@ bool TVMScriptPrinter::ContainsOptionalInfo(const Stmt& stmt) { * \brief Try to dealloc vars out of space and leave the index to coming vars. * \note It is not a necessary step. */ -void TVMScriptPrinter::TryDeallocVar(const Var& var) { +void TVMScriptPrinter::TryDeallocVar(const tir::Var& var) { auto it = memo_var_.find(var); ICHECK(it != memo_var_.end()); std::string print_name = it->second.str(); @@ -695,7 +699,7 @@ Doc TVMScriptPrinter::PrintCommReducer(const CommReducerNode* op) { int n_var = static_cast(op->rhs.size()); doc << tir_prefix_ << ".comm_reducer(lambda "; - for (const Var& v_lhs : op->lhs) { + for (const tir::Var& v_lhs : op->lhs) { doc << Print(v_lhs) << ", "; } for (int i = 0; i < n_var; ++i) { @@ -789,10 +793,10 @@ Doc TVMScriptPrinter::VisitExpr_(const CastNode* op, ExprPrecedence* out_precede return doc; } -Doc TVMScriptPrinter::VisitExpr_(const VarNode* op, ExprPrecedence* out_precedence) { +Doc TVMScriptPrinter::VisitExpr_(const tir::VarNode* op, ExprPrecedence* out_precedence) { *out_precedence = ExprPrecedence::kIdentity; - const Var& var = GetRef(op); - return meta_.InMeta(var) ? meta_.GetMetaNode(var) : AllocVar(GetRef(op)); + const tir::Var& var = GetRef(op); + return meta_.InMeta(var) ? 
meta_.GetMetaNode(var) : AllocVar(GetRef(op)); } bool WillPrintConstScalar(const PrimExpr& expr) { @@ -938,7 +942,7 @@ Doc TVMScriptPrinter::VisitExpr_(const BroadcastNode* op, ExprPrecedence* out_pr return doc; } -Doc TVMScriptPrinter::VisitExpr_(const LetNode* op, ExprPrecedence* out_precedence) { +Doc TVMScriptPrinter::VisitExpr_(const tir::LetNode* op, ExprPrecedence* out_precedence) { *out_precedence = ExprPrecedence::kIdentity; Doc doc; doc << tir_prefix_ << ".let(" << Print(op->var) << ", " << Print(op->value) << ", " @@ -946,7 +950,7 @@ Doc TVMScriptPrinter::VisitExpr_(const LetNode* op, ExprPrecedence* out_preceden return doc; } -Doc TVMScriptPrinter::VisitExpr_(const CallNode* op, ExprPrecedence* out_precedence) { +Doc TVMScriptPrinter::VisitExpr_(const tir::CallNode* op, ExprPrecedence* out_precedence) { *out_precedence = ExprPrecedence::kIdentity; Doc doc; if (auto* ptr_op = op->op.as()) { @@ -1090,7 +1094,7 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferRealizeNode* op) { namespace { bool IsAllocateDeclBufferPattern(const AllocateNode* allocate) { - const Var& buffer_var = allocate->buffer_var; + const tir::Var& buffer_var = allocate->buffer_var; const DeclBufferNode* decl_buffer = allocate->body.as(); if (!decl_buffer) { return false; @@ -1468,8 +1472,8 @@ Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) { auto is_simple_remap = [this, &expr_equal](const IterVar& iter_var, const PrimExpr& value) -> bool { if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) return false; - if (!value->IsInstance()) return false; - const Var& var = Downcast(value); + if (!value->IsInstance()) return false; + const tir::Var& var = Downcast(value); auto it = loop_var_map_.find(var.get()); return it != loop_var_map_.end() && expr_equal(it->second->min, iter_var->dom->min) && expr_equal(it->second->extent, iter_var->dom->extent); @@ -1763,7 +1767,7 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { } // print var declaration Doc header_var; - std::vector vars; + std::vector vars; for (const auto& it : memo_var_) { if (var_not_in_headers_.find(it.first.get()) == var_not_in_headers_.end()) { vars.push_back(it.first.get()); @@ -1777,20 +1781,21 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { } } if (!vars.empty()) { - std::sort(vars.begin(), vars.end(), [&](const VarNode* a, const VarNode* b) { - return memo_var_[GetRef(a)].str() < memo_var_[GetRef(b)].str(); + std::sort(vars.begin(), vars.end(), [&](const tir::VarNode* a, const tir::VarNode* b) { + return memo_var_[GetRef(a)].str() < memo_var_[GetRef(b)].str(); }); for (const auto& var : vars) { - auto type = GetRef(var)->type_annotation; + auto type = GetRef(var)->type_annotation; if (auto* ptr_type = type.as()) { auto* prim_type = ptr_type->element_type.as(); ICHECK(prim_type); - header_var << Doc::NewLine() << Print(GetRef(var)) << " = " << tir_prefix_ + header_var << Doc::NewLine() << Print(GetRef(var)) << " = " << tir_prefix_ << ".buffer_var("; header_var << PrintDType(prim_type->dtype) << ", " << Doc::StrLiteral(ptr_type->storage_scope) << ")"; } else { - header_var << Doc::NewLine() << Print(GetRef(var)) << " = " << tir_prefix_ << ".var("; + header_var << Doc::NewLine() << Print(GetRef(var)) << " = " << tir_prefix_ + << ".var("; header_var << PrintDType(var->dtype) << ")"; } } @@ -2013,5 +2018,5 @@ String AsTVMScriptWithDiagnostic(const ObjectRef& mod, const String& tir_prefix, 
TVM_REGISTER_GLOBAL("script.AsTVMScriptWithDiagnostic").set_body_typed(AsTVMScriptWithDiagnostic); -} // namespace tir +} // namespace relay } // namespace tvm diff --git a/src/relay/transforms/merge_compiler_regions.cc b/src/relay/transforms/merge_compiler_regions.cc index d18c17e63ca1..d70c7480e9e5 100644 --- a/src/relay/transforms/merge_compiler_regions.cc +++ b/src/relay/transforms/merge_compiler_regions.cc @@ -30,9 +30,9 @@ * as external functions. */ -#include #include #include +#include #include #include #include diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index f6cdf6d1ca18..32ca2878fdc9 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -29,10 +29,10 @@ * external functions, and they will use the provided compiler for codegen. */ -#include #include #include #include +#include #include #include #include diff --git a/src/script/printer/printer.cc b/src/script/printer/printer.cc index 9ebdcb1e99b3..878b380a3717 100644 --- a/src/script/printer/printer.cc +++ b/src/script/printer/printer.cc @@ -23,18 +23,11 @@ namespace tvm { namespace script { namespace printer { -String Script(ObjectRef obj, int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) { - return DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root()), indent_spaces, - print_line_numbers, num_context_lines, path_to_underline); -} - Default* Default::Instance() { static Default inst; return &inst; } -TVM_REGISTER_GLOBAL("script.printer.Script").set_body_typed(Script); TVM_REGISTER_GLOBAL("script.printer.DefaultIRPrefix") .set_body_typed([](std::string ir, std::string prefix) { Default::Prefix(ir) = prefix; }); TVM_REGISTER_GLOBAL("script.printer.DefaultBufferDType") diff --git a/src/tir/schedule/error.cc b/src/tir/schedule/error.cc index 55d751c3311e..1aae0202ac42 100644 --- a/src/tir/schedule/error.cc +++ b/src/tir/schedule/error.cc @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ -#include "../../printer/text_printer.h" #include "./utils.h" namespace tvm { @@ -52,10 +51,11 @@ String ScheduleError::RenderReport(const String& primitive) const { } return it->second; }); - + const auto* f = runtime::Registry::Get("script.AsTVMScriptWithDiagnostic"); + ICHECK(f != nullptr); os << "ScheduleError: An error occurred in the schedule primitive '" << primitive << "'.\n\nThe IR with diagnostic is:\n" - << AsTVMScriptWithDiagnostic(mod, "T", false, annotate); + << ((*f)(mod, "T", false, annotate).operator String()); // print error message os << "Error message: " << msg; diff --git a/src/tir/transforms/install_debug_spans.cc b/src/tir/transforms/install_debug_spans.cc index bc9002ee841f..c97070e1bf89 100644 --- a/src/tir/transforms/install_debug_spans.cc +++ b/src/tir/transforms/install_debug_spans.cc @@ -30,7 +30,7 @@ #include #include -#include "../../printer/tir_text_printer_debug.h" +#include "../../relay/printer/tir_text_printer_debug.h" namespace tvm { namespace tir { @@ -42,7 +42,7 @@ Stmt DebugInfoInstaller::InstallInfo(const std::string& name, const Stmt& stmt) DebugInfoInstaller::DebugInfoInstaller(const Stmt& stmt, const std::string& filename) { // Determine the line that each stmt/expr will be printed on - tvm::tir::TIRTextPrinterDebug printer(false); + tvm::relay::TIRTextPrinterDebug printer(false); // Fill in the stmts and exprs' line info auto result = printer.Print(stmt).str(); diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index 5ea6d7e5de6a..08fa01f0b39b 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -14,15 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+from typing import Union + import numpy as np import pytest - import tvm -import tvm.testing -from tvm import relay import tvm.relay.testing +import tvm.testing from numpy import isclose -from typing import Union +from tvm import relay SEMVER = '#[version = "0.0.5"]\n' @@ -74,7 +74,7 @@ def graph_equal(lhs, rhs): def roundtrip_expr(expr): - text = tvm.relay.Expr.astext(expr, show_meta_data=False) + text = expr.astext() x = tvm.parser.parse_expr(text) assert_graph_equal(x, expr) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py index bb9602279404..f40d9427490d 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py @@ -885,5 +885,4 @@ def max_pool_blocked_compute(height, width, channel): if __name__ == "__main__": - # tvm.testing.main() - test_cache_read_specify_consumer() + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index d4ae84a556d7..2806c7b2fc52 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -343,7 +343,6 @@ def test_prim_func(): func = tvm.tir.PrimFunc([x, y, b], stmt) # make sure we can print - func.astext() assert func.buffer_map[func.params[2]].same_as(b) assert len(func.buffer_map) == 1 @@ -399,130 +398,5 @@ def test_intimm_cond(): assert x == 1 -def test_block_blockrealize(): - x = tvm.tir.Var("x", "int32") - y = tvm.tir.Var("y", "int32") - vx = tvm.tir.IterVar((16, 16), "vx", 0) - vx_var = vx.var - vy = tvm.tir.IterVar((16, 16), "vy", 2) - vy_var = vy.var - A = tvm.tir.decl_buffer((16), "float32") - B = tvm.tir.decl_buffer((16, 16), "float32") - alloc_buffer = tvm.tir.decl_buffer((16, 16), "float32") - match_buffer = tvm.tir.decl_buffer((16, 16), "float32") - init_body = tvm.tir.BufferStore(A, 0.0, [vx_var]) - body = tvm.tir.BufferStore( - A, - tvm.tir.BufferLoad(A, [vx_var]) + tvm.tir.BufferLoad(B, [vx_var, vy_var]), - [vx_var], - ) - reads = [ - tvm.tir.BufferRegion( - B, [tvm.ir.Range.from_min_extent(vx_var, 1), tvm.ir.Range.from_min_extent(vy_var, 1)] - ) - ] - writes = [tvm.tir.BufferRegion(A, [tvm.ir.Range.from_min_extent(vx_var, 1)])] - block_match_buffer = tvm.tir.MatchBufferRegion( - match_buffer, tvm.tir.BufferRegion(B, [tvm.ir.Range(0, 16), tvm.ir.Range(0, 16)]) - ) - - block = tvm.tir.Block( - [vx, vy], - reads, - writes, - "block", - body, - init=init_body, - alloc_buffers=[alloc_buffer], - match_buffers=[block_match_buffer], - annotations={"attr_key": "attr_value"}, - ) - - # Checking Block - assert isinstance(block, tvm.tir.Block) - # Checking iter_vars - assert block.iter_vars[0] == vx - assert block.iter_vars[1] == vy - # Checking reads/writes region - assert isinstance(block.reads[0], tvm.tir.BufferRegion) - assert block.reads[0].buffer == B - assert block.reads[0].region[0].min == vx_var - assert block.reads[0].region[1].min == vy_var - assert isinstance(block.writes[0], tvm.tir.BufferRegion) - assert block.writes[0].buffer == A - assert block.writes[0].region[0].min == vx_var - assert block.writes[0].region[0].extent == 1 - # Checking name_hint - assert block.name_hint == "block" - # Checking body - assert block.body == body - # Checking init - assert block.init == init_body - # Checking alloc_buffers - assert block.alloc_buffers[0] == alloc_buffer - # Checking match_buffers - assert block.match_buffers[0].buffer == match_buffer - assert 
isinstance(block.match_buffers[0].source, tvm.tir.BufferRegion) - assert block.match_buffers[0].source.buffer == B - assert block.match_buffers[0].source.region[0].min == 0 - assert block.match_buffers[0].source.region[0].extent == 16 - - # Checking BlockRealize - block_realize = tvm.tir.BlockRealize([x, y], tvm.tir.const(True, "bool"), block) - assert isinstance(block_realize, tvm.tir.BlockRealize) - assert block_realize.iter_values[0] == x - assert block_realize.iter_values[1] == y - assert block_realize.predicate == tvm.tir.const(True, "bool") - assert block_realize.block == block - - # make sure we can print using ReprPrinter - str(block) - str(block_realize) - # make sure we can print using TIRTextPrinter - func = tvm.tir.PrimFunc([], block_realize) - output = func.astext() - assert output.find("meta[tir.BlockRealise]") == -1 - assert output.find("bind") != -1 - assert output.find("reads") != -1 - assert output.find("writes") != -1 - assert output.find("alloc_buffer") != -1 - assert output.find("match_buffer") != -1 - assert output.find("attr") != -1 - assert output.find("with init()") != -1 - - -def test_tir_allocate(): - dtype = "int8" - storage_scope = "global" - ptype = tvm.ir.PointerType(tvm.ir.PrimType(dtype), storage_scope) - a = te.var("buffer", ptype) - allocate = tvm.tir.Allocate( - buffer_var=a, - dtype=dtype, - extents=[2, 2], - condition=tvm.get_global_func("tir.const_true")(dtype, None), - body=tvm.tir.Evaluate(2 + 1), - annotations={ - "attr1": "foo", - "attr2": "bar", - }, - ) - assert allocate.buffer_var == a - assert allocate.dtype == "int8" - assert list(allocate.extents) == [2, 2] - assert allocate.annotations["attr1"] == "foo" - assert allocate.annotations["attr2"] == "bar" - - # make sure we can print using TIRTextPrinter - func = tvm.tir.PrimFunc([], allocate) - output = func.astext() - assert ( - output.find( - 'allocate(buffer: Pointer(global int8), int8, [2, 2]), storage_scope = global, annotations = {"attr2": "bar", "attr1": "foo"})' - ) - != -1 - ) - - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_lower_warp_memory.py b/tests/python/unittest/test_tir_transform_lower_warp_memory.py index 48af3ebaf529..d4abc26bb204 100644 --- a/tests/python/unittest/test_tir_transform_lower_warp_memory.py +++ b/tests/python/unittest/test_tir_transform_lower_warp_memory.py @@ -14,14 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np +import pytest import tvm +import tvm.testing from tvm import te from tvm.contrib.nvcc import have_fp16 -import numpy as np -import tvm.testing -import pytest - @tvm.testing.requires_cuda def test_lower_warp_memory_local_scope(): @@ -320,7 +319,7 @@ def test_lower_warp_memory_same_thread(): fdevice = tvm.tir.transform.SplitHostDevice()(mod)["f_kernel0"] mod = tvm.IRModule.from_expr(fdevice) fdevice = tvm.tir.transform.LowerWarpMemory()(mod)["f_kernel0"] - assert "tvm_warp_shuffle" not in fdevice.astext() + assert "tvm_warp_shuffle" not in fdevice.script() @tvm.testing.requires_cuda diff --git a/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py b/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py deleted file mode 100644 index 1bccb8188c9d..000000000000 --- a/tests/python/unittest/test_tvmscript_printer_syntax_sugar.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import pytest -import tvm.testing -from tvm.script.parser import tir as T -from tvm.script import script - - -def _test(obj, expected: str): - assert script(obj).strip() == expected.strip() - - -def test_remap(): - @T.prim_func - def block_with_remap_implicitly(): - for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): - with T.block("update"): - v0 = T.axis.spatial(128, i0 + 1) - v1 = T.axis.spatial(128, i1) - v2 = T.axis.reduce(128, i2) - v3 = T.axis.spatial(128, i3 - 1) - v4 = T.axis.reduce(128, i4) - v5 = T.axis.spatial(128, i5) - pass - - @T.prim_func - def block_with_remap_explicitly(): - for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): - with T.block("update"): - v0 = T.axis.spatial(128, i0 + 1) - v1, v2 = T.axis.remap("SR", [i1, i2]) - v3 = T.axis.spatial(128, i3 - 1) - v4, v5 = T.axis.remap("RS", [i4, i5]) - pass - - expected_output = """@T.prim_func -def main(): - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): - with T.block("update"): - v0 = T.axis.spatial(128, i0 + 1) - v1, v2 = T.axis.remap("SR", [i1, i2]) - v3 = T.axis.spatial(128, i3 - 1) - v4, v5 = T.axis.remap("RS", [i4, i5]) - T.reads() - T.writes() - T.evaluate(0)""" - _test(block_with_remap_implicitly, expected_output) - _test(block_with_remap_explicitly, expected_output) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index 9c15fbc88949..d62a1cd12c28 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -598,6 +598,47 @@ def test_tuple_type(): _assert_print(obj, "T.Tuple(T.float32, T.int32)") +def test_remap(): + from tvm.script import tir as T + + @T.prim_func + def block_with_remap_implicitly(): + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1 = T.axis.spatial(128, i1) + v2 = T.axis.reduce(128, i2) + v3 = T.axis.spatial(128, i3 - 1) + v4 = T.axis.reduce(128, i4) + v5 = T.axis.spatial(128, i5) + + @T.prim_func + def block_with_remap_explicitly(): + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1, v2 = T.axis.remap("SR", [i1, i2]) + v3 = T.axis.spatial(128, i3 - 1) + v4, v5 = T.axis.remap("RS", [i4, i5]) + + expected_output = """@T.prim_func +def main(): + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1, v2 = T.axis.remap("SR", [i1, i2]) + v3 = T.axis.spatial(128, i3 - 1) + v4, v5 = T.axis.remap("RS", [i4, i5]) + T.reads() + T.writes() + 
T.evaluate(0)""" + _assert_print(block_with_remap_explicitly, expected_output) + _assert_print(block_with_remap_implicitly, expected_output) + + if __name__ == "__main__": test_prim_func() test_block_realize() @@ -639,3 +680,4 @@ def test_tuple_type(): test_prim_type() test_pointer_type() test_tuple_type() + test_remap() From 5bb7344eb59b4967f0d9152cd7815670c71995cd Mon Sep 17 00:00:00 2001 From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com> Date: Wed, 18 Jan 2023 18:58:07 +0400 Subject: [PATCH 191/286] [microNPU] Upgrade to 22.08 version of Arm(R) Ethos(TM)-U NPU drivers (#13529) This PR upgrades the Arm(R) Ethos(TM)-U NPU drivers to version 22.08. Now the tests are run on U55 and U65 before that only on U55. --- apps/microtvm/cmsisnn/Makefile | 19 +++++++---- apps/microtvm/cmsisnn/src/demo_bare_metal.c | 4 +-- apps/microtvm/ethosu/Makefile | 24 ++++++++------ apps/microtvm/ethosu/src/demo_bare_metal.c | 4 +-- apps/microtvm/ethosu/src/demo_freertos.c | 4 +-- ci/jenkins/docker-images.ini | 2 +- .../ubuntu_install_ethosu_driver_stack.sh | 12 ++++--- .../how_to/work_with_microtvm/micro_ethosu.py | 4 +-- python/tvm/micro/testing/aot_test_utils.py | 8 ++--- tests/python/contrib/test_cmsisnn/utils.py | 4 +-- tests/python/contrib/test_ethosu/infra.py | 4 +-- tests/python/relay/aot/corstone300.mk | 33 +++++++++++-------- 12 files changed, 71 insertions(+), 51 deletions(-) diff --git a/apps/microtvm/cmsisnn/Makefile b/apps/microtvm/cmsisnn/Makefile index 2fc4d4fa06c9..00c4893317e1 100644 --- a/apps/microtvm/cmsisnn/Makefile +++ b/apps/microtvm/cmsisnn/Makefile @@ -35,15 +35,14 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ -I${PWD}/include \ - -I${CORSTONE_300_PATH} \ + -I${ETHOSU_PLATFORM_PATH}/drivers/uart/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ -I${CMSIS_PATH}/CMSIS-NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include -CMSIS_NN_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ - -DTARGET_CPU=cortex-m55 \ - -DBUILD_CMSIS_NN_FUNCTIONS=YES +CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ + -DTARGET_CPU=cortex-m55 PKG_LDFLAGS = -lm -specs=nosys.specs -static -T corstone300.ld $(ifeq VERBOSE,1) @@ -57,7 +56,7 @@ CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/*/*.c) -UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) +CORSTONE_300_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo @@ -89,9 +88,15 @@ ${BUILD_DIR}/libcmsis_nn.a: $(CMSIS_NN_SRCS) $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_nn.a) $(abspath $(BUILD_DIR))/libcmsis_nn/*.o $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_nn.a) +# Build UART driver +${BUILD_DIR}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd ${ETHOSU_PLATFORM_PATH}/drivers/uart && $(CMAKE) -B $(abspath $(BUILD_DIR)/ethosu_core_platform) $(CMAKE_FLAGS) + $(QUIET)cd $(abspath $(BUILD_DIR)/ethosu_core_platform) && $(MAKE) + # Build demo application -$(BUILD_DIR)/demo: $(DEMO_MAIN) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ - ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/libcmsis_nn.a 
+$(BUILD_DIR)/demo: $(DEMO_MAIN) $(CORSTONE_300_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o \ + ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/libcmsis_nn.a ${BUILD_DIR}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) diff --git a/apps/microtvm/cmsisnn/src/demo_bare_metal.c b/apps/microtvm/cmsisnn/src/demo_bare_metal.c index f17fe859f219..80b298d8b2d8 100644 --- a/apps/microtvm/cmsisnn/src/demo_bare_metal.c +++ b/apps/microtvm/cmsisnn/src/demo_bare_metal.c @@ -21,14 +21,14 @@ #include #include -#include "uart.h" +#include "uart_stdout.h" // Header files generated by convert_image.py #include "inputs.h" #include "outputs.h" int main(int argc, char** argv) { - uart_init(); + UartStdOutInit(); printf("Starting Demo\n"); printf("Running detection inference\n"); diff --git a/apps/microtvm/ethosu/Makefile b/apps/microtvm/ethosu/Makefile index 630a2082473d..5a20efc5db26 100644 --- a/apps/microtvm/ethosu/Makefile +++ b/apps/microtvm/ethosu/Makefile @@ -36,20 +36,18 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${STANDALONE_CRT_PATH}/include \ -I${STANDALONE_CRT_PATH}/src/runtime/crt/include \ -I${PWD}/include \ - -I${CORSTONE_300_PATH} \ - -I${ETHOSU_PATH}/core_driver/include \ + -I${ETHOSU_DRIVER_PATH}/include \ + -I${ETHOSU_PLATFORM_PATH}/drivers/uart/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ -I${CMSIS_PATH}/CMSIS-NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -I$(abspath $(BUILD_DIR))/codegen/host/include \ -DETHOSU_TEST_RUNNER_TOL=${ETHOSU_TEST_RUNNER_TOL} -DRIVER_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ - -DETHOSU_LOG_SEVERITY=debug \ - -DCMAKE_SYSTEM_PROCESSOR=cortex-m55 -CMSIS_NN_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ - -DTARGET_CPU=cortex-m55 \ - -DBUILD_CMSIS_NN_FUNCTIONS=YES +CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(abspath $(BUILD_DIR))/../arm-none-eabi-gcc.cmake \ + -DTARGET_CPU=cortex-m55 +DRIVER_CMAKE_FLAGS = $(CMAKE_FLAGS) \ + -DETHOSU_LOG_SEVERITY=debug PKG_LDFLAGS = -lm -specs=nosys.specs -static -T corstone300.ld $(ifeq VERBOSE,1) @@ -79,7 +77,7 @@ CODEGEN_SRCS = $(wildcard $(abspath $(BUILD_DIR))/codegen/host/src/*.c) CODEGEN_OBJS = $(subst .c,.o,$(CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(wildcard ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) CMSIS_NN_SOFTMAX_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/SoftmaxFunctions/*.c) -UART_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) +CORSTONE_300_SRCS = $(wildcard ${CORSTONE_300_PATH}/*.c) demo: $(BUILD_DIR)/demo @@ -117,7 +115,13 @@ ${BUILD_DIR}/libcmsis_nn_softmax.a: $(CMSIS_NN_SOFTMAX_SRCS) $(QUIET)$(AR) -cr $(abspath $(BUILD_DIR)/libcmsis_nn_softmax.a) $(abspath $(BUILD_DIR))/libcmsis_nn/*.o $(QUIET)$(RANLIB) $(abspath $(BUILD_DIR)/libcmsis_nn_softmax.a) -$(BUILD_DIR)/demo: $(DEMO_MAIN) src/tvm_ethosu_runtime.c $(FREERTOS_SOURCES) $(UART_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a ${BUILD_DIR}/libcmsis_nn_softmax.a +# Build UART driver +${BUILD_DIR}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd ${ETHOSU_PLATFORM_PATH}/drivers/uart && $(CMAKE) -B $(abspath $(BUILD_DIR)/ethosu_core_platform) $(CMAKE_FLAGS) + $(QUIET)cd $(abspath 
$(BUILD_DIR)/ethosu_core_platform) && $(MAKE) + +$(BUILD_DIR)/demo: $(DEMO_MAIN) src/tvm_ethosu_runtime.c $(FREERTOS_SOURCES) $(CORSTONE_300_SRCS) $(BUILD_DIR)/stack_allocator.o $(BUILD_DIR)/crt_backend_api.o ${BUILD_DIR}/libcodegen.a ${BUILD_DIR}/libcmsis_startup.a ${BUILD_DIR}/ethosu_core_driver/libethosu_core_driver.a ${BUILD_DIR}/libcmsis_nn_softmax.a ${BUILD_DIR}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a $(QUIET)mkdir -p $(@D) $(QUIET)$(CC) $(PKG_CFLAGS) $(FREERTOS_FLAGS) -o $@ $^ $(PKG_LDFLAGS) diff --git a/apps/microtvm/ethosu/src/demo_bare_metal.c b/apps/microtvm/ethosu/src/demo_bare_metal.c index febc2121299f..1bef90cfb301 100644 --- a/apps/microtvm/ethosu/src/demo_bare_metal.c +++ b/apps/microtvm/ethosu/src/demo_bare_metal.c @@ -21,7 +21,7 @@ #include #include "ethosu_mod.h" -#include "uart.h" +#include "uart_stdout.h" // Header files generated by convert_image.py and convert_labels.py #include "inputs.h" @@ -31,7 +31,7 @@ int abs(int v) { return v * ((v > 0) - (v < 0)); } int main(int argc, char** argv) { - uart_init(); + UartStdOutInit(); printf("Starting Demo\n"); EthosuInit(); diff --git a/apps/microtvm/ethosu/src/demo_freertos.c b/apps/microtvm/ethosu/src/demo_freertos.c index 4fa363a50e4d..e59d7aeaccf5 100644 --- a/apps/microtvm/ethosu/src/demo_freertos.c +++ b/apps/microtvm/ethosu/src/demo_freertos.c @@ -24,7 +24,7 @@ #include #include "ethosu_mod.h" -#include "uart.h" +#include "uart_stdout.h" // Header files generated by convert_image.py and convert_labels.py #include "inputs.h" @@ -46,7 +46,7 @@ static QueueHandle_t xQueue = NULL; int main(void) { // Platform UART - uart_init(); + UartStdOutInit(); // NPU EthosuInit(); diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index d1e34487bd61..53ad2092ea4f 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -18,7 +18,7 @@ # This data file is read during when Jenkins runs job to determine docker images. [jenkins] ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea -ci_cortexm: tlcpack/ci-cortexm:20230111-165944-a9c6f137d +ci_cortexm: tlcpack/ci-cortexm:20230116-133924-dad13d1c1 ci_cpu: tlcpack/ci-cpu:20230110-070003-d00168ffb ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea diff --git a/docker/install/ubuntu_install_ethosu_driver_stack.sh b/docker/install/ubuntu_install_ethosu_driver_stack.sh index 8bc6b733edc8..6f4598258eee 100755 --- a/docker/install/ubuntu_install_ethosu_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosu_driver_stack.sh @@ -23,7 +23,7 @@ set -o pipefail fvp_dir="/opt/arm/FVP_Corstone_SSE-300" cmake_dir="/opt/arm/cmake" ethosu_dir="/opt/arm/ethosu" -ethosu_driver_ver="21.11" +ethosu_driver_ver="22.08" mkdir -p /opt/arm @@ -80,9 +80,13 @@ git clone --branch ${ethosu_driver_ver} "https://review.mlplatform.org/ml/ethos- git clone --branch ${ethosu_driver_ver} "https://review.mlplatform.org/ml/ethos-u/ethos-u-core-platform" core_platform # Build Driver -mkdir ${ethosu_dir}/core_driver/build && cd ${ethosu_dir}/core_driver/build -cmake -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DETHOSU_LOG_SEVERITY=debug -DTARGET_CPU=cortex-m55 .. 
-make +NPU_VARIANTS=("u55" "u65") +for i in ${NPU_VARIANTS[*]} +do + mkdir ${ethosu_dir}/core_driver/build_${i} && cd ${ethosu_dir}/core_driver/build_${i} + cmake -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DETHOSU_LOG_SEVERITY=debug -DTARGET_CPU=cortex-m55 -DETHOSU_TARGET_NPU_CONFIG=ethos-${i}-128 .. + make +done # Build NN Library mkdir ${CMSIS_PATH}/CMSIS-NN/build/ && cd ${CMSIS_PATH}/CMSIS-NN/build/ diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py index f257507bb5a5..74a9d59d77c1 100644 --- a/gallery/how_to/work_with_microtvm/micro_ethosu.py +++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py @@ -380,7 +380,7 @@ # #include # # #include "ethosu_mod.h" -# #include "uart.h" +# #include "uart_stdout.h" # # // Header files generated by convert_image.py and convert_labels.py # #include "inputs.h" @@ -390,7 +390,7 @@ # int abs(int v) { return v * ((v > 0) - (v < 0)); } # # int main(int argc, char** argv) { -# uart_init(); +# UartStdOutInit(); # printf("Starting Demo\n"); # EthosuInit(); # diff --git a/python/tvm/micro/testing/aot_test_utils.py b/python/tvm/micro/testing/aot_test_utils.py index 89c08395deb7..06cd0f1c9ea4 100644 --- a/python/tvm/micro/testing/aot_test_utils.py +++ b/python/tvm/micro/testing/aot_test_utils.py @@ -41,9 +41,9 @@ AOT_CORSTONE300_RUNNER = AOTTestRunner( makefile="corstone300", prologue=""" - uart_init(); + UartStdOutInit(); """, - includes=["uart.h"], + includes=["uart_stdout.h"], pass_config={ "relay.ext.cmsisnn.options": { "mcpu": "cortex-m55", @@ -54,9 +54,9 @@ AOT_USMP_CORSTONE300_RUNNER = AOTTestRunner( makefile="corstone300", prologue=""" - uart_init(); + UartStdOutInit(); """, - includes=["uart.h"], + includes=["uart_stdout.h"], pass_config={ "relay.ext.cmsisnn.options": { "mcpu": "cortex-m55", diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py index 1ec3e609f1a3..74d9686a784e 100644 --- a/tests/python/contrib/test_cmsisnn/utils.py +++ b/tests/python/contrib/test_cmsisnn/utils.py @@ -274,9 +274,9 @@ def create_test_runner(compiler_cpu="cortex-m55", cpu_flags=""): return AOTTestRunner( makefile="corstone300", prologue=""" - uart_init(); + UartStdOutInit(); """, - includes=["uart.h"], + includes=["uart_stdout.h"], pass_config={ "relay.ext.cmsisnn.options": { "mcpu": compiler_cpu + cpu_flags, diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index b2bbcd377b84..efab6e6911b9 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -133,7 +133,7 @@ def create_test_runner( ethosu_variant = ethosu_variant.upper() prologue = """ - uart_init(); + UartStdOutInit(); EthosuInit(); struct ethosu_driver* ethos_u = ethosu_reserve_driver(); @@ -158,7 +158,7 @@ def create_test_runner( epilogue=""" ethosu_release_driver(ethos_u); """, - includes=["uart.h", "ethosu_55.h", "ethosu_mod.h", "hard_fault.h"], + includes=["uart_stdout.h", "ethosu_55.h", "ethosu_mod.h", "hard_fault.h"], parameters={ "ETHOSU_TEST_ROOT": test_root, "NPU_MACS": ethosu_macs, diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk index 45d93ab493ed..61373ec3efba 100644 --- a/tests/python/relay/aot/corstone300.mk +++ b/tests/python/relay/aot/corstone300.mk @@ -41,7 +41,8 @@ DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core ETHOSU_PATH=/opt/arm/ethosu DRIVER_PATH=${ETHOSU_PATH}/core_driver CMSIS_PATH=${ETHOSU_PATH}/cmsis 
-PLATFORM_PATH=${ETHOSU_PATH}/core_platform/targets/corstone-300 +ETHOSU_PLATFORM_PATH=/opt/arm/ethosu/core_platform +CORSTONE_300_PATH = ${ETHOSU_PLATFORM_PATH}/targets/corstone-300 PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99 CMAKE = /opt/arm/cmake/bin/cmake CC = arm-none-eabi-gcc @@ -53,16 +54,15 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I$(build_dir)/../include \ -I${TVM_ROOT}/src/runtime/contrib/ethosu/bare_metal \ -I$(CODEGEN_ROOT)/host/include \ - -I${PLATFORM_PATH} \ + -I${ETHOSU_PLATFORM_PATH}/drivers/uart/include \ -I${DRIVER_PATH}/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ -I${CMSIS_PATH}/CMSIS-NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ -isystem$(STANDALONE_CRT_DIR)/include -DRIVER_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(ETHOSU_TEST_ROOT)/arm-none-eabi-gcc.cmake \ - -DETHOSU_LOG_SEVERITY=debug \ - -DCMAKE_SYSTEM_PROCESSOR=cortex-m55 +CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=${TVM_ROOT}/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake \ + -DCMAKE_SYSTEM_PROCESSOR=${MCPU} PKG_LDFLAGS = -lm -specs=nosys.specs -static -T ${AOT_TEST_ROOT}/corstone300.ld @@ -79,10 +79,11 @@ C_CODEGEN_OBJS = $(subst .c,.o,$(C_CODEGEN_SRCS)) CC_CODEGEN_OBJS = $(subst .cc,.o,$(CC_CODEGEN_SRCS)) CMSIS_STARTUP_SRCS = $(shell find ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c) CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS-NN/Source/*/*.c) -UART_SRCS = $(shell find ${PLATFORM_PATH}/*.c) +CORSTONE_300_SRCS = $(shell find ${CORSTONE_300_PATH}/*.c) ifdef ETHOSU_TEST_ROOT -ETHOSU_DRIVER_LIBS = $(wildcard ${DRIVER_PATH}/build/*.a) +NPU=$(shell echo "${NPU_VARIANT}" | tr '[:upper:]' '[:lower:]') +ETHOSU_DRIVER_LIBS = ${DRIVER_PATH}/build_${NPU}/*.a ETHOSU_RUNTIME=$(build_dir)/tvm_ethosu_runtime.o ETHOSU_INCLUDE=-I$(ETHOSU_TEST_ROOT) endif @@ -118,13 +119,19 @@ ${build_dir}/libcmsis_nn.a: $(CMSIS_NN_SRCS) $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_nn.a) $(abspath $(build_dir))/libcmsis_nn/*.o $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_nn.a) -${build_dir}/libuart.a: $(UART_SRCS) - $(QUIET)mkdir -p $(abspath $(build_dir)/libuart) - $(QUIET)cd $(abspath $(build_dir)/libuart) && $(CC) -c $(PKG_CFLAGS) $^ - $(QUIET)$(AR) -cr $(abspath $(build_dir)/libuart.a) $(abspath $(build_dir))/libuart/*.o - $(QUIET)$(RANLIB) $(abspath $(build_dir)/libuart.a) +${build_dir}/libcorstone.a: $(CORSTONE_300_SRCS) + $(QUIET)mkdir -p $(abspath $(build_dir)/libcorstone) + $(QUIET)cd $(abspath $(build_dir)/libcorstone) && $(CC) -c $(PKG_CFLAGS) $^ + $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcorstone.a) $(abspath $(build_dir))/libcorstone/*.o + $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcorstone.a) -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o $(build_dir)/libcodegen.a ${build_dir}/libcmsis_startup.a ${build_dir}/libcmsis_nn.a ${build_dir}/libuart.a $(ETHOSU_DRIVER_LIBS) $(ETHOSU_RUNTIME) +# Build UART driver +${build_dir}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd ${ETHOSU_PLATFORM_PATH}/drivers/uart && $(CMAKE) -B $(abspath $(build_dir)/ethosu_core_platform) $(CMAKE_FLAGS) + $(QUIET)cd $(abspath $(build_dir)/ethosu_core_platform) && $(MAKE) + +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o $(build_dir)/libcodegen.a 
${build_dir}/libcmsis_startup.a ${build_dir}/libcmsis_nn.a ${build_dir}/libcorstone.a ${build_dir}/ethosu_core_platform/libethosu_uart_cmsdk_apb.a $(ETHOSU_DRIVER_LIBS) $(ETHOSU_RUNTIME)
 	$(QUIET)mkdir -p $(@D)
 	$(QUIET)$(CC) $(PKG_CFLAGS) $(ETHOSU_INCLUDE) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS)

From d25feaf5a05cffc5ad61ff4ad40b9ea40458ccb5 Mon Sep 17 00:00:00 2001
From: Alexey Yazev <113356454+Alexey-Yazev@users.noreply.github.com>
Date: Wed, 18 Jan 2023 19:09:40 +0400
Subject: [PATCH 192/286] [microNPU] Add hardware constraints for binary elementwise (#13772)

Do not fuse min and max operations with requantize if the scales differ, since this is not supported on the NPU. Because of a hardware constraint, a min or max operation cannot be fused with requantize when the scales are different (see the NPU_SET_OFM_SCALE register description at https://developer.arm.com/documentation/102420/0200/Programmers-model/Command-stream/cmd1-commands-).

min/max operations with matching scales are offloaded to the NPU as ethosu_binary_elementwise;
min/max operations with different scales are offloaded to the NPU as ethosu_binary_elementwise + ethosu_identity.
---
 python/tvm/relay/op/contrib/ethosu.py         | 80 ++++++++++++++----
 .../contrib/test_ethosu/test_codegen.py       | 23 ++++++
 .../contrib/test_ethosu/test_legalize.py      | 81 +++++++++++++++----
 3 files changed, 150 insertions(+), 34 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py
index bd9a7d5ba0d1..5d1e75b03043 100644
--- a/python/tvm/relay/op/contrib/ethosu.py
+++ b/python/tvm/relay/op/contrib/ethosu.py
@@ -700,15 +700,13 @@ def __init__(self, func_body: Call, operator_type: str, is_quantized_operation:
         clip = None
         requantize = None
 
-        if is_quantized_operation:
-            if str(current_call.op.name) == "clip":
-                clip = current_call
-                current_call = clip.args[0]
-        else:
-            if str(current_call.op.name) == "qnn.requantize":
-                requantize = current_call
-                clip = current_call.args[0]
-                current_call = clip.args[0]
+        if str(current_call.op.name) == "clip":
+            clip = current_call
+            current_call = clip.args[0]
+        elif str(current_call.op.name) == "qnn.requantize":
+            requantize = current_call
+            clip = current_call.args[0]
+            current_call = clip.args[0]
 
         binary_op = current_call
         layout = "NHWC"
@@ -941,21 +939,40 @@ def is_valid(self):
             [self.ifm, self.ifm2, self.ofm], supported_dtypes=[np.uint8, np.int8]
         ):
             return False
+        # MIN with different scales is not supported on the NPU
+        # (see the NPU_SET_OFM_SCALE register description,
+        # https://developer.arm.com/documentation/102420/0200/Programmers-model/Command-stream/cmd1-commands-).
+        if self.ifm.q_params.scale_f32 != self.ofm.q_params.scale_f32:
+            return False
         return True
 
 
+# This pattern covers the case when there are different scales for requantize, so
+# minimum + clip + qnn.requantize can't be offloaded to the NPU as one operation
+# due to hardware constraints.
+# It is offloaded as two operations: ethosu_binary_elementwise + ethosu_identity.
 def minimum_pattern() -> tvm.relay.dataflow_pattern.DFPattern:
     """
-    This function creates the pattern for minimum with optional fused RELU activation.
+    This function creates the pattern for minimum with optional fused RELU activation without
+    requantize.
""" minimum = is_op("minimum")(wildcard(), wildcard()) optional_min_clip = is_op("clip")(minimum) - optional_min_clip = is_op("qnn.requantize")( - optional_min_clip, is_constant(), is_constant(), is_constant(), is_constant() - ) return minimum | optional_min_clip +def minimum_clip_requantize_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """ + This function creates the pattern for minimum with fused RELU activation with requantize. + """ + pattern = is_op("minimum")(wildcard(), wildcard()) + pattern = is_op("clip")(pattern) + pattern = is_op("qnn.requantize")( + pattern, is_constant(), is_constant(), is_constant(), is_constant() + ) + return pattern + + class MaxParams(BinaryElementwiseParams): """ This class will parse a call to a ethosu.binary_elementwise Max composite function @@ -979,21 +996,40 @@ def is_valid(self): [self.ifm, self.ifm2, self.ofm], supported_dtypes=[np.uint8, np.int8] ): return False + # MAX with different scales is not supported on NPU + # (please look at NPU_SET_OFM_SCALE register description + # https://developer.arm.com/documentation/102420/0200/Programmers-model/Command-stream/cmd1-commands-). + if self.ifm.q_params.scale_f32 != self.ofm.q_params.scale_f32: + return False return True +# This pattern is for case when there are different scales for requantize and +# maximum + clip + qnn.requantize can't be offloaded to NPU by one operation due to +# hardware constraints. +# It's offloaded by two operations ethosu_binary_elementwise + ethosu_identity. def maximum_pattern() -> tvm.relay.dataflow_pattern.DFPattern: """ - This function creates the pattern for maximum with optional fused RELU activation. + This function creates the pattern for maximum with optional fused RELU activation without + requantize. """ maximum = is_op("maximum")(wildcard(), wildcard()) optional_max_clip = is_op("clip")(maximum) - optional_max_clip = is_op("qnn.requantize")( - optional_max_clip, is_constant(), is_constant(), is_constant(), is_constant() - ) return maximum | optional_max_clip +def maximum_clip_requantize_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """ + This function creates the pattern for maximum with fused RELU activation with requantize. 
+ """ + pattern = is_op("maximum")(wildcard(), wildcard()) + pattern = is_op("clip")(pattern) + pattern = is_op("qnn.requantize")( + pattern, is_constant(), is_constant(), is_constant(), is_constant() + ) + return pattern + + class ShlParams(BinaryElementwiseParams): """ This class will parse a call to a ethosu.binary_elementwise Shl composite function @@ -1913,11 +1949,21 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal qnn_mul_pattern(), lambda pat: MulParams(pat).is_valid(), ), + ( + MinParams.composite_name, + minimum_clip_requantize_pattern(), + lambda pat: MinParams(pat).is_valid(), + ), ( MinParams.composite_name, minimum_pattern(), lambda pat: MinParams(pat).is_valid(), ), + ( + MaxParams.composite_name, + maximum_clip_requantize_pattern(), + lambda pat: MaxParams(pat).is_valid(), + ), ( MaxParams.composite_name, maximum_pattern(), diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index dc54ef071d19..05ba7467b309 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -1191,6 +1191,29 @@ def conv2d_relu6(x): ) +# Specific case when operation cannot be offloaded to NPU by single binary elementwise operation because +# min and max operations cannot be fused with requantize if there are different scales as it's not supported on NPU. +@pytest.mark.parametrize("operation", [tf.math.minimum, tf.math.maximum]) +def test_tflite_min_max_relu_n1_to_1(operation): + np.random.seed(0) + accel_type = "ethos-u55-128" + ifm_shape = (1, 12, 16, 8) + + @tf.function + def min_max_relu_n1_to_1(lhs, rhs): + op = operation(lhs, rhs) + # The specific pattern will be replaced into RELU_N1_TO_1 by tflite. + return tf.math.maximum(-1.0, tf.math.minimum(op, 1.0)) + + infra.compare_tvm_with_tflite( + min_max_relu_n1_to_1, + [ifm_shape, ifm_shape], + accel_type, + enable_cascader=True, + ranges=[(-1, 1), (0, 2)], + ) + + @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @pytest.mark.parametrize("ifm_shape", [(1, 14), (1, 151)]) @pytest.mark.parametrize("ofm_channels", [32, 64]) diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py index 5ddc7565f20c..5bc31dacb59d 100644 --- a/tests/python/contrib/test_ethosu/test_legalize.py +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -53,6 +53,13 @@ def partition_ethosu_by_table(mod, pattern_table): return mod +def relu_n1_to_1(x): + """ + The specific pattern will be replaced into RELU_N1_TO_1 by tflite. 
+ """ + return tf.math.maximum(-1.0, tf.math.minimum(x, 1.0)) + + def test_split_indices_legalize(): def create_graph(axis): x = relay.var("x", shape=(1, 50, 50, 3)) @@ -881,7 +888,7 @@ def verify(ext_func): ([1, 4, 4], [4, 1], False), ], ) -@pytest.mark.parametrize("activation_function", ["NONE", "RELU"]) +@pytest.mark.parametrize("activation_function", [None, tf.nn.relu]) def test_tflite_binary_elemwise_legalize( operator_type, ifm_shape, @@ -906,8 +913,8 @@ def tf_function(self, x, y): op = tf.math.minimum(x, y) elif operator_type == "MAX": op = tf.math.maximum(x, y) - if activation_function == "RELU": - op = tf.nn.relu(op) + if activation_function: + op = activation_function(op) return op model = Model() @@ -938,9 +945,13 @@ def verify(ext_func): op = ext_func.body has_reshaped_output = False + has_separate_requantize = False shapes_padded = [[1] * (4 - len(s)) + s for s in shapes] out_padded = [1] * (4 - len(out_shape)) + out_shape - if op.op.name != "contrib.ethosu.binary_elementwise": + if op.op.name == "contrib.ethosu.identity": + op = op.args[0] + has_separate_requantize = True + if op.op.name == "reshape": has_reshaped_output = True op = op.args[0] @@ -951,20 +962,30 @@ def verify(ext_func): assert op.checked_type.dtype == dtype assert op.attrs.operator_type == operator_type assert op.attrs.reversed_operands == reversed_operands - if activation_function == "RELU": + if activation_function != None: assert str(op.attrs.activation) == "CLIP" if operator_type in ["MIN", "MAX"]: - # MIN and MAX with an activation must have a requantize operation - # baked into the output. To check the extra requantize node was - # picked up by the pattern, we can make sure the quantization - # information is not default. - assert float(op.attrs.ifm_scale) != 1.0 - assert int(op.attrs.ifm_zero_point) != 0 - assert float(op.attrs.ifm2_scale) != 1.0 - assert int(op.attrs.ifm2_zero_point) != 0 - assert float(op.attrs.ofm_scale) != 1.0 - assert int(op.attrs.ofm_zero_point) != 0 + if has_separate_requantize: + # In case when requantize cannot be fused with MIN/MAX + CLIP due to hardware constraints + # there should be default quantization values since requantize is separate operation. + assert float(op.attrs.ifm_scale) == 1.0 + assert int(op.attrs.ifm_zero_point) == 0 + assert float(op.attrs.ifm2_scale) == 1.0 + assert int(op.attrs.ifm2_zero_point) == 0 + assert float(op.attrs.ofm_scale) == 1.0 + assert int(op.attrs.ofm_zero_point) == 0 + else: + # MIN and MAX with an activation must have a requantize operation + # baked into the output. To check the extra requantize node was + # picked up by the pattern, we can make sure the quantization + # information is not default. 
+ assert float(op.attrs.ifm_scale) != 1.0 + assert int(op.attrs.ifm_zero_point) != 0 + assert float(op.attrs.ifm2_scale) != 1.0 + assert int(op.attrs.ifm2_zero_point) != 0 + assert float(op.attrs.ofm_scale) != 1.0 + assert int(op.attrs.ofm_zero_point) != 0 if has_reshaped_output: assert list(ext_func.body.checked_type.shape) == out_shape @@ -997,22 +1018,42 @@ def verify(ext_func): ), ] elif operator_type == "MIN": - rewriter = legalize.MinRewriter() + rewriter = [legalize.MinRewriter(), legalize.RequantizeRewriter()] pattern_table = [ + ( + ethosu.MinParams.composite_name, + ethosu.minimum_clip_requantize_pattern(), + lambda pat: ethosu.MinParams(pat).is_valid(), + ), ( ethosu.MinParams.composite_name, ethosu.minimum_pattern(), lambda pat: ethosu.MinParams(pat).is_valid(), ), + ( + ethosu.RequantizeParams.composite_name, + ethosu.requantize_pattern(), + lambda pat: ethosu.RequantizeParams(pat).is_valid(), + ), ] elif operator_type == "MAX": - rewriter = legalize.MaxRewriter() + rewriter = [legalize.MaxRewriter(), legalize.RequantizeRewriter()] pattern_table = [ + ( + ethosu.MaxParams.composite_name, + ethosu.maximum_clip_requantize_pattern(), + lambda pat: ethosu.MaxParams(pat).is_valid(), + ), ( ethosu.MaxParams.composite_name, ethosu.maximum_pattern(), lambda pat: ethosu.MaxParams(pat).is_valid(), ), + ( + ethosu.RequantizeParams.composite_name, + ethosu.requantize_pattern(), + lambda pat: ethosu.RequantizeParams(pat).is_valid(), + ), ] tflite_graph = create_tflite_graph() @@ -1031,6 +1072,12 @@ def verify(ext_func): verify(mod["tvmgen_default_ethos_u_main_0"]) +# This test is for checking the case when requantize cannot be fused with MIN/MAX + CLIP due to hardware constraints. +def test_tflite_max_relu_n1_to_1_legalize(): + ifm_shape = [1, 4, 8, 16] + test_tflite_binary_elemwise_legalize("MAX", ifm_shape, ifm_shape, False, relu_n1_to_1) + + def test_binary_add_from_constant_scalar(): dtype = "uint8" ifm_shape = (1, 4, 4, 8) From 8f864f644a0d16590b76fd66f9add6bf1a71e25d Mon Sep 17 00:00:00 2001 From: Ever-Kid Date: Thu, 19 Jan 2023 03:59:17 +0800 Subject: [PATCH 193/286] [Bugfix][TIR] Fix version conflict with typing for Python 3.8.0 (#13744) I came across this bug under python3.8.0 with error from `typing.get_args()` while trying to run testcases like `tests/python/unittest/test_tir_schedule_set_axis_separator.py::test_set_axis_separator[transform_layout_named]` ``` > if get_origin(tp) is collections.abc.Callable and res[0] is not Ellipsis: E IndexError: tuple index out of range ``` And the root cause here is a difference between python3.8.0 and later version: ```diff get_args(Callable[[], T][int]) == ([], int) """ - if isinstance(tp, _GenericAlias): // python3.8.0 + if isinstance(tp, _GenericAlias) and not tp._special: // python3.8.15 res = tp.__args__ if get_origin(tp) is collections.abc.Callable and res[0] is not Ellipsis: } ``` So I added it back to `python/tvm/tir/schedule/_type_checker.py` --- python/tvm/tir/schedule/_type_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py index becf8c095057..12ce1ebc1f92 100644 --- a/python/tvm/tir/schedule/_type_checker.py +++ b/python/tvm/tir/schedule/_type_checker.py @@ -98,7 +98,7 @@ def union(type_: Any) -> Optional[List[type]]: # pylint: disable=missing-functi @staticmethod def callable(type_: Any) -> Optional[List[type]]: if _Subtype._origin(type_) is collections.abc.Callable: - if hasattr(typing, "get_args"): + if hasattr(typing, 
"get_args") and not type_._special: subtypes = typing.get_args(type_) # type: ignore else: subtypes = type_.__args__ From b51645159f8e80cff137e4ed0e8860720b42cb02 Mon Sep 17 00:00:00 2001 From: neildhickey Date: Wed, 18 Jan 2023 22:05:00 +0000 Subject: [PATCH 194/286] [TOPI] Making test_strided_set require a GPU for testing (#13804) * [TOPI] Making test_strided_set require a GPU for testing Skipping test_strided_set with ci_cpu docker image due to an issue reported in https://github.com/apache/tvm/pull/13724. --- tests/python/topi/python/test_topi_transform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index 0f64b486f375..e34905f15379 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -859,6 +859,7 @@ def test_dynamic_strided_slice(): verify_dynamic_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3]) +@tvm.testing.requires_gpu @tvm.testing.uses_gpu def test_strided_set(): verify_strided_set((3, 4, 3), (3, 2, 2), [0, 3, 0], [4, 1, 4], [1, -1, 2]) From 53f51486a82faf66b6ed41251ff3fbae57980752 Mon Sep 17 00:00:00 2001 From: lightzhan <1126207509@qq.com> Date: Thu, 19 Jan 2023 13:25:05 +0800 Subject: [PATCH 195/286] [TIR]Fix the crash of the pass RemoveNoOp (#13808) Fix the crash of the pass RemoveNoOp. Co-authored-by: lightzhan-intellif --- src/tir/transforms/remove_no_op.cc | 5 +++++ .../unittest/test_tir_transform_remove_no_op.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc index 430c1f41bfaf..d35cf8b8d602 100644 --- a/src/tir/transforms/remove_no_op.cc +++ b/src/tir/transforms/remove_no_op.cc @@ -119,6 +119,11 @@ class NoOpRemover : public arith::IRMutatorWithAnalyzer { Stmt VisitStmt_(const IfThenElseNode* op) final { Stmt stmt = Parent::VisitStmt_(op); op = stmt.as(); + // Sometimes the condition can be statically determined, + // in which the type of the `stmt` will not be IfThenElseNode. + if (!op) { + return stmt; + } if (op->else_case) { bool no_op_else = is_no_op(op->else_case.value()); bool no_op_then = is_no_op(op->then_case); diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py index ce37329b7ed3..06d9289aa795 100644 --- a/tests/python/unittest/test_tir_transform_remove_no_op.py +++ b/tests/python/unittest/test_tir_transform_remove_no_op.py @@ -603,5 +603,19 @@ def expected(A: T.Buffer[16, "int32"], C: T.Buffer[1, "int32"]): C[0] = C[0] + B[i] +class TestCertainConditon(BaseBeforeAfter): + """The conditon of the If-Else node is certain. + This would cause `Segmentation fault` error before.""" + + def before(): + if True: + T.evaluate(0) + else: + T.evaluate(0) + + def expected(): + T.evaluate(0) + + if __name__ == "__main__": tvm.testing.main() From ada1caf3652ea391f9a217b24d9748f15f8d37eb Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Thu, 19 Jan 2023 17:38:33 +0300 Subject: [PATCH 196/286] [Adreno] Update interface of AnnotateMemoryScope pass (#13779) Removed redundant and unused parameter from AnnotateMemoryScope pass. 
--- include/tvm/relay/transform.h | 2 +- src/relay/backend/build_module.cc | 2 +- src/relay/transforms/annotate_texture_storage.cc | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 3227f7979d87..43a0f89d95c1 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -584,7 +584,7 @@ TVM_DLL Pass CapturePostDfsIndexInSpans(); * \brief Calls device dependent memory scope analysis pass, collects mapping of desirable * expr->memory_scope and annotates expressions by VirtualDevice with required memory_scope */ -TVM_DLL Pass AnnotateMemoryScope(CompilationConfig config); +TVM_DLL Pass AnnotateMemoryScope(); /*! * \brief Removes non-fused reshapes after lowering the graph. diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index bca524794a20..0642c0c67253 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -396,7 +396,7 @@ class RelayBuildModule : public runtime::ModuleNode { relay_module = transform::Inline()(relay_module); relay_module = transform::InferType()(relay_module); relay_module = transform::LabelOps()(relay_module); - relay_module = transform::AnnotateMemoryScope(config_)(relay_module); + relay_module = transform::AnnotateMemoryScope()(relay_module); ICHECK(relay_module.defined()); diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index 9dbd631ad32d..39f065ea8c11 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -645,7 +645,7 @@ Map>> CollectStorageInfo(const Expr& expr) { return storage_info; } -Expr AnnotateMemoryScopeExpr(const Expr& expr, const IRModule& mod, CompilationConfig config) { +Expr AnnotateMemoryScopeExpr(const Expr& expr, const IRModule& mod) { auto storage_scope = CollectStorageInfo(expr); if (storage_scope.size()) { return RewriteVDStorageScopes(storage_scope).Rewrite(expr); @@ -655,10 +655,10 @@ Expr AnnotateMemoryScopeExpr(const Expr& expr, const IRModule& mod, CompilationC } namespace transform { -tvm::transform::Pass AnnotateMemoryScope(CompilationConfig config) { +tvm::transform::Pass AnnotateMemoryScope() { runtime::TypedPackedFunc pass_func = - [config = std::move(config)](Function f, IRModule m, PassContext pc) { - return Downcast(AnnotateMemoryScopeExpr(f, m, config)); + [](Function f, IRModule m, PassContext pc) { + return Downcast(AnnotateMemoryScopeExpr(f, m)); }; return CreateFunctionPass(pass_func, 2, "AnnotateMemoryScope", {}); } From bb215d26c68e0b46b87356924b9d1eaafd87b102 Mon Sep 17 00:00:00 2001 From: Alexey Gladyshev Date: Thu, 19 Jan 2023 20:51:48 +0300 Subject: [PATCH 197/286] [ONNX] Extend converter for Attention from Microsoft onnxruntime contrib opset (#13797) * add type & shape checking * add base class for Attention converter * add support for 'past' input * add support for 'unidirectional' attribute * fix for 'huggingface implementation' * add common method for calculating Attention * expand test coverage for Attention --- python/tvm/relay/frontend/onnx.py | 517 +++++++++++++-------- tests/python/frontend/onnx/test_forward.py | 92 +++- 2 files changed, 392 insertions(+), 217 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index c4eb7774d756..ffd31317e9f5 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -1297,7 +1297,213 @@ def 
_impl_v1(cls, inputs, attr, params): return _expr.TupleWrapper(_expr.Tuple([output, placeholder, placeholder]), 3) -class Attention(OnnxOpConverter): +class OrtAttentionBase: + """ + Base class for Attention and QAttention from Microsoft onnxruntime contrib opset. + """ + + @classmethod + def _check_input_embeddings(cls, input_emb, valid_types, **kwargs): + assert infer_type(input_emb).checked_type.dtype in valid_types + assert ( + len(infer_shape(input_emb)) == 3 + ), "Input should be 3D tensor with shape (batch_size, sequence_length, input_hidden_size)" + (batch_size, seq_len, input_hidden) = infer_shape(input_emb) + assert input_hidden > 0, ( + "The weight tensor has (input_hidden_size, 3 * output_hidden_size) shape, so it doesn't" + f" make sense to have ({input_hidden}, 3 * output_hidden_size) weight tensor." + ) + assert seq_len > 0, ( + "The output tensor has (batch_size, sequence_length, hidden_size) shape," + f" so it doesn't make sense to have (batch_size, {seq_len}, hidden_size) output." + ) + + return batch_size, seq_len, input_hidden + + @classmethod + def _check_weights(cls, weight, valid_types, **kwargs): + assert infer_type(weight).checked_type.dtype in valid_types + assert len(infer_shape(weight)) == 2, ( + "Weight should be 2D input tensor with shape (input_hidden_size, 3 * hidden_size), " + "hidden_size = num_heads * head_size" + ) + (input_hidden_weight, out_hidden_x3) = infer_shape(weight) + assert kwargs["input_hidden"] == input_hidden_weight + assert out_hidden_x3 % 3 == 0, "output hidden shape should be divisible by 3: W_Q, W_K, W_V" + out_hidden = out_hidden_x3 // 3 + assert ( + out_hidden % kwargs["num_heads"] == 0 + ), "output hidden size should be divisible by number of attention heads" + head_size = out_hidden // kwargs["num_heads"] + + return out_hidden_x3, out_hidden, head_size + + @classmethod + def _check_bias(cls, bias, valid_types, **kwargs): + assert infer_type(bias).checked_type.dtype in valid_types + assert ( + len(infer_shape(bias)) == 1 + ), "Bias should be 1D input tensor with shape (3 * hidden_size)" + (out_hidden_x3_bias,) = infer_shape(bias) + assert kwargs["out_hidden_x3"] == out_hidden_x3_bias + + @classmethod + def _check_mask_index(cls, mask_index, valid_types, **kwargs): + assert infer_type(mask_index).checked_type.dtype in valid_types + mask_index_shape = infer_shape(mask_index) + assert ( + len(mask_index_shape) == 2 + and mask_index_shape[0] == kwargs["batch_size"] + and mask_index_shape[1] >= kwargs["seq_len"] + ), "currently only support (batch_size, past_sequence_len + sequence_length) mask index" + + return mask_index_shape[1] + + @classmethod + def _check_past(cls, past, valid_types, **kwargs): + assert infer_type(past).checked_type.dtype in valid_types + past_shape = infer_shape(past) + assert len(past_shape) == 5, "past should be 5D tensor" + assert ( + past_shape[0] == 2 + and past_shape[1] == kwargs["batch_size"] + and past_shape[2] == kwargs["num_heads"] + and past_shape[3] + kwargs["seq_len"] == kwargs["total_seq_len"] + and past_shape[4] == kwargs["head_size"] + ) + past_seq_len = past_shape[3] + return past_seq_len + + @classmethod + def _split_into_heads(cls, tensor, batch_size, seq_len, num_heads, head_size): + """ + In the implementation of Multi-head attention we just split queries, keys, and values + we compute for a single-head attention into several parts: + (batch_size, num_heads, seq_len, head_size) + """ + tensor = _op.reshape(tensor, (batch_size, seq_len, num_heads, head_size)) + + # (batch_size, num_heads, 
seq_len, head_size) + tensor = _op.transpose(tensor, axes=[0, 2, 1, 3]) + + return tensor + + @classmethod + def _merge_first_dimensions(cls, tensor): + """ + nn.batch_matmul is expecting 3D tensor: + (batch_size * num_heads, past_seq_len + seq_len, head_size) + """ + return _op.reverse_reshape(tensor, (-1, 0, 0)) + + @classmethod + def _create_unidirectional_mask(cls, left_value, right_value, past_seq_len, seq_len, dtype): + """ + [lhs rhs rhs ... rhs rhs] + [lhs lhs rhs ... rhs rhs] + [lhs lhs lhs ... rhs rhs] + ......................... + [lhs lhs lhs ... lhs rhs] + [lhs lhs lhs ... lhs lhs] + """ + numpy_unidirectional_mask = np.array( + [ + np.concatenate( + [ + np.full(past_seq_len + s_i + 1, left_value), + np.full(seq_len - s_i - 1, right_value), + ] + ) + for s_i in range(seq_len) + ] + ) + unidirectional_mask = _op.const(numpy_unidirectional_mask, dtype=dtype) + unidirectional_mask = _op.expand_dims(unidirectional_mask, 0, num_newaxis=2) + + return unidirectional_mask + + @classmethod + def _compute_attention(cls, Q, K, V, mask_index, **kwargs): + # Compute Attention scores + att_scores = _op.nn.batch_matmul(Q, K, transpose_a=False, transpose_b=True) + score_dtype = infer_type(att_scores).checked_type.dtype + att_scores = _op.divide( + att_scores, + _op.const( + np.sqrt(kwargs["head_size"]), dtype=infer_type(att_scores).checked_type.dtype + ), + ) + att_scores = _op.reshape( + att_scores, + ( + kwargs["batch_size"], + kwargs["num_heads"], + kwargs["seq_len"], + kwargs["past_seq_len"] + kwargs["seq_len"], + ), + ) + + # Build the attention mask + att_mask = _op.cast(mask_index, score_dtype) + # Attention mask has value 0 or 1. Here we convert 0 to -10000, and 1 to 0. + att_mask = _op.subtract(_op.const(1, dtype=score_dtype), att_mask) + att_mask = _op.multiply(att_mask, _op.const(-10000, dtype=score_dtype)) + # Expand for att_scores broadcast + # (batch_size, past_seq_len + seq_len) -> (batch_size, 1, seq_len, past_seq_len + seq_len) + att_mask = _op.expand_dims(att_mask, 1, num_newaxis=2) + att_mask = _op.concatenate([att_mask] * kwargs["seq_len"], axis=2) + + if kwargs["unidirectional"]: + att_mask = _op.add( + att_mask, + cls._create_unidirectional_mask( + 0, -10000, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype + ), + ) + + # Apply the mask + att_scores = _op.add(att_scores, att_mask) + # TODO(agladyshev): + # Comment from ORT source code (onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h): + # "Fix unidirectional mask to be parity with huggingface implementation" + if kwargs["unidirectional"]: + att_scores = _op.multiply( + att_scores, + cls._create_unidirectional_mask( + 1, 0, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype + ), + ) + att_scores = _op.add( + att_scores, + _op.multiply( + att_mask, + cls._create_unidirectional_mask( + 0, 1, kwargs["past_seq_len"], kwargs["seq_len"], score_dtype + ), + ), + ) + + # Compute Softmax + att_scores = _op.reshape( + att_scores, + ( + kwargs["batch_size"] * kwargs["num_heads"], + kwargs["seq_len"], + kwargs["past_seq_len"] + kwargs["seq_len"], + ), + ) + att_probs = _op.nn.softmax(att_scores, axis=-1) + + # Compute output + output = _op.nn.batch_matmul(att_probs, V, transpose_a=False, transpose_b=False) + output = _op.reverse_reshape(output, (-1, kwargs["num_heads"], 0, 0)) + output = _op.transpose(output, axes=[0, 2, 1, 3]) + output = _op.reshape(output, (0, 0, kwargs["out_hidden"])) + + return output + + +class Attention(OrtAttentionBase, OnnxOpConverter): """Operator converter for Attention from Microsoft 
onnxruntime contrib opset. This is the self-attention mechanism used in transformer models. @@ -1305,16 +1511,30 @@ class Attention(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): + # ************************* Read attrs ************************* num_heads = attr["num_heads"] + unidirectional = attr["unidirectional"] + + assert ( + "past_present_share_buffer" not in attr + ), "share past and present buffers are not currently supported" assert ( "qkv_hidden_sizes" not in attr ), "different hidden sizes for Q, K, V are not currently supported" - assert "unidirectional" not in attr, "unidirectional attention not current supported" + # ************************* Read inputs ************************* # (batch, seq, in_hidden) input_emb = inputs[0] - # (in_hidden, 3 * out_hidden), where out_hidden = num_heads * head_size + # TODO(agladyshev): + # ORT documentation says: + # The weights for input projection of Q, K and V are merged. + # The data is stacked on the second dimension. + # Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). + # Here hidden_size is the hidden dimension of Q and K, and v_hidden_size is that of V. + # However, in our case, we consider that hidden_size == v_hidden_size. + # Therefore, weight has the following shape: + # (in_hidden, 3 * out_hidden), where out_hidden = num_heads * head_size weight = inputs[1] # (3 * out_hidden,) @@ -1325,7 +1545,7 @@ def _impl_v1(cls, inputs, attr, params): # 3. ( batch, seq, past_seq + seq,) # 4. ( batch,) # 5. (2 * batch,) - # For now, we only support case 2. + # TODO: For now, we only support case 2. mask_index = inputs[3] # (2, batch, num_heads, past_seq, head_size) @@ -1333,28 +1553,47 @@ def _impl_v1(cls, inputs, attr, params): # (batch, num_heads, seq, seq) extra_add = inputs[5] + assert extra_add is None, "extra add to QxK not currently supported" - (batch_size, seq_len, _) = infer_shape(input_emb) - (out_hidden_x3,) = infer_shape(bias) - assert out_hidden_x3 % 3 == 0, "bias shape should be divisible by 3" - out_hidden = out_hidden_x3 // 3 - assert ( - out_hidden % num_heads == 0 - ), "output hidden size should be divisible by number of attention heads" - head_size = out_hidden // num_heads + # When past_present_share_buffer is used, + # it is required to specify past_sequence_length (could be 0) + past_seq_len = inputs[6] + assert past_seq_len is None, "past sequence length not currently supported" + + # ************************* Parse inputs ************************* + t = ["float32", "float16"] + m = ["int32"] + + # input + batch_size, seq_len, input_hidden = cls._check_input_embeddings(input_emb, t) + + # weight + out_hidden_x3, out_hidden, head_size = cls._check_weights( + weight, t, num_heads=num_heads, input_hidden=input_hidden + ) + # bias + cls._check_bias(bias, t, out_hidden_x3=out_hidden_x3) + + # mask_index assert ( mask_index is not None ), "Attention import currently only supports required mask_index" - mask_index_shape = infer_shape(mask_index) - assert ( - len(mask_index_shape) == 2 - and mask_index_shape[0] == batch_size - and mask_index_shape[1] == seq_len - ), "currently only support (batch_size, sequence_length) mask index" + total_seq_len = cls._check_mask_index(mask_index, m, batch_size=batch_size, seq_len=seq_len) - assert past is None, "past K, V state is not currently supported" - assert extra_add is None, "extra add to QxK not currently supported" + # past + if past_seq_len is None: + past_seq_len = 0 + if past is not None: + past_seq_len = cls._check_past( + 
past, + t, + batch_size=batch_size, + num_heads=num_heads, + seq_len=seq_len, + total_seq_len=total_seq_len, + head_size=head_size, + ) # split weight and biases and do the matmuls w_Q, w_K, w_V = _op.split(weight, 3, axis=1) @@ -1365,53 +1604,44 @@ def _impl_v1(cls, inputs, attr, params): K = _op.add(_op.nn.matmul(input_emb, w_K), b_K) V = _op.add(_op.nn.matmul(input_emb, w_V), b_V) - # massage tensors in preparation for batched matmul - def massage(tensor): - tensor = _op.reshape(tensor, (batch_size, seq_len, num_heads, head_size)) - - # (batch_size, num_heads, seq_len, head_size) - tensor = _op.transpose(tensor, axes=[0, 2, 1, 3]) + Q = cls._split_into_heads(Q, batch_size, seq_len, num_heads, head_size) + K = cls._split_into_heads(K, batch_size, seq_len, num_heads, head_size) + V = cls._split_into_heads(V, batch_size, seq_len, num_heads, head_size) - # (batch_size * num_heads, seq_len, head_size) - return _op.reverse_reshape(tensor, (-1, 0, 0)) - - Q = massage(Q) - K = massage(K) - V = massage(V) + # Concatenate (past_K, past_V) with (K, V) by sequence axis: + # (batch_size, num_heads, past_sequence_length + sequence_length, head_size) + if past is not None and past_seq_len > 0: + K_past, V_past = _op.split(past, 2, axis=0) + K = _op.concatenate([_op.squeeze(K_past, axis=[0]), K], axis=2) + V = _op.concatenate([_op.squeeze(V_past, axis=[0]), V], axis=2) - K_present = _op.reshape(K, (batch_size, num_heads, seq_len, head_size)) - V_present = _op.reshape(V, (batch_size, num_heads, seq_len, head_size)) - present = _op.stack([K_present, V_present], axis=0) + # Prepare present state for Key and Value with shape + # (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size) + present = _op.stack([K, V], axis=0) - att_scores = _op.nn.batch_matmul(Q, K, transpose_a=False, transpose_b=True) - score_dtype = infer_type(att_scores).checked_type.dtype - att_scores = _op.divide( - att_scores, - _op.const(np.sqrt(head_size), dtype=infer_type(att_scores).checked_type.dtype), + Q = cls._merge_first_dimensions(Q) + K = cls._merge_first_dimensions(K) + V = cls._merge_first_dimensions(V) + + # Compute Attention output + output = cls._compute_attention( + Q, + K, + V, + mask_index, + unidirectional=unidirectional, + batch_size=batch_size, + out_hidden=out_hidden, + num_heads=num_heads, + head_size=head_size, + seq_len=seq_len, + past_seq_len=past_seq_len, ) - att_scores = _op.reshape(att_scores, (batch_size, num_heads, seq_len, seq_len)) - - # build the attention mask - att_mask = _op.cast(mask_index, score_dtype) - att_mask = _op.expand_dims(att_mask, 1, num_newaxis=2) - att_mask = _op.subtract(_op.const(1, dtype=score_dtype), att_mask) - att_mask = _op.multiply(att_mask, _op.const(-10000, dtype=score_dtype)) - - # apply the mask - att_scores = _op.add(att_scores, att_mask) - att_scores = _op.reshape(att_scores, (batch_size * num_heads, seq_len, seq_len)) - - att_probs = _op.nn.softmax(att_scores, axis=-1) - - output = _op.nn.batch_matmul(att_probs, V, transpose_a=False, transpose_b=False) - output = _op.reverse_reshape(output, (-1, num_heads, 0, 0)) - output = _op.transpose(output, axes=[0, 2, 1, 3]) - output = _op.reshape(output, (0, 0, out_hidden)) return _expr.TupleWrapper(_expr.Tuple([output, present]), 2) -class QAttention(OnnxOpConverter): +class QAttention(OrtAttentionBase, OnnxOpConverter): """Operator converter for QAttention from Microsoft onnxruntime contrib opset. This is the self-attention mechanism used in transformer models. 
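For reference, the masked, scaled dot-product attention that both converters now share through `OrtAttentionBase._compute_attention` reduces to the following NumPy sketch. It illustrates the math only; per-head batching and the huggingface-parity correction are omitted, and this is not the Relay code the converter emits:

```python
import numpy as np


def attention_sketch(Q, K, V, mask_index, unidirectional=False, past_seq_len=0):
    """Q: (batch, seq, head); K, V: (batch, past+seq, head);
    mask_index: (batch, past+seq), 1 = attend, 0 = ignore."""
    seq_len, total_len = Q.shape[1], K.shape[1]
    scores = Q @ K.transpose(0, 2, 1) / np.sqrt(Q.shape[-1])
    # Mask value 1 adds 0 (keep); 0 adds -10000 (softmax weight ~ 0).
    scores = scores + (1.0 - mask_index)[:, None, :] * -10000.0
    if unidirectional:
        # Token i may only attend to positions <= past_seq_len + i.
        scores = scores + np.triu(
            np.full((seq_len, total_len), -10000.0), k=past_seq_len + 1
        )
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ V  # (batch, seq, head)
```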
@@ -1473,42 +1703,15 @@ def _impl_v1(cls, inputs, attr, params): t4 = ["int32"] # input - assert infer_type(input_emb).checked_type.dtype in t1 - assert ( - len(infer_shape(input_emb)) == 3 - ), "Input should be 3D tensor with shape (batch_size, sequence_length, input_hidden_size)" - (batch_size, seq_len, input_hidden) = infer_shape(input_emb) - assert input_hidden > 0, ( - "The weight tensor has (input_hidden_size, 3 * output_hidden_size) shape, so it doesn't" - f" make sense to have ({input_hidden}, 3 * output_hidden_size) weight tensor." - ) - assert seq_len > 0, ( - "The output tensor has (batch_size, sequence_length, hidden_size) shape," - f" so it doesn't make sense to have (batch_size, {seq_len}, hidden_size) output." - ) + batch_size, seq_len, input_hidden = cls._check_input_embeddings(input_emb, t1) # weight - assert infer_type(weight).checked_type.dtype in t2 - assert len(infer_shape(weight)) == 2, ( - "Weight should be 2D input tensor with shape (input_hidden_size, 3 * hidden_size), " - "hidden_size = num_heads * head_size" + out_hidden_x3, out_hidden, head_size = cls._check_weights( + weight, t2, num_heads=num_heads, input_hidden=input_hidden ) - (input_hidden_weight, out_hidden_x3) = infer_shape(weight) - assert input_hidden == input_hidden_weight - assert out_hidden_x3 % 3 == 0, "output hidden shape should be divisible by 3: W_Q, W_K, W_V" - out_hidden = out_hidden_x3 // 3 - assert ( - out_hidden % num_heads == 0 - ), "output hidden size should be divisible by number of attention heads" - head_size = out_hidden // num_heads # bias - assert infer_type(bias).checked_type.dtype in t3 - assert ( - len(infer_shape(bias)) == 1 - ), "Bias should be 1D input tensor with shape (3 * hidden_size)" - (out_hidden_x3_bias,) = infer_shape(bias) - assert out_hidden_x3 == out_hidden_x3_bias + cls._check_bias(bias, t3, out_hidden_x3=out_hidden_x3) # input_scale assert infer_type(input_scale).checked_type.dtype in t3 @@ -1527,13 +1730,9 @@ def _impl_v1(cls, inputs, attr, params): assert ( mask_index is not None ), "Attention import currently only supports required mask_index" - assert infer_type(mask_index).checked_type.dtype in t4 - mask_index_shape = infer_shape(mask_index) - assert ( - len(mask_index_shape) == 2 - and mask_index_shape[0] == batch_size - and mask_index_shape[1] >= seq_len - ), "currently only support (batch_size, sequence_length) mask index" + total_seq_len = cls._check_mask_index( + mask_index, t4, batch_size=batch_size, seq_len=seq_len + ) # TODO(agladyshev): int32 required for qnn.batch_matmul (QnnBatchMatmulRel) zero_point_zero = _expr.const(0, "int32") @@ -1557,17 +1756,15 @@ def _impl_v1(cls, inputs, attr, params): # past (2, batch_size, num_heads, past_sequence_length, head_size) past_seq_len = 0 if past is not None: - assert infer_type(past).checked_type.dtype in t3 - past_shape = infer_shape(past) - assert len(past_shape) == 5, "past should be 5D tensor" - assert ( - past_shape[0] == 2 - and past_shape[1] == batch_size - and past_shape[2] == num_heads - and past_shape[3] + seq_len == mask_index_shape[1] - and past_shape[4] == head_size + past_seq_len = cls._check_past( + past, + t3, + batch_size=batch_size, + num_heads=num_heads, + seq_len=seq_len, + total_seq_len=total_seq_len, + head_size=head_size, ) - past_seq_len = past_shape[3] # ************************* Create Relay ************************* # Add batch dimension for QNN Batch Matmul @@ -1604,22 +1801,9 @@ def qmatmul_dequantize_bias( input_emb, w_V, input_scale, weight_scale, input_zero_point, 
weight_zero_point, b_V ) - def split_into_heads(tensor): - """ - In the implementation of Multi-head attention we just split queries, keys, and values - we compute for a single-head attention into several parts: - (batch_size, num_heads, seq_len, head_size) - """ - tensor = _op.reshape(tensor, (batch_size, seq_len, num_heads, head_size)) - - # (batch_size, num_heads, seq_len, head_size) - tensor = _op.transpose(tensor, axes=[0, 2, 1, 3]) - - return tensor - - Q = split_into_heads(Q) - K = split_into_heads(K) - V = split_into_heads(V) + Q = cls._split_into_heads(Q, batch_size, seq_len, num_heads, head_size) + K = cls._split_into_heads(K, batch_size, seq_len, num_heads, head_size) + V = cls._split_into_heads(V, batch_size, seq_len, num_heads, head_size) # Concatenate (past_K, past_V) with (K, V) by sequence axis: # (batch_size, num_heads, past_sequence_length + sequence_length, head_size) @@ -1632,78 +1816,25 @@ def split_into_heads(tensor): # (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size) present = _op.stack([K, V], axis=0) - def merge_first_dimensions(tensor): - """ - nn.batch_matmul is expecting 3D tensor: - (batch_size * num_heads, past_seq_len + seq_len, head_size) - """ - return _op.reverse_reshape(tensor, (-1, 0, 0)) - - Q = merge_first_dimensions(Q) - K = merge_first_dimensions(K) - V = merge_first_dimensions(V) - - att_scores = _op.nn.batch_matmul(Q, K, transpose_a=False, transpose_b=True) - score_dtype = infer_type(att_scores).checked_type.dtype - att_scores = _op.divide( - att_scores, - _op.const(np.sqrt(head_size), dtype=infer_type(att_scores).checked_type.dtype), - ) - att_scores = _op.reshape( - att_scores, (batch_size, num_heads, seq_len, past_seq_len + seq_len) + Q = cls._merge_first_dimensions(Q) + K = cls._merge_first_dimensions(K) + V = cls._merge_first_dimensions(V) + + # Compute Attention output + output = cls._compute_attention( + Q, + K, + V, + mask_index, + unidirectional=unidirectional, + batch_size=batch_size, + out_hidden=out_hidden, + num_heads=num_heads, + head_size=head_size, + seq_len=seq_len, + past_seq_len=past_seq_len, ) - # Build the attention mask - att_mask = _op.cast(mask_index, score_dtype) - # Attention mask has value 0 or 1. Here we convert 0 to -10000, and 1 to 0. 
- att_mask = _op.subtract(_op.const(1, dtype=score_dtype), att_mask) - att_mask = _op.multiply(att_mask, _op.const(-10000, dtype=score_dtype)) - # Expand for att_scores broadcast - # (batch_size, past_seq_len + seq_len) -> (batch_size, 1, seq_len, past_seq_len + seq_len) - att_mask = _op.expand_dims(att_mask, 1, num_newaxis=2) - att_mask = _op.concatenate([att_mask] * seq_len, axis=2) - - def create_unidirectional_mask(left_value, right_value): - numpy_unidirectional_mask = np.array( - [ - np.concatenate( - [ - np.full(past_seq_len + s_i + 1, left_value), - np.full(seq_len - s_i - 1, right_value), - ] - ) - for s_i in range(seq_len) - ] - ) - unidirectional_mask = _op.const(numpy_unidirectional_mask, dtype=score_dtype) - unidirectional_mask = _op.expand_dims(unidirectional_mask, 0, num_newaxis=2) - - return unidirectional_mask - - if unidirectional: - att_mask = _op.add(att_mask, create_unidirectional_mask(0, -10000)) - - # Apply the mask - att_scores = _op.add(att_scores, att_mask) - # TODO(agladyshev): - # Comment from ORT source code (onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h): - # "Fix unidirectional mask to be parity with huggingface implementation" - if unidirectional: - att_scores = _op.multiply(att_scores, create_unidirectional_mask(1, 0)) - att_scores = _op.add(att_scores, create_unidirectional_mask(0, -10000)) - - # Compute Softmax - att_scores = _op.reshape( - att_scores, (batch_size * num_heads, seq_len, past_seq_len + seq_len) - ) - att_probs = _op.nn.softmax(att_scores, axis=-1) - - # Compute output - output = _op.nn.batch_matmul(att_probs, V, transpose_a=False, transpose_b=False) - output = _op.reverse_reshape(output, (-1, num_heads, 0, 0)) - output = _op.transpose(output, axes=[0, 2, 1, 3]) - output = _op.reshape(output, (0, 0, out_hidden)) - return _expr.TupleWrapper(_expr.Tuple([output, present]), 2) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index a84de82f3bab..f5b5f7c65cb5 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5878,30 +5878,47 @@ def verify_embedlayernormalization( def test_attention(target, dev): """test_attention""" - def verify_attention(input_, weight, bias, mask_index, num_heads): + def verify_attention(_unidirectional, _input, _weight, _bias, _mask_index=None, _past=None): + input_names = ["input", "weight", "bias"] + if _mask_index is not None: + input_names.append("mask_index") + if _past is not None: + input_names.append("past") + node = onnx.helper.make_node( "Attention", - inputs=["input", "weight", "bias", "mask_index"], + inputs=input_names, outputs=["output", "present"], domain="com.microsoft", num_heads=num_heads, + unidirectional=_unidirectional, ) + past_shape = (2, batch_size, num_heads, past_sequence_length, head_size) present_output_shape = (2, batch_size, num_heads, sequence_length, head_size) + inputs_info = [ + helper.make_tensor_value_info("input", TensorProto.FLOAT, list(_input.shape)), + helper.make_tensor_value_info("weight", TensorProto.FLOAT, list(_weight.shape)), + helper.make_tensor_value_info("bias", TensorProto.FLOAT, list(_bias.shape)), + ] + if _mask_index is not None: + inputs_info.append( + helper.make_tensor_value_info( + "mask_index", TensorProto.INT32, list(_mask_index.shape) + ), + ) + if _past is not None: + inputs_info.append( + helper.make_tensor_value_info("past", TensorProto.FLOAT, list(past_shape)) + ) + graph = helper.make_graph( [node], "attention_test", - inputs=[ - 
helper.make_tensor_value_info("input", TensorProto.FLOAT, list(input_.shape)), - helper.make_tensor_value_info("weight", TensorProto.FLOAT, list(weight.shape)), - helper.make_tensor_value_info("bias", TensorProto.FLOAT, list(bias.shape)), - helper.make_tensor_value_info( - "mask_index", TensorProto.INT32, list(mask_index.shape) - ), - ], + inputs=inputs_info, outputs=[ - helper.make_tensor_value_info("output", TensorProto.FLOAT, list(input_.shape)), + helper.make_tensor_value_info("output", TensorProto.FLOAT, list(_input.shape)), helper.make_tensor_value_info( "present", TensorProto.FLOAT, list(present_output_shape) ), @@ -5910,31 +5927,58 @@ def verify_attention(input_, weight, bias, mask_index, num_heads): model = helper.make_model(graph, producer_name="attention_test") + inputs = [_input, _weight, _bias] + if _mask_index is not None: + inputs.append(_mask_index) + if _past is not None: + inputs.append(_past) + # "present" output should be nullptr when the "past" input isn't included, # but ort requires an output shape to be specified? verify_with_ort_with_inputs( model, - [input_, weight, bias, mask_index], - [input_.shape, present_output_shape], + inputs, + [_input.shape, present_output_shape], target=target, dev=dev, rtol=1e-4, atol=1e-4, ) - hidden_size = 384 - batch_size = 4 - sequence_length = 4 - num_heads = 12 - head_size = 32 + batch_size = 11 + num_heads = 13 + head_size = 37 + sequence_length = 7 + input_hidden_size = 147 + weight_hidden_size = num_heads * head_size + past_sequence_length = 17 - dtype = "float32" - input_array = np.random.random((batch_size, sequence_length, hidden_size)).astype(dtype) - weight = np.random.normal(size=(hidden_size, 3 * hidden_size)).astype(dtype) * 0.1 - bias = np.random.randn(3 * hidden_size).astype(dtype) - mask_index = np.full((batch_size, sequence_length), 1).astype("int32") + total_sequence_length = past_sequence_length + sequence_length - verify_attention(input_array, weight, bias, mask_index, num_heads) + # Required inputs + input_array = np.random.normal(size=(batch_size, sequence_length, input_hidden_size)).astype( + "float32" + ) + weight = ( + np.random.normal(size=(input_hidden_size, 3 * weight_hidden_size)).astype("float32") * 0.1 + ) + bias = np.random.randn(3 * weight_hidden_size).astype("float32") + + # Optional inputs + past = np.random.random((2, batch_size, num_heads, past_sequence_length, head_size)).astype( + "float32" + ) + + for unidirectional in [0, 1]: + for have_past in [False, True]: + if not have_past: + mask_index = np.random.randint(0, 2, (batch_size, sequence_length)).astype("int32") + verify_attention(unidirectional, input_array, weight, bias, mask_index) + else: + mask_index = np.random.randint(0, 2, (batch_size, total_sequence_length)).astype( + "int32" + ) + verify_attention(unidirectional, input_array, weight, bias, mask_index, past) @tvm.testing.parametrize_targets From c3730770f03dae7fb008b71f548b306731ddf7ee Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 19 Jan 2023 12:43:59 -0800 Subject: [PATCH 198/286] [Docker]Add dialout group by default on login (#13810) This would help to run commands like nrfjprog inside docker without sudo command. 
---
 docker/with_the_same_user | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index bd332cd91374..0c17be519755 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -54,6 +54,7 @@ getent passwd "${CI_BUILD_UID}" || adduser --force-badname --gid "${CI_BUILD_GID
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
     --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}"
 usermod -a -G sudo -G tvm-venv "${CI_BUILD_USER}"
+usermod -a -G sudo -G dialout "${CI_BUILD_USER}"

 # Add user to video group for ROCm
 if [[ ! -z "${ROCM_ENABLED-}" ]]; then

From 64619f4e5abe8ca00f6c38a45becb932582a0346 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Thu, 19 Jan 2023 15:32:51 -0800
Subject: [PATCH 199/286] [microTVM] Add tutorial on how to generate
 MLPerfTiny submissions (#13783)

This PR adds a tutorial on how to generate an MLPerfTiny submission on
Zephyr OS using microTVM.
---
 docs/conf.py                                  |   5 +-
 .../work_with_microtvm/micro_mlperftiny.py    | 312 ++++++++++++++++++
 python/tvm/micro/testing/utils.py             |  44 ++-
 tests/micro/zephyr/utils.py                   |  37 +--
 tests/scripts/request_hook/request_hook.py    |   1 +
 tests/scripts/task_python_microtvm.sh         |   7 +
 6 files changed, 368 insertions(+), 38 deletions(-)
 create mode 100644 gallery/how_to/work_with_microtvm/micro_mlperftiny.py

diff --git a/docs/conf.py b/docs/conf.py
index 08fbedb8ffca..eb2b39d4b1fd 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -550,6 +550,9 @@ def force_gc(gallery_conf, fname):
     gc.collect()


+# Skips certain files to avoid dependency issues
+filename_pattern_default = "^(?!.*micro_mlperftiny.py).*$"
+
 sphinx_gallery_conf = {
     "backreferences_dir": "gen_modules/backreferences",
     "doc_module": ("tvm", "numpy"),
@@ -562,7 +565,7 @@ def force_gc(gallery_conf, fname):
     "within_subsection_order": WithinSubsectionOrder,
     "gallery_dirs": gallery_dirs,
     "subsection_order": subsection_order,
-    "filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", ".py"),
+    "filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", filename_pattern_default),
     "download_all_examples": False,
     "min_reported_time": 60,
     "expected_failing_examples": [],
diff --git a/gallery/how_to/work_with_microtvm/micro_mlperftiny.py b/gallery/how_to/work_with_microtvm/micro_mlperftiny.py
new file mode 100644
index 000000000000..79308e072365
--- /dev/null
+++ b/gallery/how_to/work_with_microtvm/micro_mlperftiny.py
@@ -0,0 +1,312 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+.. _tutorial-micro-MLPerfTiny:
+
+Creating Your MLPerfTiny Submission with microTVM
+=================================================
+**Authors**:
+`Mehrdad Hessar `_
+
+This tutorial walks through creating an MLPerfTiny submission using microTVM.
This +tutorial shows the steps to import a TFLite model from MLPerfTiny benchmark models, +compile it with TVM and generate a Zephyr project which can be flashed to a Zephyr +supported board to benchmark the model using EEMBC runner. +""" + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst +# + +import os +import pathlib +import tarfile +import tempfile +import shutil + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst +# + + +###################################################################### +# +# **Note:** Install CMSIS-NN only if you are interested to generate this submission +# using CMSIS-NN code generator. +# + +###################################################################### +# +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_cmsis.rst +# + +###################################################################### +# Import Python dependencies +# ------------------------------- +# +import tensorflow as tf +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.backend import Executor, Runtime +from tvm.contrib.download import download_testdata +from tvm.micro import export_model_library_format +from tvm.micro.model_library_format import generate_c_interface_header +from tvm.micro.testing.utils import ( + create_header_file, + mlf_extract_workspace_size_bytes, +) + +###################################################################### +# Import Visual Wake Word Model +# -------------------------------------------------------------------- +# +# To begin with, download and import the Visual Wake Word (VWW) TFLite model from MLPerfTiny. +# This model is originally from `MLPerf Tiny repository `_. +# We also capture metadata information from the TFLite model such as input/output name, +# quantization parameters, etc. which will be used in following steps. +# +# We use indexing for various models to build the submission. The indices are defined as follows: +# To build another model, you need to update the model URL, the short name and index number. +# +# * Keyword Spotting(KWS) 1 +# * Visual Wake Word(VWW) 2 +# * Anomaly Detection(AD) 3 +# * Image Classification(IC) 4 +# +# If you would like to build the submission with CMSIS-NN, modify USE_CMSIS environment variable. +# +# .. 
code-block:: bash +# +# export USE_CMSIS=1 +# + +MODEL_URL = "https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite" +MODEL_PATH = download_testdata(MODEL_URL, "vww_96_int8.tflite", module="model") + +MODEL_SHORT_NAME = "VWW" +MODEL_INDEX = 2 + +USE_CMSIS = os.environ.get("TVM_USE_CMSIS", False) + +tflite_model_buf = open(MODEL_PATH, "rb").read() +try: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) +except AttributeError: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + +interpreter = tf.lite.Interpreter(model_path=str(MODEL_PATH)) +interpreter.allocate_tensors() +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() + +input_name = input_details[0]["name"] +input_shape = tuple(input_details[0]["shape"]) +input_dtype = np.dtype(input_details[0]["dtype"]).name +output_name = output_details[0]["name"] +output_shape = tuple(output_details[0]["shape"]) +output_dtype = np.dtype(output_details[0]["dtype"]).name + +# We extract quantization information from TFLite model. +# This is required for all models except Anomaly Detection, +# because for other models we send quantized data to interpreter +# from host, however, for AD model we send floating data and quantization +# happens on the microcontroller. +if MODEL_SHORT_NAME != "AD": + quant_output_scale = output_details[0]["quantization_parameters"]["scales"][0] + quant_output_zero_point = output_details[0]["quantization_parameters"]["zero_points"][0] + +relay_mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict={input_name: input_shape}, dtype_dict={input_name: input_dtype} +) + +###################################################################### +# Defining Target, Runtime and Executor +# -------------------------------------------------------------------- +# +# Now we need to define the target, runtime and executor to compile this model. In this tutorial, +# we use Ahead-of-Time (AoT) compilation and we build a standalone project. This is different +# than using AoT with host-driven mode where the target would communicate with host using host-driven +# AoT executor to run inference. +# + +# Use the C runtime (crt) +RUNTIME = Runtime("crt") + +# Use the AoT executor with `unpacked-api=True` and `interface-api=c`. `interface-api=c` forces +# the compiler to generate C type function APIs and `unpacked-api=True` forces the compiler +# to generate minimal unpacked format inputs which reduces the stack memory usage on calling +# inference layers of the model. +EXECUTOR = Executor( + "aot", + {"unpacked-api": True, "interface-api": "c", "workspace-byte-alignment": 8}, +) + +# Select a Zephyr board +BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi") + +# Get the the full target description using the BOARD +TARGET = tvm.micro.testing.get_target("zephyr", BOARD) + +###################################################################### +# Compile the model and export model library format +# -------------------------------------------------------------------- +# +# Now, we compile the model for the target. Then, we generate model +# library format for the compiled model. We also need to calculate the +# workspace size that is required for the compiled model. 
+# +# + +config = {"tir.disable_vectorize": True} +if USE_CMSIS: + from tvm.relay.op.contrib import cmsisnn + + config["relay.ext.cmsisnn.options"] = {"mcpu": TARGET.mcpu} + relay_mod = cmsisnn.partition_for_cmsisnn(relay_mod, params, mcpu=TARGET.mcpu) + +with tvm.transform.PassContext(opt_level=3, config=config): + module = tvm.relay.build( + relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR + ) + +temp_dir = tvm.contrib.utils.tempdir() +model_tar_path = temp_dir / "model.tar" +export_model_library_format(module, model_tar_path) +workspace_size = mlf_extract_workspace_size_bytes(model_tar_path) + +###################################################################### +# Generate input/output header files +# -------------------------------------------------------------------- +# +# To create a microTVM standalone project with AoT, we need to generate +# input and output header files. These header files are used to connect +# the input and output API from generated code to the rest of the +# standalone project. For this specific submission, we only need to generate +# output header file since the input API call is handled differently. +# + +extra_tar_dir = tvm.contrib.utils.tempdir() +extra_tar_file = extra_tar_dir / "extra.tar" + +with tarfile.open(extra_tar_file, "w:gz") as tf: + with tempfile.TemporaryDirectory() as tar_temp_dir: + model_files_path = os.path.join(tar_temp_dir, "include") + os.mkdir(model_files_path) + header_path = generate_c_interface_header( + module.libmod_name, [input_name], [output_name], [], {}, [], 0, model_files_path, {}, {} + ) + tf.add(header_path, arcname=os.path.relpath(header_path, tar_temp_dir)) + + create_header_file( + "output_data", + np.zeros( + shape=output_shape, + dtype=output_dtype, + ), + "include", + tf, + ) + +###################################################################### +# Create the project, build and prepare the project tar file +# -------------------------------------------------------------------- +# +# Now that we have the compiled model as a model library format, +# we can generate the full project using Zephyr template project. First, +# we prepare the project options, then build the project. Finally, we +# cleanup the temporary files and move the submission project to the +# current working directory which could be downloaded and used on +# your development kit. +# + +input_total_size = 1 +for i in range(len(input_shape)): + input_total_size *= input_shape[i] + +template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) +project_options = { + "extra_files_tar": str(extra_tar_file), + "project_type": "mlperftiny", + "board": BOARD, + "compile_definitions": [ + f"-DWORKSPACE_SIZE={workspace_size + 512}", # Memory workspace size, 512 is a temporary offset + # since the memory calculation is not accurate. + f"-DTARGET_MODEL={MODEL_INDEX}", # Sets the model index for project compilation. + f"-DTH_MODEL_VERSION=EE_MODEL_VERSION_{MODEL_SHORT_NAME}01", # Sets model version. This is required by MLPerfTiny API. + f"-DMAX_DB_INPUT_SIZE={input_total_size}", # Max size of the input data array. + ], +} + +if MODEL_SHORT_NAME != "AD": + project_options["compile_definitions"].append(f"-DOUT_QUANT_SCALE={quant_output_scale}") + project_options["compile_definitions"].append(f"-DOUT_QUANT_ZERO={quant_output_zero_point}") + +if USE_CMSIS: + project_options["compile_definitions"].append(f"-DCOMPILE_WITH_CMSISNN=1") + +# Note: You might need to adjust this based on the board that you are using. 
+project_options["config_main_stack_size"] = 4000 + +if USE_CMSIS: + project_options["cmsis_path"] = os.environ.get("CMSIS_PATH", "/content/cmsis") + +generated_project_dir = temp_dir / "project" + +project = tvm.micro.project.generate_project_from_mlf( + template_project_path, generated_project_dir, model_tar_path, project_options +) +project.build() + +# Cleanup the build directory and extra artifacts +shutil.rmtree(generated_project_dir / "build") +(generated_project_dir / "model.tar").unlink() + +project_tar_path = pathlib.Path(os.getcwd()) / "project.tar" +with tarfile.open(project_tar_path, "w:tar") as tar: + tar.add(generated_project_dir, arcname=os.path.basename("project")) + +print(f"The generated project is located here: {project_tar_path}") + +###################################################################### +# Use this project with your board +# -------------------------------------------------------------------- +# +# Now that we have the generated project, you can use this project locally +# to flash your board and prepare it for EEMBC runner software. +# To do this follow these steps: +# +# .. code-block:: bash +# +# tar -xf project.tar +# cd project +# mkdir build +# cmake .. +# make -j2 +# west flash +# +# Now you can connect your board to EEMBC runner using this +# `instructions `_ +# and benchmark this model on your board. +# diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 097fbf283a58..170c57631444 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -17,6 +17,7 @@ """Defines the test methods used with microTVM.""" +import io from functools import lru_cache import json import logging @@ -24,6 +25,7 @@ import tarfile import time from typing import Union +import numpy as np import tvm from tvm import relay @@ -102,7 +104,7 @@ def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[Path, str]) -> int: workspace_size = 0 with tarfile.open(mlf_tar_path, "r:*") as tar_file: - tar_members = [ti.name for ti in tar_file.getmembers()] + tar_members = [tar_info.name for tar_info in tar_file.getmembers()] assert "./metadata.json" in tar_members with tar_file.extractfile("./metadata.json") as f: metadata = json.load(f) @@ -133,3 +135,43 @@ def get_conv2d_relay_module(): mod = tvm.IRModule.from_expr(f) mod = relay.transform.InferType()(mod) return mod + + +def _npy_dtype_to_ctype(data: np.ndarray) -> str: + if data.dtype == "int8": + return "int8_t" + elif data.dtype == "int32": + return "int32_t" + elif data.dtype == "uint8": + return "uint8_t" + elif data.dtype == "float32": + return "float" + else: + raise ValueError(f"Data type {data.dtype} not expected.") + + +def create_header_file(tensor_name: str, npy_data: np.array, output_path: str, tar_file: str): + """ + This method generates a header file containing the data contained in the numpy array provided + and adds the header file to a tar file. + It is used to capture the tensor data (for both inputs and output). 
+ """ + header_file = io.StringIO() + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write("#include \n") + header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + header_file.write(f"{_npy_dtype_to_ctype(npy_data)} {tensor_name}[] =") + + header_file.write("{") + for i in np.ndindex(npy_data.shape): + header_file.write(f"{npy_data[i]}, ") + header_file.write("};\n\n") + + header_file_bytes = bytes(header_file.getvalue(), "utf-8") + raw_path = Path(output_path) / f"{tensor_name}.h" + tar_info = tarfile.TarInfo(name=str(raw_path)) + tar_info.size = len(header_file_bytes) + tar_info.mode = 0o644 + tar_info.type = tarfile.REGTYPE + tar_file.addfile(tar_info, io.BytesIO(header_file_bytes)) diff --git a/tests/micro/zephyr/utils.py b/tests/micro/zephyr/utils.py index 42419b637fa4..bdac4e9c63a7 100644 --- a/tests/micro/zephyr/utils.py +++ b/tests/micro/zephyr/utils.py @@ -32,6 +32,7 @@ import tvm.micro from tvm.micro import export_model_library_format from tvm.micro.model_library_format import generate_c_interface_header +from tvm.micro.testing.utils import create_header_file from tvm.micro.testing.utils import ( mlf_extract_workspace_size_bytes, aot_transport_init_wait, @@ -106,42 +107,6 @@ def build_project( return project, project_dir -def create_header_file(tensor_name, npy_data, output_path, tar_file): - """ - This method generates a header file containing the data contained in the numpy array provided. - It is used to capture the tensor data (for both inputs and expected outputs). - """ - header_file = io.StringIO() - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write("#include \n") - header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") - - if npy_data.dtype == "int8": - header_file.write(f"int8_t {tensor_name}[] =") - elif npy_data.dtype == "int32": - header_file.write(f"int32_t {tensor_name}[] = ") - elif npy_data.dtype == "uint8": - header_file.write(f"uint8_t {tensor_name}[] = ") - elif npy_data.dtype == "float32": - header_file.write(f"float {tensor_name}[] = ") - else: - raise ValueError("Data type not expected.") - - header_file.write("{") - for i in np.ndindex(npy_data.shape): - header_file.write(f"{npy_data[i]}, ") - header_file.write("};\n\n") - - header_file_bytes = bytes(header_file.getvalue(), "utf-8") - raw_path = pathlib.Path(output_path) / f"{tensor_name}.h" - ti = tarfile.TarInfo(name=str(raw_path)) - ti.size = len(header_file_bytes) - ti.mode = 0o644 - ti.type = tarfile.REGTYPE - tar_file.addfile(ti, io.BytesIO(header_file_bytes)) - - # TODO move CMSIS integration to microtvm_api_server.py # see https://discuss.tvm.apache.org/t/tvm-capturing-dependent-libraries-of-code-generated-tir-initially-for-use-in-model-library-format/11080 def loadCMSIS(temp_dir): diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py index 4e3db220e0b4..b033f1ca8457 100644 --- a/tests/scripts/request_hook/request_hook.py +++ b/tests/scripts/request_hook/request_hook.py @@ -208,6 +208,7 @@ "https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5", "https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels.h5": f"{BASE}/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels.h5", 
"https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz": f"{BASE}/tensorflow/tf-keras-datasets/mnist.npz", + "https://github.com/mlcommons/tiny/raw/bceb91c5ad2e2deb295547d81505721d3a87d578/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite": f"{BASE}/mlcommons/tiny/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite", } diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index 6153cdf82392..0b43c9c1fa8f 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -51,6 +51,13 @@ python3 gallery/how_to/work_with_microtvm/micro_aot.py python3 gallery/how_to/work_with_microtvm/micro_pytorch.py ./gallery/how_to/work_with_microtvm/micro_tvmc.sh +# without CMSIS-NN +python3 gallery/how_to/work_with_microtvm/micro_mlperftiny.py +# with CMSIS-NN +export TVM_USE_CMSIS=1 +python3 gallery/how_to/work_with_microtvm/micro_mlperftiny.py +export TVM_USE_CMSIS= + # Tutorials running with Zephyr export TVM_MICRO_USE_HW=1 export TVM_MICRO_BOARD=qemu_x86 From d4aedac4751631c40cb2d5b509362480ed833617 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Thu, 19 Jan 2023 18:27:26 -0800 Subject: [PATCH 200/286] [TVMScript] `T.match_buffer` syntax sugar in arguments for TVMScript printer (#13801) This PR implements the syntax sugar of `T.match_buffer` for new TVMScript printer. This syntax sugar will replace the `T.handle` in `T.prim_func` arguments, with matched simple buffer. For example, it will change ```python @T.prim_func def func(a: T.handle, b: T.handle, c: T.handle): A = T.match_buffer(a, [128], dtype="float32") B = T.match_buffer(b, [128, 128], dtype="int32") C = T.match_buffer(c, [128, 128, 128], dtype="uint8") ``` into ```python @T.prim_func def main(A: T.Buffer[(128,)], B: T.Buffer[(128, 128), "int32"], C: T.Buffer[(128, 128, 128), "uint8"]): T.evaluate(0) ``` Co-authored-by: Junru Shao --- src/script/printer/tir/buffer.cc | 8 ++ src/script/printer/tir/function.cc | 105 ++++++++++++++++++ src/script/printer/tir/utils.h | 11 ++ .../unittest/test_tvmscript_printer_tir.py | 52 ++++++++- 4 files changed, 174 insertions(+), 2 deletions(-) diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc index 5400328fe219..126a6e58273f 100644 --- a/src/script/printer/tir/buffer.cc +++ b/src/script/printer/tir/buffer.cc @@ -126,6 +126,14 @@ ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array< /*args=*/args); } +ExprDoc BufferAttn(const tir::Buffer& buffer, const ObjectPath& p, const Frame& frame, + const IRDocsifier& d) { + Map attrs = BufferAttrs(buffer, p, frame, d); + ExprDoc shape = attrs.Get("shape").value(); + ExprDoc dtype = attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype)); + return TIR("Buffer")->Call({shape, dtype}, {}, {}); +} + Array BufferIndices(const Array& indices, const ObjectPath& p, const IRDocsifier& d) { int n = indices.size(); diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index f0f84e81d57c..6094eefb65b1 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -16,6 +16,9 @@ * specific language governing permissions and limitations * under the License. 
*/ +#include +#include + #include "./utils.h" namespace tvm { @@ -34,16 +37,115 @@ String FindFunctionName(const IRDocsifier& d, const tir::PrimFunc& f) { return "main"; } +bool IsSimpleBuffer(const tir::Buffer& buf) { + if (!buf->strides.empty()) { + return false; + } + for (const PrimExpr& shp_i : buf->shape) { + if (!tir::UndefinedVars(shp_i).empty()) { + return false; + } + } + for (const PrimExpr& stride_i : buf->strides) { + if (!tir::UndefinedVars(stride_i).empty()) { + return false; + } + } + if (!tir::UndefinedVars(buf->elem_offset).empty()) { + return false; + } else if (buf->elem_offset->IsInstance()) { + IntImm elem_offset = Downcast(buf->elem_offset); + if (elem_offset->value != 0) { + return false; + } + } + return buf.scope() == "global" && buf->data_alignment == runtime::kAllocAlignment && + buf->offset_factor == 1 && buf->buffer_type == tir::BufferType::kDefault && + !buf->axis_separators.size(); +} + +int CountVarOccurrence(const tir::PrimFunc& f, const tir::Var& v) { + class OccurrenceCounter : public tir::StmtExprVisitor { + public: + int count = 0; + const tir::VarNode* v = nullptr; + + void VisitExpr_(const tir::VarNode* op) final { + if (op == v) { + ++count; + } + tir::StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const tir::BufferStoreNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const tir::BufferLoadNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const tir::DeclBufferNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitStmt_(op); + } + + void VisitBuffer(const tir::BufferNode* buffer) { + VisitExpr(buffer->data); + for (const PrimExpr& shape_i : buffer->shape) { + VisitExpr(shape_i); + } + for (const PrimExpr& stride_i : buffer->strides) { + VisitExpr(stride_i); + } + VisitExpr(buffer->elem_offset); + } + }; + + OccurrenceCounter counter; + counter.v = v.get(); + counter(f->body); + for (const tir::Var& v : f->params) { + counter(v); + } + for (const auto& pair : f->buffer_map) { + counter(pair.first); + counter.VisitBuffer(pair.second.get()); + } + return counter.count; +} + TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::PrimFunc func, ObjectPath p, IRDocsifier d) -> Doc { With frame(MakeDispatchFrame(d, func, func)); int n_args = func->params.size(); + std::unordered_map buffer_data_counter; + for (const auto& pair : func->buffer_map) { + const tir::VarNode* data_var = pair.second->data.get(); + if (!buffer_data_counter.count(data_var)) { + buffer_data_counter.insert({data_var, 0}); + } + ++buffer_data_counter.at(data_var); + } // Step 1. 
Handle `func->params` Array args; args.reserve(n_args); + std::unordered_set buffer_inlined; for (int i = 0; i < n_args; ++i) { tir::Var var = func->params[i]; ObjectPath var_p = p->Attr("params")->ArrayIndex(i); + if (CountVarOccurrence(func, var) == 2 && func->buffer_map.count(var)) { + tir::Buffer buffer = func->buffer_map[var]; + if (IsSimpleBuffer(buffer) && buffer_data_counter.at(buffer->data.get()) == 1) { + ObjectPath buffer_p = p->Attr("buffer_map")->MapValue(var); + args.push_back(AssignDoc(DefineBuffer(buffer, *frame, d), NullOpt, + BufferAttn(buffer, buffer_p, *frame, d))); + buffer_inlined.insert(buffer.get()); + continue; + } + } ExprDoc a = d->AsDoc(var->type_annotation, var_p->Attr("type_annotation")); args.push_back(AssignDoc(DefineVar(var, *frame, d), NullOpt, a)); } @@ -58,6 +160,9 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) tir::Var param = func->params[i]; if (func->buffer_map.count(param)) { tir::Buffer buffer = func->buffer_map[param]; + if (buffer_inlined.count(buffer.get())) { + continue; + } ExprDoc param = args[i]->lhs; ObjectPath buffer_p = p->Attr("buffer_map")->MapValue(param); ExprDoc lhs = diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h index 047513dcb316..183400d974ca 100644 --- a/src/script/printer/tir/utils.h +++ b/src/script/printer/tir/utils.h @@ -209,6 +209,17 @@ inline void ReprPrintTIR(const ObjectRef& obj, ReprPrinter* p) { ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array& args, const ObjectPath& p, const Frame& frame, const IRDocsifier& d); +/*! + * \brief Declare and define a buffer as annotation + * \param buffer The buffer to be defined + * \param p The object path + * \param f The frame + * \param d The IRDocsifier + * \return The ExprDoc corresponding to the buffer declaration + */ +ExprDoc BufferAttn(const tir::Buffer& buffer, const ObjectPath& p, const Frame& frame, + const IRDocsifier& d); + } // namespace printer } // namespace script } // namespace tvm diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index d62a1cd12c28..201428b74c66 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -57,10 +57,56 @@ def test_prim_func(): func, expected=""" @T.prim_func +def main(A: T.Buffer((128, 128), "float32"), B: T.Buffer((256, 256), "float32")): + T.evaluate(0)""", + ) + + +def test_prim_func_no_sugar_inlined_buffer(): + a = tir.Var("a", "handle") + b = tir.Var("b", "handle") + func = tir.PrimFunc( + params=[a, b], + ret_type=None, + buffer_map={ + a: tir.decl_buffer(shape=[128, 128], dtype="float32", name="A"), + b: tir.decl_buffer(shape=[256, 256], dtype="float32", name="B"), + }, + body=tir.Evaluate(a), + ) + _assert_print( + func, + expected=""" +@T.prim_func +def main(a: T.handle, B: T.Buffer((256, 256), "float32")): + A = T.match_buffer(a, (128, 128)) + T.evaluate(a) +""", + ) + + +def test_prim_func_no_sugar_shared_buffer_data(): + a = tir.Var("a", "handle") + b = tir.Var("b", "handle") + buffer_data = tir.decl_buffer(shape=[128, 128], dtype="float32", name="A").data + func = tir.PrimFunc( + params=[a, b], + ret_type=None, + buffer_map={ + a: tir.decl_buffer(shape=[128, 128], dtype="float32", name="A", data=buffer_data), + b: tir.decl_buffer(shape=[256, 256], dtype="float32", name="B", data=buffer_data), + }, + body=tir.Evaluate(0), + ) + _assert_print( + func, + expected=""" +@T.prim_func def main(a: T.handle, b: T.handle): A = 
T.match_buffer(a, (128, 128))
-    B = T.match_buffer(b, (256, 256))
-    T.evaluate(0)""",
+    B = T.match_buffer(b, (256, 256), data=A.data)
+    T.evaluate(0)
+""",
     )


@@ -641,6 +687,8 @@ def main():
 if __name__ == "__main__":
     test_prim_func()
+    test_prim_func_no_sugar_inlined_buffer()
+    test_prim_func_no_sugar_shared_buffer_data()
     test_block_realize()
     test_block()
     test_buffer()

From 693f92a67f687053284b4d6bbfd0fdae32686c87 Mon Sep 17 00:00:00 2001
From: multiverstack <39256082+multiverstack-intellif@users.noreply.github.com>
Date: Fri, 20 Jan 2023 16:23:51 +0800
Subject: [PATCH 201/286] [TIR][Schedule] Improve cache_index to cache common
 subexpressions (#13700)

Much of the index computation is duplicated, so this PR performs common
subexpression analysis when applying cache_index. A threshold is needed
when identifying a common subexpression, since we only want to cache the
most frequently appearing subexpressions. An arith helper function is
added to analyze the target expression.

A common use case is to call tvm.arith.detect_common_subexpr() and
decide, based on its result, whether to perform cache_index and with
which threshold.

Co-authored-by: Min Chen
---
 include/tvm/tir/schedule/schedule.h           |   6 +-
 python/tvm/tir/schedule/schedule.py           |  21 +-
 src/tir/schedule/concrete_schedule.cc         |   5 +-
 src/tir/schedule/concrete_schedule.h          |   3 +-
 src/tir/schedule/primitive.h                  |   5 +-
 src/tir/schedule/primitive/cache_index.cc     | 147 ++++---
 src/tir/schedule/traced_schedule.cc           |   7 +-
 src/tir/schedule/traced_schedule.h            |   3 +-
 .../unittest/test_tir_schedule_cache_index.py | 402 +++++++++++++++++-
 9 files changed, 526 insertions(+), 73 deletions(-)

diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h
index 8b22c173a3d8..288601d1cccc 100644
--- a/include/tvm/tir/schedule/schedule.h
+++ b/include/tvm/tir/schedule/schedule.h
@@ -419,10 +419,12 @@ class ScheduleNode : public runtime::Object {
    * \brief Create a block to cache precomputed index for later use.
    * if there is no index computation, keep unchanged.
    * \param block_rv The target block
-   * \param buffer_index The index of the target buffer in block's read region
+   * \param storage_scope The storage scope of the cached block
+   * \param cse_thresh The repeat threshold that determines a common sub expr
    * \return The cache stage blocks.
    */
-  virtual Array<BlockRV> CacheIndex(const BlockRV& block_rv, int buffer_index) = 0;
+  virtual Array<BlockRV> CacheIndex(const BlockRV& block_rv, const String& storage_scope,
+                                    int cse_thresh) = 0;
   /*!
    * \brief Create a block that read/write a buffer region into a read/write cache with reindexing.
    * The layout of the cache will be the same as by the iterators of the block that reads/writes the
diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py
index 64aba0e029fe..6a71e5872fcd 100644
--- a/python/tvm/tir/schedule/schedule.py
+++ b/python/tvm/tir/schedule/schedule.py
@@ -1294,7 +1294,10 @@ def cache_inplace(data_io: T.Buffer[64, "int32"]) -> None:

     @type_checked
     def cache_index(
-        self, block: Union[BlockRV, str], buffer_index: Union[int, str, Buffer]
+        self,
+        block: Union[BlockRV, str],
+        storage_scope: str,
+        cse_thresh: int = 0,
     ) -> List[BlockRV]:
         """Create a block to cache precomputed index for later use.
         if there is no index computation, keep unchanged.

         Parameters
         ----------
         block : Union[BlockRV, str]
             The target block operates on the target buffer.
- buffer_index: int - The index of the target buffer in block's read region + storage_scope: str + The storage scope of cached block. + + cse_thresh: int + The repeat threshold that determines a common sub expr, + default 0 means cache all index computation. Returns @@ -1334,7 +1341,7 @@ def resize(a: T.handle, b: T.handle) -> None: sch = tir.Schedule(resize) block_a = sch.get_block("A") - sch.cache_index(block_a, 0) + sch.cache_index(block_a, "global", 1) print(sch.mod["main"].script()) After applying cache_index, the IR becomes: @@ -1370,12 +1377,8 @@ def resize_cache_index( """ block = self._normalize_block_arg(block) - if not isinstance(buffer_index, int): - _, buffer_index, _ = self._normalize_buffer_arg( - block, buffer_index, required_buffer_type="read" - ) return _ffi_api.ScheduleCacheIndex( # type: ignore # pylint: disable=no-member - self, block, buffer_index + self, block, storage_scope, cse_thresh ) @type_checked diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 163c72eb0777..91ca0f141766 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -581,10 +581,11 @@ Array ConcreteScheduleNode::CacheInplace(const BlockRV& block_rv, int w return return_blocks; } -Array ConcreteScheduleNode::CacheIndex(const BlockRV& block_rv, int buffer_index) { +Array ConcreteScheduleNode::CacheIndex(const BlockRV& block_rv, + const String& storage_scope, int cse_thresh) { Array result; TVM_TIR_SCHEDULE_BEGIN(); - result = tir::CacheIndex(state_, this->GetSRef(block_rv), buffer_index); + result = tir::CacheIndex(state_, this->GetSRef(block_rv), storage_scope, cse_thresh); TVM_TIR_SCHEDULE_END("cache-index", this->error_render_level_); this->state_->DebugVerify(); Array return_blocks; diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 899775f2a15d..95d5fe9c2e44 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -118,7 +118,8 @@ class ConcreteScheduleNode : public ScheduleNode { const Array consumer_blocks = {}) override; Array CacheInplace(const BlockRV& block_rv, int read_buffer_index, const String& storage_scope) override; - Array CacheIndex(const BlockRV& block_rv, int write_buffer_index) override; + Array CacheIndex(const BlockRV& block_rv, const String& storage_scope, + int cse_thresh) override; BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type) override; /******** Schedule: Compute location ********/ diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 9e7f77f55ea5..dbc4e235965c 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -285,11 +285,12 @@ TVM_DLL Array CacheInplace(ScheduleState self, const StmtSRef& block_s * \brief Create a block to cache precomputed index for later use. * if there is no index computation, keep unchanged. * \param block_sref The target block - * \param buffer_index The index of the target buffer in block's read region, + * \param storage_scope The storage scope of cached block + * \param cse_thresh The repeat threshold that determines a common sub expr * \return The cache stage block. */ TVM_DLL Array CacheIndex(ScheduleState self, const StmtSRef& block_sref, - int buffer_index); + const String& storage_scope, int cse_thresh); /*! *! * \brief Create a block that read/write a buffer region into a read/write cache with reindexing. 
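For reference, the reworked primitive can be driven end to end from Python. The sketch
below is a minimal usage example; it mirrors the docstring above, and the `resize`
workload is illustrative (its exact indexing only loosely follows the docstring's
example), so treat it as a sketch rather than the canonical test case:

```python
import tvm
from tvm import tir
from tvm.script import tir as T


@T.prim_func
def resize(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (1, 3, 40, 40), "float32")
    B = T.match_buffer(b, (1, 3, 80, 80), "float32")
    for i0, i1, i2, i3 in T.grid(1, 3, 80, 80):
        with T.block("A"):
            n, c, vi, vj = T.axis.remap("SSSS", [i0, i1, i2, i3])
            # Repeated index subexpressions are what cache_index precomputes.
            B[n, c, vi, vj] = A[n, c, vi // 4 + vj // 4, vj // 2]


sch = tir.Schedule(resize)
block_a = sch.get_block("A")
# storage_scope="global", cse_thresh=1: cache index subexpressions that repeat
# at least once; cse_thresh=0 would cache every index computation instead.
sch.cache_index(block_a, "global", 1)
print(sch.mod["main"].script())
```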
diff --git a/src/tir/schedule/primitive/cache_index.cc b/src/tir/schedule/primitive/cache_index.cc index c6f845541dd2..0316feefd5de 100644 --- a/src/tir/schedule/primitive/cache_index.cc +++ b/src/tir/schedule/primitive/cache_index.cc @@ -18,6 +18,8 @@ */ #include +#include "../../transforms/common_subexpr_elim_tools.h" +#include "../../transforms/replace_selected_expr.h" #include "../utils.h" namespace tvm { @@ -27,8 +29,10 @@ namespace tir { /*! \brief The auxiliary info used for the insertion point and content of the cache stage. */ struct IndexInfo { - /*! \brief The target buffer to cache the index. */ - Buffer target_buffer; + /*! \brief The target block to perform cache_index */ + StmtSRef target_block; + /*! \brief Record the common subexpr extract threshold */ + size_t cse_thresh; /*! \brief The cache buffer to store the precomputed index */ std::vector cache_buffer; /*! \brief The expr to be precomputed */ @@ -74,9 +78,8 @@ class IndexInfoCollector : public StmtExprVisitor { */ static void Collect(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_sref, IndexInfo* info) { - IndexInfoCollector collector(self, block_sref, scope_sref, info->target_buffer); + IndexInfoCollector collector(self, block_sref, scope_sref, info->cse_thresh); collector(GetRef(scope_sref->stmt)); - // info->loc_sref = collector.loc_sref_; info->loc_pos = collector.loc_pos_; info->index_exprs = collector.exprs_; info->range_map = collector.range_map_; @@ -88,11 +91,11 @@ class IndexInfoCollector : public StmtExprVisitor { * \param self The state of the schedule * \param block_sref The sref of the target block of the buffer being applied cache_index * \param scope_sref The sref of the scope block of the target block - * \param buffer The target buffer to cache the indexs + * \param cse_thresh The repeat threshold that determines a common subexpr */ IndexInfoCollector(const ScheduleState self, const StmtSRef& block_sref, - const StmtSRef& scope_sref, const Buffer& buffer) - : self_(self), block_sref_(block_sref), scope_sref_(scope_sref), buffer_(buffer) {} + const StmtSRef& scope_sref, int cse_thresh) + : self_(self), block_sref_(block_sref), scope_sref_(scope_sref), cse_thresh_(cse_thresh) {} void VisitStmt_(const SeqStmtNode* seq_stmt) final { for (size_t i = 0; i < seq_stmt->size(); ++i) { @@ -110,8 +113,9 @@ class IndexInfoCollector : public StmtExprVisitor { } void VisitStmt_(const BlockNode* block) final { - // Only visit the target's parent block + visiting_target_block = static_cast(block_sref_->stmt == block); StmtVisitor::VisitStmt_(block); + visiting_target_block = false; if (block == scope_sref_->stmt) { // The block vistied is the current parent scope // Handling cases when no SeqStmt in the scope @@ -136,15 +140,56 @@ class IndexInfoCollector : public StmtExprVisitor { } } - void VisitExpr_(const BufferLoadNode* load) final { - if (load->buffer.same_as(buffer_)) { - for (const PrimExpr& it : load->indices) { - if (!it->IsInstance()) { - exprs_.push_back(it); + void VisitStmt_(const BufferStoreNode* store) final { + // Only analyze the cache candidate for stores in target block + if (visiting_target_block) { + auto IsEligibleComputation = [](const PrimExpr& expr) { + return (SideEffect(expr) <= CallEffectKind::kPure && CalculateExprComplexity(expr) > 1 && + (expr.as() == nullptr) && (expr.as() == nullptr)); + }; + + // Analyze sub expr candidates + ComputationTable table_syntactic_comp_done_by_stmt = + ComputationsDoneBy::GetComputationsDoneBy(GetRef(store), 
IsEligibleComputation, + [](const PrimExpr& expr) { return true; }); + std::vector> semantic_comp_done_by_stmt = + SyntacticToSemanticComputations(table_syntactic_comp_done_by_stmt, true); + + // Analyze the sub expr of a candidate whose repeat time is under cse_thresh_ + for (size_t i = 0; i < semantic_comp_done_by_stmt.size(); i++) { + std::pair& computation_and_nb = semantic_comp_done_by_stmt[i]; + if (computation_and_nb.second < cse_thresh_) { + std::vector direct_subexprs = DirectSubexpr::GetDirectSubexpressions( + computation_and_nb.first, IsEligibleComputation, + [](const PrimExpr& expr) { return true; }); + InsertVectorToSortedSemanticComputations(&semantic_comp_done_by_stmt, direct_subexprs, + true, computation_and_nb.second); } } + + // Record the final sub expr with repeat time greater than cse_thresh_ + // In order to make the result stable, sort it by post order and then by complexity + PostOrderVisit(store->value, [&semantic_comp_done_by_stmt, this](const ObjectRef& node) { + if (node->IsInstance()) { + PrimExpr this_expr = Downcast(node); + for (auto& it : semantic_comp_done_by_stmt) { + if (it.second >= this->cse_thresh_ && EquivalentTerms(this_expr, it.first, true)) { + auto find_result = + std::find_if(this->exprs_.begin(), this->exprs_.end(), + [&](PrimExpr expr) { return expr.get() == it.first.get(); }); + if (find_result == this->exprs_.end()) { + this->exprs_.push_back(it.first); + } + } + } + } + }); + auto cmp = [&](const PrimExpr& lhs, const PrimExpr& rhs) -> bool { + return CalculateExprComplexity(lhs) > CalculateExprComplexity(rhs); + }; + std::stable_sort(exprs_.begin(), exprs_.end(), cmp); } - ExprVisitor::VisitExpr_(load); + StmtVisitor::VisitStmt_(store); } /*! \brief The schedule class */ @@ -153,12 +198,14 @@ class IndexInfoCollector : public StmtExprVisitor { const StmtSRef& block_sref_; /*! \brief The parent scope of the target block */ const StmtSRef& scope_sref_; - /*! \brief The target buffer to cache the index */ - const Buffer& buffer_; + /*! \brief Record the common subexpr extract threshold */ + size_t cse_thresh_; /*! \brief The calculation expr to be precomputed */ std::vector exprs_; /*! \brief The flag whether we have visited the target block */ bool visited_block_{false}; + /*! \brief The flag indicating currently visiting target block */ + bool visiting_target_block{false}; /*! \brief The index to insert the cache_index stage */ int loc_pos_{-1}; /*! \brief The flag indicating the right scope to update seq pos */ @@ -169,17 +216,15 @@ class IndexInfoCollector : public StmtExprVisitor { /*! * \brief Create a loop nest that writes precomputed index into index buffer. - * \param cache_region The cached copy region. * \param info The cache stage information, which will be updated in the function. * \param storage_scope The storage scope of the cached buffer (only used in naming here) * \returns A block indicating the body of the loop nesting. */ -Array MakeIndexCacheStage(IndexInfo* info) { +Array MakeIndexCacheStage(IndexInfo* info, const String& storage_scope) { Array blocks; Array bodies; bodies.reserve(info->index_exprs.size()); info->cache_buffer.reserve(info->index_exprs.size()); - const String& storage_scope = info->target_buffer.scope(); // For each index calculation, create a block to pre-compute. 
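  // Each iteration below materializes one cached expression: it rebuilds the
  // loop nest over the vars that the expression depends on, types the cache
  // buffer directly from the expression's dtype, allocates an
  // "index_var_<expr_index>" buffer in the requested storage scope, and emits
  // a block that precomputes the value into that buffer.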
for (size_t expr_index = 0; expr_index < info->index_exprs.size(); expr_index++) { @@ -214,10 +259,7 @@ Array MakeIndexCacheStage(IndexInfo* info) { }); } - // Inference the shape and create cache buffer - arith::IntSet val_range = - arith::EvalSet(Substitute(index_expr, info->var_binding), arith::AsIntSet(info->range_map)); - DataType data_type = DetermineDatatype(val_range); + DataType data_type = index_expr.dtype(); Var index_buffer_var("index_var_" + std::to_string(expr_index), PointerType(PrimType(data_type), storage_scope)); Array buffer_shape; @@ -346,7 +388,9 @@ class CacheIndexRewriter : public StmtExprMutator { Stmt VisitStmt_(const BlockNode* block) final { Block old_stmt = GetRef(block); // Mutate the body + visiting_target_block = static_cast(block == info_->target_block->stmt); Block stmt = Downcast(StmtMutator::VisitStmt_(block)); + visiting_target_block = false; // Check if it is the block corresponding to the parent scope if (block == scope_sref_->stmt) { @@ -362,24 +406,23 @@ class CacheIndexRewriter : public StmtExprMutator { return std::move(stmt); } - PrimExpr VisitExpr_(const BufferLoadNode* load) final { - if (load->buffer.same_as(info_->target_buffer)) { - // Rewrite the target buffer load - Array new_indices; - for (const PrimExpr& index : load->indices) { - auto it = std::find_if(info_->index_exprs.begin(), info_->index_exprs.end(), - [&](PrimExpr& e) { return e.get() == index.get(); }); - if (it == info_->index_exprs.end()) { - new_indices.push_back(index); - } else { - // Replace load index with cached index - auto offset = std::distance(info_->index_exprs.begin(), it); - new_indices.push_back(BufferLoad(info_->cache_buffer[offset], cache_indices_[offset])); - } + Stmt VisitStmt_(const BufferStoreNode* store) final { + Stmt ret_stmt = StmtMutator::VisitStmt_(store); + // Replace common sub expr for target block, with cached buffer load + if (visiting_target_block) { + for (size_t i = 0; i < info_->index_exprs.size(); i++) { + PrimExpr& computation = info_->index_exprs[i]; + std::function predicate_selector = + [computation](const PrimExpr& current_expr) { + return (EquivalentTerms(current_expr, computation, true)); + }; + BufferLoad load = BufferLoad(info_->cache_buffer[i], cache_indices_[i]); + ret_stmt = ReplaceSelectedExpr::ReplaceSelectedExprInStmt( + ret_stmt, predicate_selector, std::move(load), + [](const PrimExpr& expr) { return true; }); } - return BufferLoad(load->buffer, new_indices); } - return ExprMutator::VisitExpr_(load); + return ret_stmt; } PrimExpr VisitExpr_(const LoadNode* op) final { @@ -393,9 +436,12 @@ class CacheIndexRewriter : public StmtExprMutator { IndexInfo* info_; /*! \brief The indices for the cache buffer */ std::vector> cache_indices_; + /*! \brief Indicating whether cache stage is inserted, only do index replacement afterwards*/ + bool visiting_target_block{false}; }; -Array CacheIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index) { +Array CacheIndex(ScheduleState self, const StmtSRef& block_sref, + const String& storage_scope, int cse_thresh) { /*! * Check: * - The index is in the array of block reading region @@ -407,9 +453,9 @@ Array CacheIndex(ScheduleState self, const StmtSRef& block_sref, int b // Step 0. 
Checking index, getting the target buffer and the parent scope IndexInfo info; - const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); - info.target_buffer = - GetNthAccessBuffer(self, GetRef(block), buffer_index, BufferIndexType::kRead); + info.target_block = block_sref; + CHECK_GE(cse_thresh, 0) << "cse_thresh should not be negative number"; + info.cse_thresh = cse_thresh; StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); // Step 1. Collect the indexing info of target buffer. @@ -418,7 +464,7 @@ Array CacheIndex(ScheduleState self, const StmtSRef& block_sref, int b // Step 2. Create cache stages and rewrite the stmt. BlockRealize realize = GetBlockRealize(self, block_sref); info.var_binding = GetBindings(realize); - Array cache_stages = MakeIndexCacheStage(&info); + Array cache_stages = MakeIndexCacheStage(&info, storage_scope); Stmt new_scope = CacheIndexRewriter::Rewrite(/*scope_sref=*/scope_sref, /*info=*/&info); bool old_stage_pipeline = self->block_info[block_sref].scope->stage_pipeline; @@ -458,17 +504,20 @@ struct CacheIndexTraits : public UnpackedInstTraits { private: static constexpr size_t kNumInputs = 1; - static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumAttrs = 2; static constexpr size_t kNumDecisions = 0; - static Array UnpackedApplyToSchedule(Schedule sch, BlockRV block, Integer buffer_index) { - return sch->CacheIndex(block, buffer_index->value); + static Array UnpackedApplyToSchedule(Schedule sch, BlockRV block, String storage_scope, + Integer cse_thresh) { + return sch->CacheIndex(block, storage_scope, cse_thresh->value); } - static String UnpackedAsPython(Array outputs, String block, Integer buffer_index) { + static String UnpackedAsPython(Array outputs, String block, String storage_scope, + Integer cse_thresh) { PythonAPICall py("cache_index"); py.Input("block", block); - py.Input("buffer_index", buffer_index->value); + py.Input("storage_scope", storage_scope); + py.Input("cse_thresh", cse_thresh->value); py.OutputList(outputs); return py.Str(); } diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 70559608e789..3dc78074fcd6 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -325,8 +325,9 @@ Array TracedScheduleNode::CacheInplace(const BlockRV& block_rv, int rea return result; } -Array TracedScheduleNode::CacheIndex(const BlockRV& block_rv, int buffer_index) { - Array result = ConcreteScheduleNode::CacheIndex(block_rv, buffer_index); +Array TracedScheduleNode::CacheIndex(const BlockRV& block_rv, const String& storage_scope, + int cse_thresh) { + Array result = ConcreteScheduleNode::CacheIndex(block_rv, storage_scope, cse_thresh); Array outputs; for (const BlockRV& r : result) { outputs.push_back(r); @@ -334,7 +335,7 @@ Array TracedScheduleNode::CacheIndex(const BlockRV& block_rv, int buffe static const InstructionKind& kind = InstructionKind::Get("CacheIndex"); trace_->Append(/*inst=*/Instruction(/*kind=*/kind, /*inputs=*/{block_rv}, - /*attrs=*/{Integer(buffer_index)}, + /*attrs=*/{storage_scope, Integer(cse_thresh)}, /*outputs=*/outputs)); return result; } diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index c54574e9c9ff..ee65c721ad9f 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -80,7 +80,8 @@ class TracedScheduleNode : public ConcreteScheduleNode { const String& storage_scope) final; BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, 
BufferIndexType buffer_index_type) final; - Array CacheIndex(const BlockRV& block_rv, int buffer_index) final; + Array CacheIndex(const BlockRV& block_rv, const String& storage_scope, + int cse_thresh) final; /******** Schedule: Compute location ********/ void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, int index = -1) final; diff --git a/tests/python/unittest/test_tir_schedule_cache_index.py b/tests/python/unittest/test_tir_schedule_cache_index.py index 0c2882d1b617..d446249e018e 100644 --- a/tests/python/unittest/test_tir_schedule_cache_index.py +++ b/tests/python/unittest/test_tir_schedule_cache_index.py @@ -47,8 +47,7 @@ def resize_cache_index( index_var_1 = T.alloc_buffer([80], dtype="int32", strides=[1]) for ax0, ax1 in T.grid(80, 80): with T.block("index_0"): - v0 = T.axis.spatial(80, ax0) - v1 = T.axis.spatial(80, ax1) + v0, v1 = T.axis.remap("SS", [ax0, ax1]) T.reads() T.writes(index_var_0[v0, v1]) index_var_0[v0, v1] = v0 // 4 + v1 // 4 @@ -66,13 +65,408 @@ def resize_cache_index( B[n, c, vi, vj] = A[n, c, index_var_0[vi, vj], index_var_1[vj]] -def test_inplace_cache_read(): +@T.prim_func +def bilinear_resize( + x: T.Buffer[(1, 3, 40, 40), "float16"], resize: T.Buffer[(1, 3, 80, 80), "float16"] +): + for i0, i1, i2, i3 in T.grid(1, 3, 80, 80): + with T.block("resize"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(x[i0_1, i1_1, 0:40, 0:40]) + T.writes(resize[i0_1, i1_1, i2_1, i3_1]) + resize[i0_1, i1_1, i2_1, i3_1] = T.Cast( + "float16", + ( + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + 39, + ), + 0, + ), + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + 39, + ), + 0, + ), + ], + ) + * ( + T.float32(1) + - ( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + ) + + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + 39, + ), + 0, + ), + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ) + + 1, + 39, + ), + 0, + ), + ], + ) + * ( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + ) + * ( + T.float32(1) + - ( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + ) + + ( + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ) + + 1, + 39, + ), + 0, + ), + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", 
i3_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + 39, + ), + 0, + ), + ], + ) + * ( + T.float32(1) + - ( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + ) + + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ) + + 1, + 39, + ), + 0, + ), + T.max( + T.min( + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) + * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ) + + 1, + 39, + ), + 0, + ), + ], + ) + * ( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i3_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + ) + * ( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ), + ) + + +@T.prim_func +def cached_bilinear_resize( + x: T.Buffer[(1, 3, 40, 40), "float16"], resize: T.Buffer[(1, 3, 80, 80), "float16"] +): + index_var_0 = T.alloc_buffer([80], dtype="float32", strides=[1]) + index_var_1 = T.alloc_buffer([80], dtype="int32", strides=[1]) + index_var_2 = T.alloc_buffer([80], dtype="int32", strides=[1]) + for ax0 in T.serial(80): + with T.block("index_0"): + v0 = T.axis.spatial(80, ax0) + T.reads() + T.writes(index_var_0[v0]) + index_var_0[v0] = ( + (T.Cast("float32", v0) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast( + "float32", + T.Cast( + "int32", + T.floor( + (T.Cast("float32", v0) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5), + dtype="float32", + ), + ), + ) + ) + for ax0 in T.serial(80): + with T.block("index_1"): + v0 = T.axis.spatial(80, ax0) + T.reads() + T.writes(index_var_1[v0]) + index_var_1[v0] = T.Cast( + "int32", + T.floor( + (T.Cast("float32", v0) + T.float32(0.5)) * T.float32(0.5) - T.float32(0.5), + dtype="float32", + ), + ) + for ax0 in T.serial(80): + with T.block("index_2"): + v0 = T.axis.spatial(80, ax0) + T.reads() + T.writes(index_var_2[v0]) + index_var_2[v0] = T.Cast( + "int32", + T.floor( + (T.Cast("float32", v0) + T.float32(0.5)) * T.float32(0.5) - T.float32(0.5), + dtype="float32", + ), + ) + for i0, i1, i2, i3 in T.grid(1, 3, 80, 80): + with T.block("resize"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(x[i0_1, i1_1, 0:40, 0:40]) + T.writes(resize[i0_1, i1_1, i2_1, i3_1]) + resize[i0_1, i1_1, i2_1, i3_1] = T.Cast( + "float16", + ( + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max(T.min(index_var_1[i2_1], 39), 0), + T.max(T.min(index_var_2[i3_1], 39), 0), + ], + ) + * (T.float32(1) - index_var_0[i3_1]) + + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max(T.min(index_var_1[i2_1], 39), 0), + T.max(T.min(index_var_2[i3_1] + 1, 39), 0), + ], + ) + * index_var_0[i3_1] + ) + * ( + T.float32(1) + - ( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast("float32", index_var_1[i2_1]) + ) + ) + + ( + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max(T.min(index_var_1[i2_1] + 1, 39), 0), + 
T.max(T.min(index_var_2[i3_1], 39), 0), + ], + ) + * (T.float32(1) - index_var_0[i3_1]) + + T.Cast( + "float32", + x[ + i0_1, + i1_1, + T.max(T.min(index_var_1[i2_1] + 1, 39), 0), + T.max(T.min(index_var_2[i3_1] + 1, 39), 0), + ], + ) + * index_var_0[i3_1] + ) + * ( + (T.Cast("float32", i2_1) + T.float32(0.5)) * T.float32(0.5) + - T.float32(0.5) + - T.Cast("float32", index_var_1[i2_1]) + ), + ) + + +def test_basic_cache_index(): sch = tvm.tir.Schedule(resize, debug_mask="all") block = sch.get_block("A") - sch.cache_index(block, 0) + sch.cache_index(block, "global") tvm.ir.assert_structural_equal(resize_cache_index, sch.mod["main"]) verify_trace_roundtrip(sch=sch, mod=resize) +def test_resize_bilinear_cache_index(): + sch = tvm.tir.Schedule(bilinear_resize, debug_mask="all") + block = sch.get_block("resize") + sch.cache_index(block, "global", 4) + tvm.ir.assert_structural_equal(sch.mod["main"], cached_bilinear_resize) + verify_trace_roundtrip(sch=sch, mod=bilinear_resize) + + if __name__ == "__main__": tvm.testing.main() From f570a8c70e717d3166b888a70edbb6694f96d635 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Fri, 20 Jan 2023 01:33:17 -0800 Subject: [PATCH 202/286] [TVMScript] `T.allocate` with `T.decl_buffer` syntax sugar for TVMScript printer (#13813) This PR implements the syntax sugar of `T.allocate` with `T.decl_buffer` for new TVMScript printer. This syntax sugar will skip the `T.allocate`, when its body is a matched `T.decl_buffer`, and the `Var` defined in `T.allocate` is only used in that `T.decl_buffer`. For example, it will change ```python buffer_data = T.allocate([128, 128]) buffer = T.decl_buffer([128, 128], data=buffer_data) ``` into ```python buffer = T.decl_buffer([128, 128]) ``` but keep the following `T.allocate` unchanged: ```python buffer_data = T.allocate([128, 128]) buffer_0 = T.decl_buffer([128, 128], data=buffer_data) buffer_1 = T.decl_buffer([128, 128], data=buffer_data) ``` and ```python buffer_data = T.allocate([128, 128]) buffer = T.decl_buffer([256, 256], data=buffer_data) ``` --- src/script/printer/tir/function.cc | 43 +-------- src/script/printer/tir/stmt.cc | 24 +++++ src/script/printer/tir/utils.h | 45 +++++++++ .../unittest/test_tvmscript_printer_tir.py | 92 ++++++++++--------- 4 files changed, 119 insertions(+), 85 deletions(-) diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index 6094eefb65b1..40957fcffaca 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -17,7 +17,6 @@ * under the License. 
*/ #include -#include #include "./utils.h" @@ -65,47 +64,7 @@ bool IsSimpleBuffer(const tir::Buffer& buf) { } int CountVarOccurrence(const tir::PrimFunc& f, const tir::Var& v) { - class OccurrenceCounter : public tir::StmtExprVisitor { - public: - int count = 0; - const tir::VarNode* v = nullptr; - - void VisitExpr_(const tir::VarNode* op) final { - if (op == v) { - ++count; - } - tir::StmtExprVisitor::VisitExpr_(op); - } - - void VisitStmt_(const tir::BufferStoreNode* op) final { - VisitBuffer(op->buffer.get()); - tir::StmtExprVisitor::VisitStmt_(op); - } - - void VisitExpr_(const tir::BufferLoadNode* op) final { - VisitBuffer(op->buffer.get()); - tir::StmtExprVisitor::VisitExpr_(op); - } - - void VisitStmt_(const tir::DeclBufferNode* op) final { - VisitBuffer(op->buffer.get()); - tir::StmtExprVisitor::VisitStmt_(op); - } - - void VisitBuffer(const tir::BufferNode* buffer) { - VisitExpr(buffer->data); - for (const PrimExpr& shape_i : buffer->shape) { - VisitExpr(shape_i); - } - for (const PrimExpr& stride_i : buffer->strides) { - VisitExpr(stride_i); - } - VisitExpr(buffer->elem_offset); - } - }; - - OccurrenceCounter counter; - counter.v = v.get(); + OccurrenceCounter counter(v.get()); counter(f->body); for (const tir::Var& v : f->params) { counter(v); diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index 7344cb4d98d5..57b4c695a4ee 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -152,10 +152,34 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) })); }); +bool IsAllocateDeclBufferPattern(const tir::AllocateNode* allocate) { + const tir::Var& buffer_var = allocate->buffer_var; + if (const tir::DeclBufferNode* decl_buffer = allocate->body.as()) { + const tir::Buffer& buffer = decl_buffer->buffer; + if (buffer_var.same_as(buffer->data) && allocate->dtype == buffer->dtype && + tir::is_one(allocate->condition) && !allocate->annotations.size() && + allocate->extents.size() == buffer->shape.size()) { + tir::ExprDeepEqual expr_equal; + for (size_t i = 0, n = allocate->extents.size(); i < n; ++i) { + if (!expr_equal(allocate->extents[i], buffer->shape[i])) { + return false; + } + } + return true; + } + } + return false; +} + TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::Allocate stmt, ObjectPath p, IRDocsifier d) -> Doc { bool concise = AllowConciseScoping(d); + OccurrenceCounter counter(stmt->buffer_var.get()); + counter(stmt->body); + if (counter.count == 1 && IsAllocateDeclBufferPattern(stmt.get())) { + return d->AsDoc(stmt->body, p->Attr("body")); + } String storage_scope = tir::GetPtrStorageScope(stmt->buffer_var); Array args; Array kwargs_keys; diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h index 183400d974ca..e1ffe135229e 100644 --- a/src/script/printer/tir/utils.h +++ b/src/script/printer/tir/utils.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,50 @@ ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array< ExprDoc BufferAttn(const tir::Buffer& buffer, const ObjectPath& p, const Frame& frame, const IRDocsifier& d); +/*! \brief A Var occurrence counter visitor */ +class OccurrenceCounter : public tir::StmtExprVisitor { + public: + /*! \brief The occurrence counter */ + int count = 0; + /*! 
\brief The Var to count occurrence */ + const tir::VarNode* v = nullptr; + + void VisitExpr_(const tir::VarNode* op) final { + if (op == v) { + ++count; + } + tir::StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const tir::BufferStoreNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitStmt_(op); + } + + void VisitExpr_(const tir::BufferLoadNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitExpr_(op); + } + + void VisitStmt_(const tir::DeclBufferNode* op) final { + VisitBuffer(op->buffer.get()); + tir::StmtExprVisitor::VisitStmt_(op); + } + + void VisitBuffer(const tir::BufferNode* buffer) { + VisitExpr(buffer->data); + for (const PrimExpr& shape_i : buffer->shape) { + VisitExpr(shape_i); + } + for (const PrimExpr& stride_i : buffer->strides) { + VisitExpr(stride_i); + } + VisitExpr(buffer->elem_offset); + } + + explicit OccurrenceCounter(const tir::VarNode* var) { v = var; } +}; + } // namespace printer } // namespace script } // namespace tvm diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index 201428b74c66..5d86a8860852 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -22,6 +22,7 @@ from tvm.script.ir_builder import IRBuilder from tvm.script.ir_builder import tir as T from tvm.script.printer import default +import tvm.testing @contextmanager @@ -327,6 +328,53 @@ def test_allocate(): ) +def test_allocate_with_decl_buffer_sugar(): + with IRBuilder() as ib: + with T.allocate([128, 128], "float32") as buffer_data: + with T.decl_buffer([128, 128], "float32", data=buffer_data) as buffer: + T.evaluate(0) + obj = ib.get() + _assert_print( + obj, + """ +with T.decl_buffer((128, 128)) as buffer: + T.evaluate(0) +""", + ) + + +def test_allocate_with_decl_buffer_no_sugar_multi_usage(): + with IRBuilder() as ib: + with T.allocate([128, 128], "float32") as buffer_data: + with T.decl_buffer([128, 128], "float32", data=buffer_data) as buffer: + T.evaluate(buffer_data) + obj = ib.get() + _assert_print( + obj, + """ +with T.allocate([128, 128], "float32", "global") as v: + buffer = T.decl_buffer((128, 128), data=v) + T.evaluate(v) +""", + ) + + +def test_allocate_with_decl_buffer_no_sugar_mismatch(): + with IRBuilder() as ib: + with T.allocate([128, 128], "float32") as buffer_data: + with T.decl_buffer([256, 256], "float32", data=buffer_data) as buffer: + T.evaluate(buffer_data) + obj = ib.get() + _assert_print( + obj, + """ +with T.allocate([128, 128], "float32", "global") as v: + buffer = T.decl_buffer((256, 256), data=v) + T.evaluate(v) +""", + ) + + def test_decl_buffer(): with IRBuilder() as ib: with T.decl_buffer((10, 10), data=T.ptr("float32")): @@ -686,46 +734,4 @@ def main(): if __name__ == "__main__": - test_prim_func() - test_prim_func_no_sugar_inlined_buffer() - test_prim_func_no_sugar_shared_buffer_data() - test_block_realize() - test_block() - test_buffer() - test_buffer_region() - test_buffer_load() - test_buffer_store() - test_match_buffer_region() - test_for() - test_let_stmt() - test_attr_stmt() - test_assert_stmt() - test_while() - test_allocate() - test_decl_buffer() - test_prefetch() - test_seq_stmt() - test_if_then_else() - test_evaluate() - test_buffer_realize() - test_var() - test_size_var() - test_iter_var() - test_string_imm() - test_cast() - test_binary_arith() - test_logical() - test_select() - test_ramp() - test_broadcast() - test_let_expr() - test_call() - 
test_comm_reducer()
-    test_any()
-    test_int_imm()
-    test_float_imm()
-    test_range()
-    test_prim_type()
-    test_pointer_type()
-    test_tuple_type()
-    test_remap()
+    tvm.testing.main()

From 25d278bb86fd9842f69b206ab4f1e1a960752f28 Mon Sep 17 00:00:00 2001
From: Abhikrant Sharma <63697863+abhikran-quic@users.noreply.github.com>
Date: Fri, 20 Jan 2023 16:13:47 +0530
Subject: [PATCH 203/286] [TESTING] Enable execution of
 test_packed_8x8x32_resnet50 (#13799)

* [TESTING] Enable execution of test_packed_8x8x32_resnet50

ResNet-50 is passing now. The changes mentioned below are also needed for
the test to pass:
- Add a condition for IndexMap
- Pass an empty dict when calling tune_conv2d_template to avoid a failure

* Fix unused variable error

* Remove unused variable and change check in if condition
---
 src/meta_schedule/database/database_utils.cc                    | 2 ++
 .../contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/meta_schedule/database/database_utils.cc b/src/meta_schedule/database/database_utils.cc
index 9e563c39d408..1754b304c904 100644
--- a/src/meta_schedule/database/database_utils.cc
+++ b/src/meta_schedule/database/database_utils.cc
@@ -77,6 +77,8 @@ void JSONDumps(ObjectRef json_obj, std::ostringstream& os) {
       JSONDumps(kv.second, os);
     }
     os << "}";
+  } else if (json_obj->IsInstance<tir::IndexMapNode>()) {
+    // Do nothing for index maps to start
  } else {
     LOG(FATAL) << "TypeError: Unsupported type in JSON object: " << json_obj->GetTypeKey();
   }
diff --git a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
index 1e01cb28a749..d985d2120936 100644
--- a/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
+++ b/tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py
@@ -410,7 +410,7 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):

     if do_tune:
         hexagon_lowered = tune_conv2d_template(
-            mod, _schedule_packed_8x8x32_conv2d, "packed_8x8x32", params, hexagon_launcher
+            mod, _schedule_packed_8x8x32_conv2d, "packed_8x8x32", params, hexagon_launcher, {}
        )
     else:
         with tvm.transform.PassContext(opt_level=3):

From cdd209526601069b2235efd33079f40dbb968fde Mon Sep 17 00:00:00 2001
From: Leandro Nunes
Date: Fri, 20 Jan 2023 16:24:06 +0000
Subject: [PATCH 204/286] [COMMUNITY] alanmacd -> Reviewer (#13814)

Please welcome @alanmacd as a new Reviewer in TVM. Alan has contributed to
microTVM and to improving CI on Windows and MacOS. Moreover, Alan has been
consistently doing code reviews on many patches related to microTVM.

- [Commits History](https://github.com/apache/tvm/commits?author=alanmacd)
- [Code Review](https://github.com/apache/tvm/pulls?utf8=%E2%9C%93&q=reviewed-by:alanmacd)
- [Discuss Forum](https://discuss.tvm.apache.org/u/alanmacd)
---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 18a4e13f511d..b04a36276b8f 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -144,6 +144,7 @@ We do encourage everyone to work anything they are interested in.
 - [Eric Lunderberg](https://github.com/Lunderberg): @Lunderberg
 - [Andrew Z.
Luo](https://github.com/AndrewZhaoLuo): @AndrewZhaoLuo - [Steven Lyubomirsky](https://github.com/slyubomirsky): @slyubomirsky +- [Alan MacDonald](https://github.com/alanmacd): @alanmacd - [Masahiro Masuda](https://github.com/masahi): @masahi - [Andrey Malyshev](https://github.com/elvin-n): @elvin-n - [Sergey Mironov](https://github.com/grwlf): @grwlf From c613b8947631e8765185897cd793dae67a823761 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 20 Jan 2023 14:19:39 -0800 Subject: [PATCH 205/286] [docker][microTVM]Update zephyr version to 3.2 and Zephyr SDK to 0.15.2 (#13806) This PR updates Zephyr version and Zephyr SDK version to the latest release only in docker build scripts. --- docker/install/ubuntu_init_zephyr_project.sh | 2 +- docker/install/ubuntu_install_zephyr.sh | 1 - docker/install/ubuntu_install_zephyr_sdk.sh | 14 +++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docker/install/ubuntu_init_zephyr_project.sh b/docker/install/ubuntu_init_zephyr_project.sh index c1cdcaecceb9..49073d07f064 100755 --- a/docker/install/ubuntu_init_zephyr_project.sh +++ b/docker/install/ubuntu_init_zephyr_project.sh @@ -51,7 +51,7 @@ if [ "$1" == "--branch" ]; then BRANCH=$1 shift else - BRANCH="v2.7-branch" + BRANCH="v3.2-branch" fi COMMIT= diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index 552ad2626029..6eaa5e5aa16c 100755 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -58,7 +58,6 @@ chmod -R o+w ${ZEPHYR_PROJECT_PATH} mkdir zephyr/.cache chmod o+rwx zephyr/.cache -#/opt/west/bin/pip3 install -r /opt/zephyrproject/zephyr/scripts/requirements.txt pip3 install -r /opt/zephyrproject/zephyr/scripts/requirements.txt # the requirements above overwrite junintparser with an older version, but it is not diff --git a/docker/install/ubuntu_install_zephyr_sdk.sh b/docker/install/ubuntu_install_zephyr_sdk.sh index 99ceec0a2956..228baf732120 100755 --- a/docker/install/ubuntu_install_zephyr_sdk.sh +++ b/docker/install/ubuntu_install_zephyr_sdk.sh @@ -42,10 +42,10 @@ fi INSTALLATION_PATH=$1 shift -ZEPHYR_SDK_VERSION=0.13.2 -ZEPHYR_SDK_FILE=zephyr-sdk-linux-setup.run -wget --no-verbose -O $ZEPHYR_SDK_FILE \ - https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v${ZEPHYR_SDK_VERSION}/zephyr-sdk-${ZEPHYR_SDK_VERSION}-linux-x86_64-setup.run -chmod +x $ZEPHYR_SDK_FILE -"./$ZEPHYR_SDK_FILE" -- -d ${INSTALLATION_PATH} -rm "$ZEPHYR_SDK_FILE" +ZEPHYR_SDK_FILE_SHA=8e3572fbca9f9ba18a4436c00d680af34a85e239f7fe66c7988da85571a0d23d +wget https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.15.2/zephyr-sdk-0.15.2_linux-x86_64.tar.gz +echo "$ZEPHYR_SDK_FILE_SHA zephyr-sdk-0.15.2_linux-x86_64.tar.gz" | sha256sum --check + +tar xvf zephyr-sdk-0.15.2_linux-x86_64.tar.gz +mv zephyr-sdk-0.15.2 zephyr-sdk +rm zephyr-sdk-0.15.2_linux-x86_64.tar.gz From f889774b747c4be2da11f26a8cb16d9cd537a6fd Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Fri, 20 Jan 2023 22:56:01 +0000 Subject: [PATCH 206/286] [CI] Update rerun list for tvm-bot (#13817) Update rerun list for tvm-bot --- ci/scripts/github/github_tvmbot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/github/github_tvmbot.py b/ci/scripts/github/github_tvmbot.py index d8dfcdb5b312..a692340a8e05 100755 --- a/ci/scripts/github/github_tvmbot.py +++ b/ci/scripts/github/github_tvmbot.py @@ -540,6 +540,7 @@ def rerun_jenkins_ci(self) -> None: "tvm-i386", "tvm-lint", "tvm-minimal", + "tvm-minimal-cross-isa", "tvm-riscv", 
"tvm-wasm", ] From 2488257c08b85b062bfaa8fb979bf043618d96ce Mon Sep 17 00:00:00 2001 From: Wubin Date: Sat, 21 Jan 2023 08:41:07 +0800 Subject: [PATCH 207/286] =?UTF-8?q?[Frontend][PaddlePaddle]=20Add=20topk?= =?UTF-8?q?=20op=20and=20Fix=20bug,=20when=20the=20output=20is=20a=20dimen?= =?UTF-8?q?sion,=20it=20=E2=80=A6=20(#13701)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some operation in PaddlePaddle that output a tensor which shape is [1], meanwhile, the slice op have decrease_axis attr, but it should not be squeezed. --- docker/install/ubuntu_install_paddle.sh | 2 +- docker/python/ci-constraints.txt | 2 +- python/tvm/relay/frontend/paddlepaddle.py | 35 +++++++++++++++- .../frontend/paddlepaddle/test_forward.py | 42 +++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/docker/install/ubuntu_install_paddle.sh b/docker/install/ubuntu_install_paddle.sh index c7f9d43a3bd4..386d0fa6e797 100755 --- a/docker/install/ubuntu_install_paddle.sh +++ b/docker/install/ubuntu_install_paddle.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip install paddlepaddle==2.1.3 +pip install paddlepaddle==2.4.1 diff --git a/docker/python/ci-constraints.txt b/docker/python/ci-constraints.txt index 6e586b14ae3d..003c13170411 100644 --- a/docker/python/ci-constraints.txt +++ b/docker/python/ci-constraints.txt @@ -23,7 +23,7 @@ oneflow = "==0.7.0" onnx = "==1.10.2" onnxruntime = "==1.9.0" numpy = "==1.19.3" -paddlepaddle = "==2.1.3" +paddlepaddle = "==2.4.1" pillow = "==9.1.0" pylint = "==2.4.4" scipy = "==1.7.3" diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py index ffbcf12de543..4927a362522e 100644 --- a/python/tvm/relay/frontend/paddlepaddle.py +++ b/python/tvm/relay/frontend/paddlepaddle.py @@ -1884,7 +1884,8 @@ def convert_slice(g, op, block): strides = _op.const([1] * dims, dtype="int64") out = _op.strided_slice(data, begin=starts, end=ends, strides=strides) - if decrease_axis: + out_shape = infer_shape(out) + if decrease_axis and len(out_shape) > 1: out = _op.squeeze(out, axis=decrease_axis) g.add_node(op.output("Out")[0], out) @@ -1998,6 +1999,37 @@ def convert_swish(g, op, block): g.add_node(op.output("Out")[0], out) +def convert_topk(g, op, block): + """Operator converter for topk.""" + + data = g.get_node(op.input("X")[0]) + if op.input("K"): + k = g.get_node(op.input("K")[0]) + else: + k = op.attr("k") + + largest = op.attr("largest") + is_ascend = not largest + axis = op.attr("axis") + + value_names = op.output("Out") + indice_names = op.output("Indices") + + out = None + indice = None + if value_names and indice_names: + out, indice = _op.topk(data=data, k=k, axis=axis, ret_type="both", is_ascend=is_ascend) + elif value_names: + out = _op.topk(data=data, k=k, axis=axis, ret_type="values", is_ascend=is_ascend) + elif indice_names: + indice = _op.topk(data=data, k=k, axis=axis, ret_type="indices", is_ascend=is_ascend) + + if out is not None: + g.add_node(value_names[0], out) + if indice is not None: + g.add_node(indice_names[0], indice) + + def convert_transpose(g, op, block): """Operator converter for transpose.""" @@ -2148,6 +2180,7 @@ def convert_unsqueeze(g, op, block): "swish": convert_swish, "tan": convert_unary_op, "tanh": convert_unary_op, + "top_k_v2": convert_topk, "transpose2": convert_transpose, "unsqueeze2": convert_unsqueeze, } diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index 
c0e54657a950..de6ea1dcf1da 100644 --- a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -1351,6 +1351,11 @@ def slice4(inputs): x1 = paddle.to_tensor([3]) + paddle.to_tensor([1]) return inputs[:, x0:, 1:x1, :] + @paddle.jit.to_static + def slice5(inputs): + b, c, h, w = inputs # add decrease_axis + return h + input_shape = [1, 3, 10, 10] input_data = paddle.rand(input_shape, dtype="float32") verify_model( @@ -1362,6 +1367,7 @@ def slice4(inputs): verify_model(slice2, input_data=input_data) verify_model(slice3, input_data=paddle.randn((4, 4))) verify_model(slice4, input_data=input_data) + # verify_model(slice5, input_data=paddle.randn((4,))) @tvm.testing.uses_gpu @@ -1681,5 +1687,41 @@ def forward(self, inputs, prev_h): ) +@tvm.testing.uses_gpu +def test_forward_topk(): + @paddle.jit.to_static + def topk1(inputs): + return paddle.topk(inputs, k=1) + + @paddle.jit.to_static + def topk2(inputs): + k = paddle.to_tensor([1], dtype=paddle.int32) + return paddle.topk(inputs, k=k) + + @paddle.jit.to_static + def topk3(inputs): + return paddle.topk(inputs, k=1, largest=False) + + @paddle.jit.to_static + def topk4(inputs): + return paddle.topk(inputs, k=2, sorted=True) + + @paddle.jit.to_static + def topk5(inputs): + return paddle.topk(inputs, k=2, sorted=False) + + @paddle.jit.to_static + def topk6(inputs): + return paddle.topk(inputs, k=1, axis=0) + + input_data = paddle.to_tensor([[1, 4, 5, 7], [3, 6, 2, 5]], dtype=paddle.int32) + verify_model(topk1, input_data=input_data) + # verify_model(topk2, input_data=input_data) + verify_model(topk3, input_data=input_data) + verify_model(topk4, input_data=input_data) + verify_model(topk5, input_data=input_data) + verify_model(topk6, input_data=input_data) + + if __name__ == "__main__": tvm.testing.main() From 498eb94962ef24391b694c6de01f8951be460582 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Fri, 20 Jan 2023 21:25:55 -0800 Subject: [PATCH 208/286] [TVMScript] Implicit root block syntax sugar for TVMScript printer (#13819) This PR implements the syntax sugar of implicit root block for new TVMScript printer. This syntax sugar will skip the `T.block("root")`, when the root block realize is simple and we shall reconstruct that root block via `tvm::tir::ScriptComplete` when roundtripping. For example, it will change ```python @T.prim_func def root_block_explicitly(): with T.block("root"): a = T.alloc_buffer([128, 128]) for i, j in T.grid(128, 128): with T.block(): T.evaluate(0) ``` into ```python @T.prim_func def main(): a = T.alloc_buffer((128, 128)) for i, j in T.grid(128, 128): with T.block(""): T.evaluate(0) ``` --- src/script/printer/tir/function.cc | 35 ++++++++++++- .../unittest/test_tvmscript_printer_tir.py | 52 ++++++++++++++----- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index 40957fcffaca..ea7d56e1656d 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -131,7 +131,40 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } } // Step 4. 
Handle `func->body` - AsDocBody(func->body, p->Attr("body"), frame->get(), d); + Optional implicit_root_block = [&]() -> Optional { + const tir::BlockRealizeNode* root_block_realize = func->body.as(); + if (root_block_realize && !root_block_realize->iter_values.size() && + tir::is_one(root_block_realize->predicate)) { + tir::Block root_block = root_block_realize->block; + if (!root_block->annotations.size() && !root_block->match_buffers.size() && + !root_block->reads.size() && !root_block->writes.size() && + !root_block->init.defined()) { + const tir::BlockRealizeNode* block_realize = + root_block->body.as(); + if (root_block->alloc_buffers.size() || + (block_realize && block_realize->block->iter_vars.size()) || + (!block_realize && tir::ContainsNode(root_block->body))) { + return root_block; + } + } + } + return NullOpt; + }(); + if (implicit_root_block) { + tir::Block root_block = implicit_root_block.value(); + ObjectPath root_block_p = p->Attr("body")->Attr("body"); + // Handle root block `alloc_buffer` + for (int i = 0, n = root_block->alloc_buffers.size(); i < n; ++i) { + tir::Buffer buffer = root_block->alloc_buffers[i]; + ObjectPath buffer_p = root_block_p->Attr("alloc_buffers")->ArrayIndex(i); + IdDoc lhs = DefineBuffer(buffer, *frame, d); + ExprDoc rhs = BufferDecl(buffer, "alloc_buffer", {}, buffer_p, *frame, d); + (*frame)->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); + } + AsDocBody(root_block->body, root_block_p->Attr("body"), frame->get(), d); + } else { + AsDocBody(func->body, p->Attr("body"), frame->get(), d); + } Optional ret_type = NullOpt; if (func->ret_type.defined()) { const auto* as_tuple = func->ret_type.as(); diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index 5d86a8860852..d57d10467077 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -717,21 +717,49 @@ def block_with_remap_explicitly(): expected_output = """@T.prim_func def main(): - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): - with T.block("update"): - v0 = T.axis.spatial(128, i0 + 1) - v1, v2 = T.axis.remap("SR", [i1, i2]) - v3 = T.axis.spatial(128, i3 - 1) - v4, v5 = T.axis.remap("RS", [i4, i5]) - T.reads() - T.writes() - T.evaluate(0)""" + for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): + with T.block("update"): + v0 = T.axis.spatial(128, i0 + 1) + v1, v2 = T.axis.remap("SR", [i1, i2]) + v3 = T.axis.spatial(128, i3 - 1) + v4, v5 = T.axis.remap("RS", [i4, i5]) + T.reads() + T.writes() + T.evaluate(0)""" _assert_print(block_with_remap_explicitly, expected_output) _assert_print(block_with_remap_implicitly, expected_output) +def test_root_block(): + from tvm.script import tir as T + + @T.prim_func + def root_block_implicitly(): + a = T.alloc_buffer([128, 128]) + for i, j in T.grid(128, 128): + with T.block(): + T.evaluate(0) + + @T.prim_func + def root_block_explicitly(): + with T.block("root"): + a = T.alloc_buffer([128, 128]) + for i, j in T.grid(128, 128): + with T.block(): + T.evaluate(0) + + expected_output = """@T.prim_func +def main(): + a = T.alloc_buffer((128, 128)) + for i, j in T.grid(128, 128): + with T.block(""): + T.reads() + T.writes() + T.evaluate(0) + """ + _assert_print(root_block_implicitly, expected_output) + _assert_print(root_block_explicitly, expected_output) + + if __name__ == "__main__": tvm.testing.main() From 
55c9c9733753d2a76fb19102b708d9be271c02c3 Mon Sep 17 00:00:00 2001 From: Sunghyun Park <49998730+sunggg@users.noreply.github.com> Date: Sun, 22 Jan 2023 11:22:24 -0800 Subject: [PATCH 209/286] [Bugfix][TIR] Fix version conflict with typing for different Python versions (3.8.0-3.10.0) (#13820) * hotfix * fix lint --- python/tvm/tir/schedule/_type_checker.py | 33 ++++++++++++------------ 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py index 12ce1ebc1f92..cb2d6446b3ef 100644 --- a/python/tvm/tir/schedule/_type_checker.py +++ b/python/tvm/tir/schedule/_type_checker.py @@ -27,6 +27,19 @@ def _is_none_type(type_: Any) -> bool: return type_ is None or type_ is type(None) +def _get_subtypes(type_: Any) -> Any: + # TODO(@tvm-team): This is hot fix to support subtle difference between python versions + # Would be nice to find a better way if possible + if hasattr(typing, "_SpecialGenericAlias"): + if hasattr(typing, "get_args"): + subtypes = typing.get_args(type_) # type: ignore + else: + subtypes = type_.__args__ + else: + subtypes = type_.__args__ + return subtypes + + if hasattr(typing, "_GenericAlias"): # For python versions 3.7 onward, check the __origin__ attribute. @@ -64,10 +77,7 @@ def dict_(type_: Any) -> Any: @staticmethod def tuple_(type_: Any) -> Optional[List[type]]: if _Subtype._origin(type_) is tuple: - if hasattr(typing, "get_args"): - subtypes = typing.get_args(type_) # type: ignore - else: - subtypes = type_.__args__ + subtypes = _get_subtypes(type_) return subtypes return None @@ -76,10 +86,7 @@ def optional( # pylint: disable=missing-function-docstring type_: Any, ) -> Optional[List[type]]: if _Subtype._origin(type_) is Union: - if hasattr(typing, "get_args"): - subtypes = typing.get_args(type_) # type: ignore - else: - subtypes = type_.__args__ + subtypes = _get_subtypes(type_) if len(subtypes) == 2 and _is_none_type(subtypes[1]): return [subtypes[0]] return None @@ -87,10 +94,7 @@ def optional( # pylint: disable=missing-function-docstring @staticmethod def union(type_: Any) -> Optional[List[type]]: # pylint: disable=missing-function-docstring if _Subtype._origin(type_) is Union: - if hasattr(typing, "get_args"): - subtypes = typing.get_args(type_) # type: ignore - else: - subtypes = type_.__args__ + subtypes = _get_subtypes(type_) if len(subtypes) != 2 or not _is_none_type(subtypes[1]): return list(subtypes) return None @@ -98,10 +102,7 @@ def union(type_: Any) -> Optional[List[type]]: # pylint: disable=missing-functi @staticmethod def callable(type_: Any) -> Optional[List[type]]: if _Subtype._origin(type_) is collections.abc.Callable: - if hasattr(typing, "get_args") and not type_._special: - subtypes = typing.get_args(type_) # type: ignore - else: - subtypes = type_.__args__ + subtypes = _get_subtypes(type_) return subtypes return None From c73f98266b6591b1238135c08062f75a0f6a9ef6 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 23 Jan 2023 10:12:27 -0800 Subject: [PATCH 210/286] [microTVM][CRT]Separate CRT template project from standalone CRT build (#13812) This PR separates the CRT template project for microTVM from CRT standalone library build. The CRT template project would move to build/microtvm_template_projects. It also adds uploading microtvm_template_projects to S3 in few images that were missing this previously. 
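For downstream users the visible effect is only the template-project location. The
sketch below is a minimal host-side example, not taken from this diff: the tiny model
and the `{"verbose": False}` project option are illustrative assumptions, while
`tvm.micro.get_microtvm_template_projects("crt")` is the existing helper that resolves
the template directory, so the usual flow keeps working unchanged:

```python
import tvm
import tvm.micro
from tvm import relay
from tvm.relay.backend import Runtime

# Tiny placeholder model, just to have something to put into the project.
x = relay.var("x", shape=(1, 4), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0)))

with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
    module = relay.build(
        mod, target=tvm.target.target.micro("host"), runtime=Runtime("crt", {"system-lib": True})
    )

# After this change the helper resolves to build/microtvm_template_projects/crt
# rather than a directory inside the standalone CRT build tree.
template_dir = tvm.micro.get_microtvm_template_projects("crt")
project = tvm.micro.generate_project(template_dir, module, "./generated-project", {"verbose": False})
project.build()
```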
--- CMakeLists.txt | 2 + ci/jenkins/generated/arm_jenkinsfile.groovy | 4 +- ci/jenkins/generated/cpu_jenkinsfile.groovy | 4 +- ci/jenkins/generated/i386_jenkinsfile.groovy | 4 +- .../generated/minimal_jenkinsfile.groovy | 4 +- .../templates/arm_jenkinsfile.groovy.j2 | 2 +- .../templates/cpu_jenkinsfile.groovy.j2 | 2 +- .../templates/i386_jenkinsfile.groovy.j2 | 2 +- .../templates/minimal_jenkinsfile.groovy.j2 | 2 +- ci/scripts/jenkins/s3.py | 3 + cmake/modules/CRT.cmake | 74 +++++++++++++++++++ cmake/modules/StandaloneCrt.cmake | 4 - python/tvm/micro/build.py | 3 - python/tvm/testing/aot.py | 4 +- src/runtime/crt/host/microtvm_api_server.py | 5 +- tests/python/unittest/test_crt.py | 2 +- tests/python/unittest/test_link_params.py | 2 +- 17 files changed, 98 insertions(+), 25 deletions(-) create mode 100644 cmake/modules/CRT.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f51233d244e3..060ac65592a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -481,6 +481,7 @@ include(cmake/utils/CCache.cmake) # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) +include(cmake/modules/CRT.cmake) include(cmake/modules/Zephyr.cmake) include(cmake/modules/Arduino.cmake) include(cmake/modules/CUDA.cmake) @@ -577,6 +578,7 @@ include(cmake/modules/contrib/PAPI.cmake) if(USE_MICRO) # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the # Unix Makefiles generator, need to add these explicit target-level dependency) + add_dependencies(tvm_runtime crt) add_dependencies(tvm_runtime zephyr) add_dependencies(tvm_runtime arduino) if(USE_GEMMINI) diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy index 0fc71b430ca0..2c64e9ab2499 100644 --- a/ci/jenkins/generated/arm_jenkinsfile.groovy +++ b/ci/jenkins/generated/arm_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-09T15:39:24.387114 +// Generated at 2023-01-20T20:00:09.294689 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -556,7 +556,7 @@ def build() { make_standalone_crt(ci_arm, 'build') make_cpp_tests(ci_arm, 'build') sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/arm --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/crttest build/standalone_crt build/build.ninja", + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/arm --items build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/crttest build/standalone_crt build/build.ninja build/microtvm_template_projects", label: 'Upload artifacts to S3', ) } diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy index f9ede00399a2..c9e02ba28761 100644 --- a/ci/jenkins/generated/cpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-09T15:39:24.540570 +// Generated at 
2023-01-20T22:52:37.278903 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -556,7 +556,7 @@ def build() { make_standalone_crt(ci_cpu, 'build') make_cpp_tests(ci_cpu, 'build') sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja", + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja build/microtvm_template_projects", label: 'Upload artifacts to S3', ) diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy index ae66fbe3e48c..0cadeac33dac 100644 --- a/ci/jenkins/generated/i386_jenkinsfile.groovy +++ b/ci/jenkins/generated/i386_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-09T15:39:24.421467 +// Generated at 2023-01-20T20:00:09.177735 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -556,7 +556,7 @@ def build() { make_standalone_crt(ci_i386, 'build') make_cpp_tests(ci_i386, 'build') sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/i386 --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja", + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/i386 --items build/libvta_tsim.so build/libtvm.so build/libvta_fsim.so build/libtvm_runtime.so build/config.cmake build/standalone_crt build/build.ninja build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/microtvm_template_projects", label: 'Upload artifacts to S3', ) } diff --git a/ci/jenkins/generated/minimal_jenkinsfile.groovy b/ci/jenkins/generated/minimal_jenkinsfile.groovy index 6c4abb0bd5af..f054f60a3aa2 100644 --- a/ci/jenkins/generated/minimal_jenkinsfile.groovy +++ b/ci/jenkins/generated/minimal_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-12-09T15:39:24.492813 +// Generated at 2023-01-20T22:52:37.006864 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -556,7 +556,7 @@ def build() { make_standalone_crt(ci_minimal, 'build') make_cpp_tests(ci_minimal, 'build') sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest 
build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja", + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu-minimal --items build/libtvm.so build/libtvm_runtime.so build/config.cmake build/libtvm_allvisible.so build/crttest build/cpptest build/build.ninja build/CMakeFiles/rules.ninja build/standalone_crt build/build.ninja build/microtvm_template_projects", label: 'Upload artifacts to S3', ) } diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 index 6cffd5cbbe66..b455fa04125d 100644 --- a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 @@ -31,7 +31,7 @@ cmake_build(ci_arm, 'build', '-j4') make_standalone_crt(ci_arm, 'build') make_cpp_tests(ci_arm, 'build') - {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib + cpptest + crttest + standalone_crt) }} + {{ m.upload_artifacts(tag='arm', filenames=tvm_multilib + cpptest + crttest + standalone_crt + microtvm_template_projects) }} {% endcall %} {% set test_method_names = [] %} diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 index fa2be6584ff0..a833a89d64da 100644 --- a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 @@ -31,7 +31,7 @@ cmake_build(ci_cpu, 'build', '-j2') make_standalone_crt(ci_cpu, 'build') make_cpp_tests(ci_cpu, 'build') - {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim + tvm_allvisible + crttest + cpptest + standalone_crt) }} + {{ m.upload_artifacts(tag='cpu', filenames=tvm_multilib_tsim + tvm_allvisible + crttest + cpptest + standalone_crt + microtvm_template_projects) }} ci_setup(ci_cpu) // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" // TODO(@jroesch): need to resolve CI issue will turn back on in follow up patch diff --git a/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 index 1825e0cbd6bd..da1cdeaefeee 100644 --- a/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 @@ -31,7 +31,7 @@ cmake_build(ci_i386, 'build', '-j2') make_standalone_crt(ci_i386, 'build') make_cpp_tests(ci_i386, 'build') - {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim + standalone_crt + crttest + cpptest) }} + {{ m.upload_artifacts(tag='i386', filenames=tvm_multilib_tsim + standalone_crt + crttest + cpptest + microtvm_template_projects) }} {% endcall %} diff --git a/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2 b/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2 index 87db883745cc..6420599061de 100644 --- a/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/minimal_jenkinsfile.groovy.j2 @@ -31,7 +31,7 @@ cmake_build(ci_minimal, 'build', '-j2') make_standalone_crt(ci_minimal, 'build') make_cpp_tests(ci_minimal, 'build') - {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + crttest + cpptest + standalone_crt) }} + {{ m.upload_artifacts(tag='cpu-minimal', filenames=tvm_lib + tvm_allvisible + crttest + cpptest + standalone_crt + microtvm_template_projects) }} {% endcall %} diff --git a/ci/scripts/jenkins/s3.py b/ci/scripts/jenkins/s3.py index 63fbaac5fafc..8886adef7236 100755 --- a/ci/scripts/jenkins/s3.py +++ b/ci/scripts/jenkins/s3.py @@ -142,4 +142,7 @@ def s3(source: str, destination: str, recursive: bool) -> List[str]: show_md5(file) elif action 
== Action.UPLOAD: show_md5(item) + if Path(item).is_dir(): + if len(list(Path(item).glob("**/*"))) == 0: + raise RuntimeError(f"Cannot upload empty folder with name: {item}") s3(item, s3_path + "/" + item, recursive=Path(item).is_dir()) diff --git a/cmake/modules/CRT.cmake b/cmake/modules/CRT.cmake new file mode 100644 index 000000000000..518a613dc102 --- /dev/null +++ b/cmake/modules/CRT.cmake @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more contributor +# license agreements. See the NOTICE file distributed with this work for +# additional information regarding copyright ownership. The ASF licenses this +# file to you under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +if(USE_MICRO) + message(STATUS "Add CRT template project for microTVM") + + function(microtvm_add_crt) + list( + APPEND + CRT_TEMPLATE_FILE_COPY_JOBS + "src/runtime/crt/host microtvm_api_server.py -> crt" + "src/runtime/crt/host Makefile.template -> crt" + "src/runtime/crt crt_config-template.h -> crt" + "src/runtime/crt/host main.cc -> crt/src" + ) + + foreach(job_spec IN LISTS CRT_TEMPLATE_FILE_COPY_JOBS) + string(REPLACE " " ";" job_spec "${job_spec}") + list(LENGTH job_spec job_spec_length) + math(EXPR job_spec_length_mod "${job_spec_length} % 3") + if(NOT "${job_spec_length_mod}" EQUAL 1) + message( + FATAL_ERROR + "CRT copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" + ) + endif() + math(EXPR job_spec_stop "${job_spec_length} - 3") + + list(GET job_spec 0 job_src_base) + set(job_src_base "${CMAKE_CURRENT_SOURCE_DIR}/${job_src_base}") + foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) + list(GET job_spec ${copy_pattern_index} copy_pattern) + math(EXPR copy_dest_index "${copy_pattern_index} + 2") + list(GET job_spec ${copy_dest_index} copy_dest) + + file( + GLOB_RECURSE copy_files + RELATIVE "${job_src_base}" + "${job_src_base}/${copy_pattern}") + list(LENGTH copy_files copy_files_length) + if("${copy_files_length}" EQUAL 0) + message( + FATAL_ERROR + "CRT copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" + ) + endif() + foreach(copy_src IN LISTS copy_files) + get_filename_component( + dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" + ABSOLUTE) + tvm_micro_add_copy_file(crt_template_deps + ${job_src_base}/${copy_src} ${dest_path}) + endforeach() + endforeach() + endforeach() + + add_custom_target(crt DEPENDS ${crt_template_deps}) + endfunction() + + microtvm_add_crt() + +endif(USE_MICRO) diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 2ca37f53d9f5..1d4e213ec71a 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -94,16 +94,12 @@ else() "src/runtime/crt/common *.c -> src/runtime/crt/common" "src/runtime/crt/graph_executor *.c -> src/runtime/crt/graph_executor" "src/runtime/crt/graph_executor_module *.c -> src/runtime/crt/graph_executor_module" - "src/runtime/crt/host *.cc -> template/host" - "src/runtime/crt/host *.py -> template/host" - 
"src/runtime/crt/host Makefile.template -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" "src/runtime/crt/microtvm_rpc_common *.cc -> src/runtime/crt/microtvm_rpc_common" "src/runtime/crt/microtvm_rpc_server *.cc -> src/runtime/crt/microtvm_rpc_server" "src/runtime/minrpc *.h -> src/runtime/minrpc" "src/support generic_arena.h -> src/support" "src/support ssize.h -> src/support" - "src/runtime/crt crt_config-template.h -> template" ) set(STANDALONE_CRT_BASE ${CMAKE_CURRENT_BINARY_DIR}/standalone_crt) diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index b05d0d60d47a..ac35142a8937 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -95,9 +95,6 @@ def get_microtvm_template_projects(platform: str) -> str: if platform not in MicroTVMTemplateProject.list(): raise ValueError(f"platform {platform} is not supported.") - if platform == MicroTVMTemplateProject.CRT.value: - return os.path.join(get_standalone_crt_dir(), "template", "host") - microtvm_template_projects = None for path in libinfo.find_lib_path(): template_path = os.path.join(os.path.dirname(path), "microtvm_template_projects") diff --git a/python/tvm/testing/aot.py b/python/tvm/testing/aot.py index 30d3c78ae43b..5ddbdcabacc9 100644 --- a/python/tvm/testing/aot.py +++ b/python/tvm/testing/aot.py @@ -723,9 +723,9 @@ def run_and_check_body(base_path): include_path = os.path.join(base_path, "include") os.mkdir(include_path) - crt_root = tvm.micro.get_standalone_crt_dir() + crt_root = tvm.micro.get_microtvm_template_projects("crt") shutil.copy2( - os.path.join(crt_root, "template", "crt_config-template.h"), + os.path.join(crt_root, "crt_config-template.h"), os.path.join(include_path, "crt_config.h"), ) diff --git a/src/runtime/crt/host/microtvm_api_server.py b/src/runtime/crt/host/microtvm_api_server.py index b84abdf45985..e5b82f96b0ff 100644 --- a/src/runtime/crt/host/microtvm_api_server.py +++ b/src/runtime/crt/host/microtvm_api_server.py @@ -144,7 +144,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec crt_config_dir = project_dir / "crt_config" crt_config_dir.mkdir() shutil.copy2( - os.path.join(os.path.dirname(__file__), "..", "crt_config-template.h"), + os.path.join(os.path.dirname(__file__), "crt_config-template.h"), os.path.join(crt_config_dir, "crt_config.h"), ) @@ -152,7 +152,8 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec src_dir = os.path.join(project_dir, "src") os.mkdir(src_dir) shutil.copy2( - os.path.join(os.path.dirname(__file__), "main.cc"), os.path.join(src_dir, "main.cc") + os.path.join(os.path.dirname(__file__), "src", "main.cc"), + os.path.join(src_dir, "main.cc"), ) def build(self, options): diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index b11f7a5fac5e..83fa91af06c9 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -47,7 +47,7 @@ def _make_sess_from_op(temp_dir, op_name, sched, arg_bufs): def _make_session(temp_dir, mod): - template_project_dir = os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") + template_project_dir = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")) project = tvm.micro.generate_project( template_project_dir, mod, temp_dir / "project", {"verbose": 1} ) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index e5b8cd77445f..70caa99c9bca 100644 --- a/tests/python/unittest/test_link_params.py +++ 
b/tests/python/unittest/test_link_params.py @@ -345,7 +345,7 @@ def test_crt_link_params(linkable_dtype): assert len(factory.get_params().keys()) == 0 # NOTE: params became tir.constants temp_dir = tvm.contrib.utils.tempdir() - template_project_dir = os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") + template_project_dir = tvm.micro.get_microtvm_template_projects("crt") project = tvm.micro.generate_project( template_project_dir, factory, temp_dir / "project", {"verbose": 1} ) From e1d447c8ce1249ebc5c66824a4f464f3e0b8113b Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Tue, 24 Jan 2023 03:39:58 +0800 Subject: [PATCH 211/286] [MetaSchedule] add fp16-16-32 TensorCores rule to default settings (#13822) --- .../schedule_rule/schedule_rule.cc | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index e25f0b12210d..938d39377f1c 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -172,6 +172,22 @@ Array ScheduleRule::DefaultCUDA() { Array ScheduleRule::DefaultCUDATensorCore() { Array> intrin_groups = { + // Tensor Cores f32 += f16 * f16 + { + {"init", "wmma_fill_16x16x16_f32"}, + {"load_a", "wmma_load_16x16x16_f16_a"}, + {"load_b", "wmma_load_16x16x16_f16_b"}, + {"compute", "wmma_sync_16x16x16_f16f16f32"}, + {"store", "wmma_store_16x16x16_f32_shared"}, + }, + { + {"init", "wmma_fill_16x16x16_f32"}, + {"load_a", "wmma_load_16x16x16_f16_a"}, + {"load_b", "wmma_load_16x16x16_f16_b_trans"}, + {"compute", "wmma_sync_16x16x16_f16f16f32_trans"}, + {"store", "wmma_store_16x16x16_f32_shared"}, + }, + // Tensor Cores f16 += f16 * f16 { {"init", "wmma_fill_16x16x16_f16"}, {"load_a", "wmma_load_16x16x16_f16_a"}, @@ -186,6 +202,7 @@ Array ScheduleRule::DefaultCUDATensorCore() { {"compute", "wmma_sync_16x16x16_f16f16f16_trans"}, {"store", "wmma_store_16x16x16_f16_shared"}, }, + // Tensor Cores s32 += s8 * s8 { {"init", "wmma_fill_16x16x16_s32"}, {"load_a", "wmma_load_16x16x16_s8_a"}, From 1ff77aefca02e357c0a6a8d8fb9817eda15eb8df Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 23 Jan 2023 13:04:36 -0800 Subject: [PATCH 212/286] [TVMScript] Add ObjectPath to LiteralDoc (#13821) This PR adds ObjectPath to LiteralDoc to allow integer/float/string/... literals to have their own object path. This is a final preparation towards structural error rendering when SEqual fails. 
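
On the Python side, the object path becomes an optional constructor argument
of LiteralDoc. A minimal sketch (the concrete path below is made up purely
for illustration):

```python
from tvm.runtime.object_path import ObjectPath
from tvm.script.printer.doc import LiteralDoc

path = ObjectPath.root().attr("value")  # hypothetical path into some node
doc = LiteralDoc("float32", path)  # the literal now records where it came from
legacy = LiteralDoc(42)  # the path stays optional and defaults to None
```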
--- include/tvm/script/printer/doc.h | 34 ++++++++---- include/tvm/script/printer/ir_docsifier.h | 9 ++-- python/tvm/script/printer/doc.py | 32 +++++++++--- src/script/printer/doc.cc | 26 ++++------ src/script/printer/ir/ir.cc | 18 +++---- src/script/printer/ir/misc.cc | 2 +- src/script/printer/legacy_repr.cc | 1 - src/script/printer/tir/block.cc | 18 ++++--- src/script/printer/tir/buffer.cc | 15 +++--- src/script/printer/tir/expr.cc | 44 ++++++++-------- src/script/printer/tir/for_loop.cc | 21 ++++---- src/script/printer/tir/ir.cc | 28 +++++----- src/script/printer/tir/stmt.cc | 63 +++++++++++++---------- src/tir/analysis/control_flow_graph.cc | 1 - 14 files changed, 178 insertions(+), 134 deletions(-) diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h index 094d3fdf51df..01f0fc1f4a91 100644 --- a/include/tvm/script/printer/doc.h +++ b/include/tvm/script/printer/doc.h @@ -23,6 +23,8 @@ #include #include +#include + namespace tvm { namespace script { namespace printer { @@ -243,40 +245,54 @@ class LiteralDocNode : public ExprDocNode { */ class LiteralDoc : public ExprDoc { protected: - explicit LiteralDoc(ObjectRef value); - LiteralDoc(ObjectRef value, ObjectPath object_path); + explicit LiteralDoc(ObjectRef value, const Optional& object_path); public: /*! * \brief Create a LiteralDoc to represent None/null/empty value. + * \param p The object path */ - static LiteralDoc None() { return LiteralDoc(ObjectRef(nullptr)); } + static LiteralDoc None(const Optional& p) { + return LiteralDoc(ObjectRef(nullptr), p); + } /*! * \brief Create a LiteralDoc to represent integer. * \param v The integer value. + * \param p The object path */ - static LiteralDoc Int(int64_t v) { return LiteralDoc(IntImm(DataType::Int(64), v)); } + static LiteralDoc Int(int64_t v, const Optional& p) { + return LiteralDoc(IntImm(DataType::Int(64), v), p); + } /*! * \brief Create a LiteralDoc to represent boolean. * \param v The boolean value. + * \param p The object path */ - static LiteralDoc Boolean(bool v) { return LiteralDoc(IntImm(DataType::Bool(), v)); } + static LiteralDoc Boolean(bool v, const Optional& p) { + return LiteralDoc(IntImm(DataType::Bool(), v), p); + } /*! * \brief Create a LiteralDoc to represent float. * \param v The float value. + * \param p The object path */ - static LiteralDoc Float(double v) { return LiteralDoc(FloatImm(DataType::Float(64), v)); } + static LiteralDoc Float(double v, const Optional& p) { + return LiteralDoc(FloatImm(DataType::Float(64), v), p); + } /*! * \brief Create a LiteralDoc to represent string. * \param v The string value. + * \param p The object path */ - static LiteralDoc Str(const String& v) { return LiteralDoc(v); } + static LiteralDoc Str(const String& v, const Optional& p) { return LiteralDoc(v, p); } /*! * \brief Create a LiteralDoc to represent string. * \param v The string value. + * \param p The object path */ - static LiteralDoc DataType(const DLDataType& v) { - return LiteralDoc::Str(runtime::DLDataType2String(v)); + static LiteralDoc DataType(const runtime::DataType& v, const Optional& p) { + std::string dtype = v.is_void() ? 
"void" : runtime::DLDataType2String(v); + return LiteralDoc::Str(dtype, p); } TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(LiteralDoc, ExprDoc, LiteralDocNode); diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h index e426946b56fe..e0419b469505 100644 --- a/include/tvm/script/printer/ir_docsifier.h +++ b/include/tvm/script/printer/ir_docsifier.h @@ -259,11 +259,12 @@ inline void FrameNode::ExitWithScope() { template inline TDoc IRDocsifierNode::AsDoc(const ObjectRef& obj, const ObjectPath& path) const { - if (!obj.defined()) { - return Downcast(LiteralDoc::None()); + if (obj.defined()) { + Doc d = IRDocsifier::vtable()(dispatch_tokens.back(), obj, path, GetRef(this)); + d->source_paths.push_back(path); + return Downcast(d); } - return Downcast( - IRDocsifier::vtable()(dispatch_tokens.back(), obj, path, GetRef(this))); + return Downcast(LiteralDoc::None(path)); } inline void FrameNode::AddDispatchToken(const IRDocsifier& d, const String& token) { diff --git a/python/tvm/script/printer/doc.py b/python/tvm/script/printer/doc.py index a93957d3e18f..5a4a4cd67a72 100644 --- a/python/tvm/script/printer/doc.py +++ b/python/tvm/script/printer/doc.py @@ -155,17 +155,37 @@ class LiteralDoc(ExprDoc): value: Union[str, IntImm, FloatImm, None] - def __init__(self, value: Union[str, float, bool, int, None]): + def __init__( + self, + value: Union[str, float, bool, int, None], + path: Optional[ObjectPath] = None, + ): if value is None: - self.__init_handle_by_constructor__(_ffi_api.LiteralDocNone) # type: ignore # pylint: disable=no-member + self.__init_handle_by_constructor__(_ffi_api.LiteralDocNone, path) # type: ignore # pylint: disable=no-member elif isinstance(value, str): - self.__init_handle_by_constructor__(_ffi_api.LiteralDocStr, value) # type: ignore # pylint: disable=no-member + self.__init_handle_by_constructor__( + _ffi_api.LiteralDocStr, # type: ignore # pylint: disable=no-member + value, + path, + ) elif isinstance(value, float): - self.__init_handle_by_constructor__(_ffi_api.LiteralDocFloat, value) # type: ignore # pylint: disable=no-member + self.__init_handle_by_constructor__( + _ffi_api.LiteralDocFloat, # type: ignore # pylint: disable=no-member + value, + path, + ) elif isinstance(value, bool): - self.__init_handle_by_constructor__(_ffi_api.LiteralDocBoolean, value) # type: ignore # pylint: disable=no-member + self.__init_handle_by_constructor__( + _ffi_api.LiteralDocBoolean, # type: ignore # pylint: disable=no-member + value, + path, + ) elif isinstance(value, int): - self.__init_handle_by_constructor__(_ffi_api.LiteralDocInt, value) # type: ignore # pylint: disable=no-member + self.__init_handle_by_constructor__( + _ffi_api.LiteralDocInt, # type: ignore # pylint: disable=no-member + value, + path, + ) else: raise TypeError(f"Unsupported type {type(value)} for LiteralDoc") diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc index f41b40c92cc9..89f6b7c8b1cf 100644 --- a/src/script/printer/doc.cc +++ b/src/script/printer/doc.cc @@ -48,16 +48,12 @@ StmtBlockDoc::StmtBlockDoc(Array stmts) { this->data_ = std::move(n); } -LiteralDoc::LiteralDoc(ObjectRef value) { +LiteralDoc::LiteralDoc(ObjectRef value, const Optional& object_path) { ObjectPtr n = make_object(); n->value = value; - this->data_ = std::move(n); -} - -LiteralDoc::LiteralDoc(ObjectRef value, ObjectPath object_path) { - ObjectPtr n = make_object(); - n->value = value; - n->source_paths.push_back(object_path); + if (object_path.defined()) { + 
n->source_paths.push_back(object_path.value()); + } this->data_ = std::move(n); } @@ -250,15 +246,11 @@ TVM_REGISTER_GLOBAL("script.printer.StmtBlockDoc").set_body_typed([](Array(LiteralDoc::None); -TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt") - .set_body_typed(LiteralDoc::Int); -TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean") - .set_body_typed(LiteralDoc::Boolean); -TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat") - .set_body_typed(LiteralDoc::Float); -TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr") - .set_body_typed(LiteralDoc::Str); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocNone").set_body_typed(LiteralDoc::None); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt").set_body_typed(LiteralDoc::Int); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean").set_body_typed(LiteralDoc::Boolean); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat").set_body_typed(LiteralDoc::Float); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr").set_body_typed(LiteralDoc::Str); TVM_REGISTER_NODE_TYPE(IdDocNode); TVM_REGISTER_GLOBAL("script.printer.IdDoc").set_body_typed([](String name) { return IdDoc(name); }); diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc index 5cd459be6696..e438919f4b1b 100644 --- a/src/script/printer/ir/ir.cc +++ b/src/script/printer/ir/ir.cc @@ -63,26 +63,26 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](GlobalVar gv, ObjectPath p, IRDocsifier d) -> Doc { - return IR("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint)}); + return IR("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint, p->Attr("name_hint"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Op op, ObjectPath p, IRDocsifier d) -> Doc { - return IR("Op")->Call({LiteralDoc::Str(op->name)}); + return IR("Op")->Call({LiteralDoc::Str(op->name, p->Attr("name"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](TypeVar type_var, ObjectPath p, IRDocsifier d) -> Doc { - return IR("TypeVar")->Call({LiteralDoc::Str(type_var->name_hint), // - LiteralDoc::Str(TypeKind2String(type_var->kind))}); + .set_dispatch("", [](TypeVar var, ObjectPath p, IRDocsifier d) -> Doc { + return IR("TypeVar")->Call({LiteralDoc::Str(var->name_hint, p->Attr("name_hint")), // + LiteralDoc::Str(TypeKind2String(var->kind), p->Attr("kind"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // - "", [](GlobalTypeVar type_var, ObjectPath p, IRDocsifier d) -> Doc { + "", [](GlobalTypeVar var, ObjectPath p, IRDocsifier d) -> Doc { return IR("GlobalTypeVar") - ->Call({LiteralDoc::Str(type_var->name_hint), // - LiteralDoc::Str(TypeKind2String(type_var->kind))}); + ->Call({LiteralDoc::Str(var->name_hint, p->Attr("name_hint")), + LiteralDoc::Str(TypeKind2String(var->kind), p->Attr("kind"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -94,7 +94,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](TensorType type, ObjectPath p, IRDocsifier d) -> Doc { return IR("TensorType") ->Call({d->AsDoc(type->shape, p->Attr("shape")), - LiteralDoc::DataType(type->dtype)}); + LiteralDoc::DataType(type->dtype, p->Attr("dtype"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) diff --git a/src/script/printer/ir/misc.cc b/src/script/printer/ir/misc.cc index bd2792167194..cb78dc3ff5c3 100644 --- a/src/script/printer/ir/misc.cc +++ b/src/script/printer/ir/misc.cc @@ -24,7 +24,7 @@ namespace printer { TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](String s, ObjectPath p, 
IRDocsifier d) -> Doc { - return LiteralDoc::Str(s); + return LiteralDoc::Str(s, p); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) diff --git a/src/script/printer/legacy_repr.cc b/src/script/printer/legacy_repr.cc index f264dfee8d50..2909e059f3e3 100644 --- a/src/script/printer/legacy_repr.cc +++ b/src/script/printer/legacy_repr.cc @@ -588,7 +588,6 @@ TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) TVM_STATIC_IR_FUNCTOR(ReprLegacyPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprLegacyPrinter* p) { - // TODO(tvm-team) redirect to Text printer once we have a good text format. auto* node = static_cast(ref.get()); (*p) << "PrimFunc(" << node->params << ") "; if (node->attrs.defined()) { diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc index 069ec7f3ea41..f78e7037c3e0 100644 --- a/src/script/printer/tir/block.cc +++ b/src/script/printer/tir/block.cc @@ -118,16 +118,20 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // lhs.reserve(m); loop_var_doc.reserve(m); std::string binding_type = ""; + Array binding_paths; for (int i : remap_vars_indices) { tir::IterVar iter_var = block->iter_vars[i]; - ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); + ObjectPath iter_var_p = block_p->Attr("iter_vars")->ArrayIndex(i); lhs.push_back(DefineVar(iter_var->var, *frame, d)); loop_var_doc.push_back(d->AsDoc(realize->iter_values[i], realize_p->Attr("iter_values")->ArrayIndex(i))); + binding_paths.push_back(iter_var_p->Attr("iter_type")); binding_type += iter_var->iter_type == tir::IterVarType::kDataPar ? "S" : "R"; } ExprDoc rhs = TIR("axis")->Attr("remap"); - rhs = rhs->Call({LiteralDoc::Str(binding_type), ListDoc(loop_var_doc)}); + ExprDoc binding_str = LiteralDoc::Str(binding_type, NullOpt); + binding_str->source_paths = std::move(binding_paths); + rhs = rhs->Call({binding_str, ListDoc(loop_var_doc)}); (*frame)->stmts.push_back(AssignDoc(TupleDoc(lhs), rhs, NullOpt)); remap_vars_indices.clear(); } @@ -198,11 +202,13 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // Array kwargs_values; if (!realize) { kwargs_keys.push_back("no_realize"); - kwargs_values.push_back(LiteralDoc::Boolean(true)); + kwargs_values.push_back(LiteralDoc::Boolean(true, NullOpt)); } - return ScopeDoc( - NullOpt, TIR("block")->Call({LiteralDoc::Str(block->name_hint)}, kwargs_keys, kwargs_values), - (*frame)->stmts); + return ScopeDoc(NullOpt, + TIR("block") // + ->Call({LiteralDoc::Str(block->name_hint, block_p->Attr("name_hint"))}, + kwargs_keys, kwargs_values), + (*frame)->stmts); } TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc index 126a6e58273f..b947039b58de 100644 --- a/src/script/printer/tir/buffer.cc +++ b/src/script/printer/tir/buffer.cc @@ -56,7 +56,7 @@ Map BufferAttrs(const tir::Buffer& buffer, const ObjectPath& p, array_out_line_var_def(buffer->shape, p->Attr("shape"), "shape"); // Step 2. Handle `buffer.dtype` if (buffer->dtype != Default::BufferDType()) { - kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype)); + kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype, p->Attr("dtype"))); } // Step 3. 
Handle `buffer.data` implicit_var_def(buffer->data, p->Attr("data"), "data"); @@ -78,20 +78,22 @@ Map BufferAttrs(const tir::Buffer& buffer, const ObjectPath& p, { String scope = buffer.scope(); if (scope != "global") { - kwargs.Set("scope", LiteralDoc::Str(scope)); + kwargs.Set( + "scope", + LiteralDoc::Str(scope, p->Attr("data")->Attr("type_annotation")->Attr("storage_scope"))); } } // Step 7. Handle `buffer.data_alignment` if (buffer->data_alignment != runtime::kAllocAlignment) { - kwargs.Set("align", LiteralDoc::Int(buffer->data_alignment)); + kwargs.Set("align", LiteralDoc::Int(buffer->data_alignment, p->Attr("data_alignment"))); } // Step 8. Handle `buffer.offset_factor` if (needs_print_factor || buffer->offset_factor != 1) { - kwargs.Set("offset_factor", LiteralDoc::Int(buffer->offset_factor)); + kwargs.Set("offset_factor", LiteralDoc::Int(buffer->offset_factor, p->Attr("offset_factor"))); } // Step 9. Handle `buffer.buffer_type` if (buffer->buffer_type != tir::BufferType::kDefault) { - kwargs.Set("type", LiteralDoc::Str("auto")); + kwargs.Set("type", LiteralDoc::Str("auto", p->Attr("buffer_type"))); } // Step 10. Handle `buffer.axis_separator` if (!buffer->axis_separators.empty()) { @@ -130,7 +132,8 @@ ExprDoc BufferAttn(const tir::Buffer& buffer, const ObjectPath& p, const Frame& const IRDocsifier& d) { Map attrs = BufferAttrs(buffer, p, frame, d); ExprDoc shape = attrs.Get("shape").value(); - ExprDoc dtype = attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype)); + ExprDoc dtype = + attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype, p->Attr("dtype"))); return TIR("Buffer")->Call({shape, dtype}, {}, {}); } diff --git a/src/script/printer/tir/expr.cc b/src/script/printer/tir/expr.cc index 1f2ba97700cb..6e0cfd420262 100644 --- a/src/script/printer/tir/expr.cc +++ b/src/script/printer/tir/expr.cc @@ -24,17 +24,17 @@ namespace tvm { namespace script { namespace printer { -Doc PrintVar(const tir::Var& var, const ObjectPath& p, const IRDocsifier& d) { +Doc PrintVar(const tir::Var& var, const ObjectPath& var_p, const IRDocsifier& d) { if (!d->IsVarDefined(var)) { if (Optional opt_f = FindLowestVarDef(var, d)) { ExprDoc lhs = DefineVar(var, opt_f.value(), d); Type type = var->type_annotation; if (const auto* ptr_type = type.as()) { ICHECK(ptr_type->element_type->IsInstance()); - ExprDoc rhs = d->AsDoc(type, p->Attr("type_annotation")); + ExprDoc rhs = d->AsDoc(type, var_p->Attr("type_annotation")); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } else { - ExprDoc rhs = TIR("var")->Call({LiteralDoc::DataType(var->dtype)}); + ExprDoc rhs = TIR("var")->Call({LiteralDoc::DataType(var->dtype, var_p->Attr("dtype"))}); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } } @@ -56,13 +56,13 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](tir::IterVar var, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](tir::IterVar var, ObjectPath var_p, IRDocsifier d) -> Doc { return TIR("iter_var") ->Call({ - d->AsDoc(var->var, p->Attr("var")), - d->AsDoc(var->dom, p->Attr("dom")), - LiteralDoc::Str(IterVarType2String(var->iter_type)), - LiteralDoc::Str(var->thread_tag), + d->AsDoc(var->var, var_p->Attr("var")), + d->AsDoc(var->dom, var_p->Attr("dom")), + LiteralDoc::Str(IterVarType2String(var->iter_type), var_p->Attr("iter_type")), + LiteralDoc::Str(var->thread_tag, var_p->Attr("thread_tag")), }); }); @@ -82,7 +82,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) 
TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Cast cast, ObjectPath p, IRDocsifier d) -> Doc { - ExprDoc dtype = LiteralDoc::DataType(cast->dtype); + ExprDoc dtype = LiteralDoc::DataType(cast->dtype, p->Attr("dtype")); ExprDoc value = d->AsDoc(cast->value, p->Attr("value")); return TIR("Cast")->Call({dtype, value}); }); @@ -97,20 +97,20 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](tir::Ramp ramp, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](tir::Ramp ramp, ObjectPath ramp_p, IRDocsifier d) -> Doc { return TIR("Ramp")->Call({ - d->AsDoc(ramp->base, p->Attr("base")), - d->AsDoc(ramp->stride, p->Attr("stride")), - LiteralDoc::Int(ramp->lanes), + d->AsDoc(ramp->base, ramp_p->Attr("base")), + d->AsDoc(ramp->stride, ramp_p->Attr("stride")), + LiteralDoc::Int(ramp->lanes, ramp_p->Attr("lanes")), }); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](tir::Broadcast bc, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](tir::Broadcast bc, ObjectPath bc_p, IRDocsifier d) -> Doc { return TIR("Broadcast") ->Call({ - d->AsDoc(bc->value, p->Attr("value")), - LiteralDoc::Int(bc->lanes), + d->AsDoc(bc->value, bc_p->Attr("value")), + LiteralDoc::Int(bc->lanes, bc_p->Attr("lanes")), }); }); @@ -165,7 +165,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](tir::Call call, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](tir::Call call, ObjectPath call_p, IRDocsifier d) -> Doc { static const OpAttrMap& op_names = Op::GetAttrMap("TScriptPrinterName"); static const std::unordered_set dtype_first_arg = { @@ -196,7 +196,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } prefix = TIR(name); } else if (const auto* gv = call->op.as()) { - prefix = LiteralDoc::Str(gv->name_hint); + prefix = LiteralDoc::Str(gv->name_hint, call_p->Attr("op")); } else { LOG(FATAL) << "call: " << call; } @@ -204,13 +204,13 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) int n_args = call->args.size(); args.reserve(n_args + 1); if (dtype_first_arg.count(call->op.get())) { - args.push_back(LiteralDoc::DataType(call->dtype)); + args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype"))); } for (int i = 0; i < n_args; ++i) { - args.push_back(d->AsDoc(call->args[i], p->Attr("args")->ArrayIndex(i))); + args.push_back(d->AsDoc(call->args[i], call_p->Attr("args")->ArrayIndex(i))); } if (dtype_last_arg.count(call->op.get())) { - args.push_back(LiteralDoc::DataType(call->dtype)); + args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype"))); } return prefix->Call(args); }); @@ -227,7 +227,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) ExprDoc init = d->AsDoc(r->init, p->Attr("init")); ExprDoc axis = d->AsDoc(r->axis, p->Attr("axis")); ExprDoc condition = d->AsDoc(r->condition, p->Attr("condition")); - ExprDoc value_index = LiteralDoc::Int(r->value_index); + ExprDoc value_index = LiteralDoc::Int(r->value_index, p->Attr("value_index")); return TIR("reduce")->Call({combiner}, {"source", "init", "axis", "condition", "value_index"}, {source, init, axis, condition, value_index}); LOG(FATAL) << "ValueError: Reduce should never exist in TIR: " << r; diff --git a/src/script/printer/tir/for_loop.cc b/src/script/printer/tir/for_loop.cc index c8e2580f9c6f..2a81c37061c6 100644 --- a/src/script/printer/tir/for_loop.cc +++ b/src/script/printer/tir/for_loop.cc @@ -23,7 +23,7 @@ namespace script { namespace printer { 
TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](tir::For loop, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](tir::For loop, ObjectPath loop_p, IRDocsifier d) -> Doc { // Step 1. Check syntactic sugar: `T.grid` std::vector grid; std::unordered_set grid_loop_vars; @@ -55,10 +55,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) for (int i = 0; i < n; ++i) { const tir::ForNode* loop = grid[i]; lhs.push_back(DefineVar(loop->loop_var, *f, d)); - rhs.push_back(d->AsDoc(loop->extent, p->Attr("extent"))); - p = p->Attr("body"); + rhs.push_back(d->AsDoc(loop->extent, loop_p->Attr("extent"))); + loop_p = loop_p->Attr("body"); } - AsDocBody(grid.back()->body, p, (*f).get(), d); + AsDocBody(grid.back()->body, loop_p, (*f).get(), d); return ForDoc(TupleDoc(lhs), TIR("grid")->Call(rhs), (*f)->stmts); } // Step 3. If not `T.grid`, print loop kind accordingly @@ -68,13 +68,13 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) Optional annotations = NullOpt; Optional thread = NullOpt; if (tir::is_zero(loop->min)) { - max = d->AsDoc(loop->extent, p->Attr("extent")); + max = d->AsDoc(loop->extent, loop_p->Attr("extent")); } else { - min = d->AsDoc(loop->min, p->Attr("min")); - max = d->AsDoc(loop->min + loop->extent, p->Attr("extent")); + min = d->AsDoc(loop->min, loop_p->Attr("min")); + max = d->AsDoc(loop->min + loop->extent, loop_p->Attr("extent")); } if (!loop->annotations.empty()) { - annotations = d->AsDoc(loop->annotations, p->Attr("annotations")); + annotations = d->AsDoc(loop->annotations, loop_p->Attr("annotations")); } ExprDoc prefix{nullptr}; if (loop->kind == tir::ForKind::kSerial) { @@ -91,7 +91,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) prefix = TIR("vectorized"); } else if (loop->kind == tir::ForKind::kThreadBinding) { prefix = TIR("thread_binding"); - thread = LiteralDoc::Str(loop->thread_binding.value()->thread_tag); + thread = LiteralDoc::Str(loop->thread_binding.value()->thread_tag, + loop_p->Attr("thread_binding")); } else { LOG(FATAL) << "ValueError: Unknown ForKind: " << tir::ForKind2String(loop->kind); } @@ -113,7 +114,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) kwargs_values.push_back(annotations.value()); } ExprDoc rhs = prefix->Call(args, kwargs_keys, kwargs_values); - AsDocBody(loop->body, p, (*f).get(), d); + AsDocBody(loop->body, loop_p, (*f).get(), d); return ForDoc(lhs, rhs, (*f)->stmts); }); diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc index ad00c42119f6..1214f822610c 100644 --- a/src/script/printer/tir/ir.cc +++ b/src/script/printer/tir/ir.cc @@ -27,26 +27,26 @@ namespace printer { TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](IntImm imm, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](IntImm imm, ObjectPath imm_p, IRDocsifier d) -> Doc { DataType dtype = imm->dtype; if (dtype == Default::IntDType()) { - return LiteralDoc::Int(imm->value); + return LiteralDoc::Int(imm->value, imm_p->Attr("value")); } else if (dtype == DataType::Bool()) { - return LiteralDoc::Boolean(imm->value); + return LiteralDoc::Boolean(imm->value, imm_p->Attr("value")); } else { return TIR(runtime::DLDataType2String(dtype)) // - ->Call({LiteralDoc::Int(imm->value)}); + ->Call({LiteralDoc::Int(imm->value, imm_p->Attr("value"))}); } }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](FloatImm imm, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](FloatImm imm, ObjectPath imm_p, IRDocsifier d) -> Doc { DataType dtype = imm->dtype; if (dtype == 
Default::FloatDType()) { - return LiteralDoc::Float(imm->value); + return LiteralDoc::Float(imm->value, imm_p->Attr("value")); } else { return TIR(runtime::DLDataType2String(dtype)) // - ->Call({LiteralDoc::Float(imm->value)}); + ->Call({LiteralDoc::Float(imm->value, imm_p->Attr("value"))}); } }); @@ -65,26 +65,26 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("", [](PointerType ty, ObjectPath p, IRDocsifier d) -> Doc { + .set_dispatch("", [](PointerType ty, ObjectPath ty_p, IRDocsifier d) -> Doc { ExprDoc element_type{nullptr}; if (const auto* prim_type = ty->element_type.as()) { - std::string dtype = - prim_type->dtype.is_void() ? "void" : runtime::DLDataType2String(prim_type->dtype); - element_type = LiteralDoc::Str(dtype); + element_type = LiteralDoc::DataType(prim_type->dtype, // + ty_p->Attr("element_type")->Attr("dtype")); } else { - element_type = d->AsDoc(ty->element_type, p->Attr("element_type")); + element_type = d->AsDoc(ty->element_type, ty_p->Attr("element_type")); } if (ty->storage_scope == "") { return TIR("Ptr")->Call({element_type}); } else { - return TIR("Ptr")->Call({element_type, LiteralDoc::Str(ty->storage_scope)}); + return TIR("Ptr")->Call( + {element_type, LiteralDoc::Str(ty->storage_scope, ty_p->Attr("storage_scope"))}); } }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](TupleType ty, ObjectPath p, IRDocsifier d) -> Doc { if (ty->fields.empty()) { - return LiteralDoc::None(); + return LiteralDoc::None(p); } return TIR("Tuple")->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); }); diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index 57b4c695a4ee..7c8d44c10e72 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -173,31 +173,35 @@ bool IsAllocateDeclBufferPattern(const tir::AllocateNode* allocate) { TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // - "", [](tir::Allocate stmt, ObjectPath p, IRDocsifier d) -> Doc { + "", [](tir::Allocate stmt, ObjectPath stmt_p, IRDocsifier d) -> Doc { bool concise = AllowConciseScoping(d); OccurrenceCounter counter(stmt->buffer_var.get()); counter(stmt->body); if (counter.count == 1 && IsAllocateDeclBufferPattern(stmt.get())) { - return d->AsDoc(stmt->body, p->Attr("body")); + return d->AsDoc(stmt->body, stmt_p->Attr("body")); } - String storage_scope = tir::GetPtrStorageScope(stmt->buffer_var); Array args; Array kwargs_keys; Array kwargs_values; - args.push_back(d->AsDoc(stmt->extents, p->Attr("extents"))); - args.push_back(LiteralDoc::DataType(stmt->dtype)); - args.push_back(LiteralDoc::Str(storage_scope)); + args.push_back(d->AsDoc(stmt->extents, stmt_p->Attr("extents"))); + args.push_back(LiteralDoc::DataType(stmt->dtype, stmt_p->Attr("dtype"))); + args.push_back(LiteralDoc::Str(tir::GetPtrStorageScope(stmt->buffer_var), + stmt_p + ->Attr("buffer_var") // + ->Attr("type_annotation") + ->Attr("storage_scope"))); if (!tir::is_one(stmt->condition)) { - args.push_back(d->AsDoc(stmt->condition, p->Attr("condition"))); + args.push_back(d->AsDoc(stmt->condition, stmt_p->Attr("condition"))); } if (!stmt->annotations.empty()) { kwargs_keys.push_back("annotations"); - kwargs_values.push_back(d->AsDoc(stmt->annotations, p->Attr("annotations"))); + kwargs_values.push_back( + d->AsDoc(stmt->annotations, stmt_p->Attr("annotations"))); } ExprDoc lhs = DefineVar(stmt->buffer_var, d->frames.back(), d); With f(d, stmt); ExprDoc rhs = TIR("allocate")->Call(args, kwargs_keys, 
kwargs_values); - AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + AsDocBody(stmt->body, stmt_p->Attr("body"), f->get(), d); return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); }); @@ -215,9 +219,9 @@ ExprDoc PrintNDArray(::tvm::runtime::NDArray arr) { runtime::DataType dtype = arr.DataType(); for (int i = 0; i < tot_dim; i++) { if (dtype.is_float()) { - result.push_back(LiteralDoc::Float(data_ptr[i])); + result.push_back(LiteralDoc::Float(data_ptr[i], NullOpt)); } else { - result.push_back(LiteralDoc::Int(data_ptr[i])); + result.push_back(LiteralDoc::Int(data_ptr[i], NullOpt)); } if (i == NUM_PRINT) { break; @@ -228,7 +232,7 @@ ExprDoc PrintNDArray(::tvm::runtime::NDArray arr) { TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( - "", [](tir::AllocateConst stmt, ObjectPath p, IRDocsifier d) -> Doc { + "", [](tir::AllocateConst stmt, ObjectPath stmt_p, IRDocsifier d) -> Doc { bool concise = AllowConciseScoping(d); String storage_scope = tir::GetPtrStorageScope(stmt->buffer_var); Array args; @@ -273,12 +277,12 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) LOG(FATAL) << "DataType not supported"; } args.push_back(data_doc); - args.push_back(LiteralDoc::DataType(stmt->dtype)); - args.push_back(d->AsDoc(stmt->extents, p->Attr("extents"))); + args.push_back(LiteralDoc::DataType(stmt->dtype, stmt_p->Attr("dtype"))); + args.push_back(d->AsDoc(stmt->extents, stmt_p->Attr("extents"))); ExprDoc rhs = TIR("allocate_const")->Call(args, kwargs_keys, kwargs_values); With f(d, stmt); ExprDoc lhs = DefineVar(stmt->buffer_var, *f, d); - AsDocBody(stmt->body, p->Attr("body"), f->get(), d); + AsDocBody(stmt->body, stmt_p->Attr("body"), f->get(), d); return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); }); @@ -323,18 +327,18 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // - "", [](tir::AttrStmt stmt, ObjectPath p, IRDocsifier d) -> Doc { + "", [](tir::AttrStmt stmt, ObjectPath stmt_p, IRDocsifier d) -> Doc { bool concise = AllowConciseScoping(d); Optional rhs = NullOpt; tir::Stmt body = stmt->body; - ObjectPath body_p = p->Attr("body"); + ObjectPath body_p = stmt_p->Attr("body"); if (stmt->attr_key == "realize_scope") { if (const auto* realize = stmt->body.as()) { if (realize->buffer.same_as(stmt->node)) { - rhs = - DocsifyBufferRealize(realize, - /*value=*/d->AsDoc(stmt->value, p->Attr("value")), - /*p=*/p->Attr("body"), d); + rhs = DocsifyBufferRealize( + realize, + /*value=*/d->AsDoc(stmt->value, stmt_p->Attr("value")), + /*p=*/stmt_p->Attr("body"), d); body = realize->body; body_p = body_p->Attr("body"); } @@ -344,25 +348,28 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (const auto* iter_var = stmt->node.as()) { if (!d->IsVarDefined(iter_var->var)) { // `DefineVar` is not used here because a more specific name is desirable + ObjectPath iter_var_p = stmt_p->Attr("node"); Frame f = FindLowestVarDef(iter_var->var, d).value(); DefineVar(iter_var->var, f, d); f->stmts.push_back( - AssignDoc(d->AsDoc(iter_var->var, p->Attr("node")->Attr("var")), - TIR("env_thread")->Call({LiteralDoc::Str(iter_var->thread_tag)}), // + AssignDoc(d->AsDoc(iter_var->var, iter_var_p->Attr("var")), + TIR("env_thread") + ->Call({LiteralDoc::Str(iter_var->thread_tag, + iter_var_p->Attr("thread_tag"))}), // NullOpt)); } rhs = TIR("launch_thread") ->Call({ - d->AsDoc(iter_var->var, p->Attr("node")), - d->AsDoc(stmt->value, p->Attr("value")), + d->AsDoc(iter_var->var, stmt_p->Attr("node")), + d->AsDoc(stmt->value, stmt_p->Attr("value")), }); } } if 
(!rhs.defined()) { rhs = TIR("attr")->Call({ - d->AsDoc(stmt->node, p->Attr("node")), - LiteralDoc::Str(stmt->attr_key), - d->AsDoc(stmt->value, p->Attr("value")), + d->AsDoc(stmt->node, stmt_p->Attr("node")), + LiteralDoc::Str(stmt->attr_key, stmt_p->Attr("attr_key")), + d->AsDoc(stmt->value, stmt_p->Attr("value")), }); } With f(d, stmt); diff --git a/src/tir/analysis/control_flow_graph.cc b/src/tir/analysis/control_flow_graph.cc index de9da80140e4..86ce4e21351f 100644 --- a/src/tir/analysis/control_flow_graph.cc +++ b/src/tir/analysis/control_flow_graph.cc @@ -25,7 +25,6 @@ #include "control_flow_graph.h" #include -#include #include #include #include From 1d3139b0973087e953314dc3aed345f82b124470 Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 24 Jan 2023 04:02:38 +0300 Subject: [PATCH 213/286] [Metaschedule] get_top_k should not return not built records (#13824) * [Metaschedule] get_top_k should not return not built records * [Metaschedule][NFC] GetTopK extra polishing --- src/meta_schedule/database/json_database.cc | 9 +++- src/meta_schedule/database/memory_database.cc | 44 ++++++++----------- .../unittest/test_meta_schedule_database.py | 16 +++++-- 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index b0fba5adb5c2..0e51e262df0f 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -127,7 +127,14 @@ class JSONDatabaseNode : public DatabaseNode { Array results; results.reserve(top_k); for (const TuningRecord& record : this->tuning_records_) { - if (!record->run_secs.defined() || record->run_secs.value().empty()) { + auto run_secs = record->run_secs; + if (!run_secs.defined() || run_secs.value().empty() || + std::all_of(run_secs.value().begin(), run_secs.value().end(), + // kMaxMeanTime(1e10) is used as a stub for undefined measurement times. + [](tvm::FloatImm v) { + return v.defined() && + v->value == SortTuningRecordByMeanRunSecs::kMaxMeanTime; + })) { continue; } if (record->workload.same_as(workload) || diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc index 19178a35f456..8cbde46f83b7 100644 --- a/src/meta_schedule/database/memory_database.cc +++ b/src/meta_schedule/database/memory_database.cc @@ -65,42 +65,34 @@ class MemoryDatabaseNode : public DatabaseNode { if (top_k == 0) { return {}; } - std::vector> results; + std::vector results; results.reserve(records.size()); for (const TuningRecord& record : records) { - if (!record->run_secs.defined()) { - continue; - } - Array run_secs = record->run_secs.value(); - if (run_secs.empty()) { + auto run_secs = record->run_secs; + if (!run_secs.defined() || run_secs.value().empty() || + std::all_of(run_secs.value().begin(), run_secs.value().end(), + // kMaxMeanTime(1e10) is used as a stub for undefined measurement times. 
+                  [](tvm::FloatImm v) {
+                    return v.defined() &&
+                           v->value == SortTuningRecordByMeanRunSecs::kMaxMeanTime;
+                  })) {
         continue;
       }
       if (record->workload.same_as(workload) ||
           WorkloadEqual(GetModuleEquality())(record->workload, workload)) {
-        double sum = 0.0;
-        for (const FloatImm& i : run_secs) {
-          sum += i->value;
-        }
-        results.emplace_back(sum / run_secs.size(), record);
+        results.emplace_back(record);
       }
     }
-    std::sort(results.begin(), results.end());
-    auto begin = results.begin();
-    auto end = results.end();
+    std::stable_sort(results.begin(), results.end(), SortTuningRecordByMeanRunSecs());
     if (results.size() > static_cast<size_t>(top_k)) {
-      end = begin + top_k;
-    }
-    Array<TuningRecord> ret;
-    ret.reserve(end - begin);
-    while (begin != end) {
-      ret.push_back(begin->second);
-      ++begin;
-    }
-    if (ret.size() < static_cast<size_t>(top_k)) {
-      LOG(WARNING) << "The size of the GetTopK result is smaller than requested. There are not "
-                      "enough valid records in the database for this workload.";
+      return {results.begin(), results.begin() + top_k};
+    } else {
+      if (results.size() < static_cast<size_t>(top_k)) {
+        LOG(WARNING) << "The size of the GetTopK result is smaller than requested. There are not "
+                        "enough valid records in the database for this workload.";
+      }
+      return results;
     }
-    return ret;
   }

   Array<TuningRecord> GetAllTuningRecords() final { return records; }

diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py
index 4ec10b556c3b..d4681d40111b 100644
--- a/tests/python/unittest/test_meta_schedule_database.py
+++ b/tests/python/unittest/test_meta_schedule_database.py
@@ -554,10 +554,14 @@ def call_get_top_k(run_secs_list, database, k):

 @pytest.mark.parametrize(
     "k,expected",
-    [(0, []), (3, [[0.0, 2.0], [2.0], [1.5, 4.5]]), (5, [[0.0, 2.0], [2.0], [1.5, 4.5]])],
+    [
+        (0, []),
+        (4, [[0.0, 2.0], [2.0], [1.5, 4.5], [3.0, 1e10]]),
+        (5, [[0.0, 2.0], [2.0], [1.5, 4.5], [3.0, 1e10]]),
+    ],
 )
 def test_memory_database_get_top_k(k, expected):
-    run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0]]
+    run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0], [3.0, 1e10], [1e10]]
     database = ms.database.MemoryDatabase()
     result = call_get_top_k(run_secs_list, database, k)
     assert result == expected

@@ -565,10 +569,14 @@ def test_memory_database_get_top_k(k, expected):

 @pytest.mark.parametrize(
     "k,expected",
-    [(0, []), (3, [[0.0, 2.0], [2.0], [1.5, 4.5]]), (5, [[0.0, 2.0], [2.0], [1.5, 4.5]])],
+    [
+        (0, []),
+        (4, [[0.0, 2.0], [2.0], [1.5, 4.5], [3.0, 1e10]]),
+        (5, [[0.0, 2.0], [2.0], [1.5, 4.5], [3.0, 1e10]]),
+    ],
 )
 def test_json_database_get_top_k(k, expected):
-    run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0]]
+    run_secs_list = [[1.5, 4.5], [], [0.0, 2.0], None, [2.0], [3.0, 1e10], [1e10]]
     with tempfile.TemporaryDirectory() as tmpdir:
         database = _create_tmp_database(tmpdir)
         result = call_get_top_k(run_secs_list, database, k)

From cf48327c32b0e0271f3019becff963c30676bc85 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 24 Jan 2023 07:02:15 +0300
Subject: [PATCH 214/286] [Hexagon][Metaschedule] Add timeout_sec arg to
 get_hexagon_local_builder (#13828)

---
 python/tvm/contrib/hexagon/meta_schedule.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py
index 6e1541e498a9..92298c011d4a 100644
--- a/python/tvm/contrib/hexagon/meta_schedule.py
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -129,7 +129,9 @@ def _worker_func(hexagon_launcher,
evaluator_config, alloc_repeat, artifact_path def get_hexagon_local_builder( - pass_context: tvm.transform.PassContext = None, max_workers: Optional[int] = None + pass_context: tvm.transform.PassContext = None, + max_workers: Optional[int] = None, + timeout_sec: float = 30.0, ): """Return Hexagon-compatible Builder for meta schedule.""" @@ -146,10 +148,13 @@ def default_build_with_context( if pass_context is not None: return LocalBuilder( - f_build=default_build_with_context, f_export=export_func, max_workers=max_workers + f_build=default_build_with_context, + f_export=export_func, + max_workers=max_workers, + timeout_sec=timeout_sec, ) else: - return LocalBuilder(f_export=export_func, max_workers=max_workers) + return LocalBuilder(f_export=export_func, max_workers=max_workers, timeout_sec=timeout_sec) def get_hexagon_rpc_runner( From f7b5c105a0d5956089369294e989c07214192ade Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Mon, 23 Jan 2023 23:13:35 -0800 Subject: [PATCH 215/286] [TVMScript] More concise `T.allocate` syntax printing (#13830) This PR is the follow up of #13813. We simplify the printing output of `T.allocate` with `T.decl_buffer`. For example, we have a code snippet as ```python buffer_data = T.allocate(...) buffer = T.decl_buffer(..., data=buffer_data) T.evaluate(buffer_data) ``` Originally, we skip the `T.allocate` only if the var `buffer_data` defined by `T.allocate` is used only once by the following `T.decl_buffer`. This was due to the limitation of the old printer design. But in the new printer, we may automatically replace the `buffer_data` with `buffer.data` if skipping the definition of `buffer_data`. We are able to link all `buffer_data` usages together. So the new output result will be like ```python buffer = T.decl_buffer(...) 
T.evaluate(buffer.data)
```

---
 src/script/printer/tir/stmt.cc                      | 4 +---
 tests/python/unittest/test_tvmscript_printer_tir.py | 7 +++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc
index 7c8d44c10e72..acdfd7da472b 100644
--- a/src/script/printer/tir/stmt.cc
+++ b/src/script/printer/tir/stmt.cc
@@ -175,9 +175,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tir::Allocate>(  //
         "", [](tir::Allocate stmt, ObjectPath stmt_p, IRDocsifier d) -> Doc {
           bool concise = AllowConciseScoping(d);
-          OccurrenceCounter counter(stmt->buffer_var.get());
-          counter(stmt->body);
-          if (counter.count == 1 && IsAllocateDeclBufferPattern(stmt.get())) {
+          if (IsAllocateDeclBufferPattern(stmt.get())) {
             return d->AsDoc<Doc>(stmt->body, stmt_p->Attr("body"));
           }
           Array<ExprDoc> args;
diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py
index d57d10467077..c73ae291930c 100644
--- a/tests/python/unittest/test_tvmscript_printer_tir.py
+++ b/tests/python/unittest/test_tvmscript_printer_tir.py
@@ -343,7 +343,7 @@ def test_allocate_with_decl_buffer_sugar():
     )


-def test_allocate_with_decl_buffer_no_sugar_multi_usage():
+def test_allocate_with_decl_buffer_sugar_multi_usage():
     with IRBuilder() as ib:
         with T.allocate([128, 128], "float32") as buffer_data:
             with T.decl_buffer([128, 128], "float32", data=buffer_data) as buffer:
@@ -352,9 +352,8 @@ def test_allocate_with_decl_buffer_no_sugar_multi_usage():
     _assert_print(
         obj,
         """
-with T.allocate([128, 128], "float32", "global") as v:
-    buffer = T.decl_buffer((128, 128), data=v)
-    T.evaluate(v)
+with T.decl_buffer((128, 128)) as buffer:
+    T.evaluate(buffer.data)
""",
     )

From 2d633577f9acaa381d1a64df8a6769e131338505 Mon Sep 17 00:00:00 2001
From: Siva
Date: Tue, 24 Jan 2023 18:41:54 +0530
Subject: [PATCH 216/286] [TOOL][NATIVE] Android native application for deploy
 and run (#13791)

* [TOOL][NATIVE] Android native application for deploy and run

This application serves as a reference for verifying and integrating
TVM-compiled models natively on Android targets, independent of the RPC
setup. tvmc is used for compiling and tuning the model, and for running it
before deployment.

This PR also covers
* Enabling CLML for the tvmc compilation tool.
* A graph runtime API "get_output_info" that returns the output tensor
  specification, similar to "get_input_info".
* Adding and enabling the 3rdparty dependency "cnpy" to deal with npz files.

* Update apps/cpp_rtvm/README.md

Co-authored-by: Egor Churaev

* Update apps/cpp_rtvm/README.md

Co-authored-by: Egor Churaev

* review comments

* proof reading

* Update apps/cpp_rtvm/README.md

Co-authored-by: Egor Churaev

* review

Co-authored-by: Egor Churaev
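
A usage sketch for the new executor query (the exact return structure is an
assumption here, inferred from its "get_input_info" counterpart):

```python
# Minimal sketch: `gmod` is assumed to be a graph_executor.GraphModule created
# from the compiled artifact. "get_output_info" is the packed function added
# by this patch; it is assumed to return {"shape": ..., "dtype": ...} maps
# keyed by output tensor name, like get_input_info does for inputs.
output_info = gmod.module["get_output_info"]()
print(dict(output_info["shape"]), dict(output_info["dtype"]))
```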
---
 .gitmodules                                  |   3 +
 3rdparty/cnpy                                |   1 +
 CMakeLists.txt                               |   4 +
 LICENSE                                      |   1 +
 apps/cpp_rtvm/CMakeLists.txt                 |  98 +++++
 apps/cpp_rtvm/README.md                      | 354 +++++++++++++++++++
 apps/cpp_rtvm/main.cc                        | 264 ++++++++++++++
 apps/cpp_rtvm/scripts/download_models.py     |  36 ++
 apps/cpp_rtvm/tvm_runner.cc                  | 320 +++++++++++++++++
 apps/cpp_rtvm/tvm_runner.h                   |  93 +++++
 cmake/config.cmake                           |   3 +
 cmake/modules/LibInfo.cmake                  |   1 +
 python/tvm/driver/tvmc/composite_target.py   |   5 +
 python/tvm/relay/op/contrib/clml.py          |   2 +-
 src/auto_scheduler/search_task.cc            |   8 +
 src/runtime/graph_executor/graph_executor.cc |  44 ++-
 src/runtime/graph_executor/graph_executor.h  |   6 +
 src/support/libinfo.cc                       |   5 +
 tests/scripts/task_build_adreno_bins.sh      |   3 +-
 19 files changed, 1248 insertions(+), 3 deletions(-)
 create mode 160000 3rdparty/cnpy
 create mode 100644 apps/cpp_rtvm/CMakeLists.txt
 create mode 100644 apps/cpp_rtvm/README.md
 create mode 100644 apps/cpp_rtvm/main.cc
 create mode 100644 apps/cpp_rtvm/scripts/download_models.py
 create mode 100644 apps/cpp_rtvm/tvm_runner.cc
 create mode 100644 apps/cpp_rtvm/tvm_runner.h

diff --git a/.gitmodules b/.gitmodules
index 64c1a30050bc..d3366b5662c4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -22,3 +22,6 @@
 [submodule "3rdparty/gemmini"]
 	path = 3rdparty/gemmini
 	url = https://github.com/ucb-bar/gemmini
+[submodule "3rdparty/cnpy"]
+	path = 3rdparty/cnpy
+	url = https://github.com/rogersce/cnpy.git
diff --git a/3rdparty/cnpy b/3rdparty/cnpy
new file mode 160000
index 000000000000..4e8810b1a863
--- /dev/null
+++ b/3rdparty/cnpy
@@ -0,0 +1 @@
+Subproject commit 4e8810b1a8637695171ed346ce68f6984e585ef4
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 060ac65592a5..46be2d52fd90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -597,6 +597,10 @@ if(USE_CPP_RPC)
   add_subdirectory("apps/cpp_rpc")
 endif()

+if(USE_CPP_RTVM)
+  add_subdirectory("apps/cpp_rtvm")
+endif()
+
 if(USE_IOS_RPC)
   add_subdirectory("apps/ios_rpc")
 endif()
diff --git a/LICENSE b/LICENSE
index fbc11be2deb5..2705be863f50 100644
--- a/LICENSE
+++ b/LICENSE
@@ -234,6 +234,7 @@ MIT License
 3rdparty/libcrc
 3rdparty/cma
 3rdparty/compiler-rt/builtin_fp16.h
+3rdparty/cnpy

 The Unlicense
 -------------
diff --git a/apps/cpp_rtvm/CMakeLists.txt b/apps/cpp_rtvm/CMakeLists.txt
new file mode 100644
index 000000000000..bfd26ee3fe9f
--- /dev/null
+++ b/apps/cpp_rtvm/CMakeLists.txt
@@ -0,0 +1,98 @@
+cmake_policy(SET CMP0069 NEW) # suppress cmake warning about IPO
+
+set(RTVM_SOURCES
+  main.cc
+  tvm_runner.cc
+  ../../3rdparty/cnpy/cnpy.cpp
+)
+set(TVM_RUNNER_SOURCES
+  tvm_runner.cc
+  ../../3rdparty/cnpy/cnpy.cpp
+)
+
+set(RTVM_LINKER_LIBS "")
+
+if(WIN32)
+  list(APPEND RTVM_SOURCES win32_process.cc)
+  list(APPEND TVM_RUNNER_SOURCES win32_process.cc)
+endif()
+
+# Set output to same directory as the other TVM libs
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+add_executable(rtvm ${RTVM_SOURCES})
+add_library(tvm_runner_objs OBJECT ${TVM_RUNNER_SOURCES})
+add_library(tvm_runner SHARED $<TARGET_OBJECTS:tvm_runner_objs>)
+
+include(CheckIPOSupported)
+check_ipo_supported(RESULT result OUTPUT output)
+if(result)
+  set_property(TARGET rtvm PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)
+endif()
+
+if(WIN32)
+  target_compile_definitions(rtvm PUBLIC -DNOMINMAX)
+endif()
+
+if (OS)
+  if (OS STREQUAL "Linux")
+    set_property(TARGET rtvm PROPERTY LINK_FLAGS -lpthread)
+    set_property(TARGET tvm_runner PROPERTY LINK_FLAGS -lpthread)
+  endif()
+endif()
+
+if(USE_OPENCL)
+  if (ANDROID_ABI)
+    if(DEFINED ENV{ANDROID_NDK_MAJOR})
+      if($ENV{ANDROID_NDK_MAJOR} VERSION_LESS "23")
+        set_property(TARGET rtvm PROPERTY LINK_FLAGS -fuse-ld=gold)
+        set_property(TARGET tvm_runner PROPERTY LINK_FLAGS -fuse-ld=gold)
+      endif()
+    endif()
+  endif()
+endif()
+
+target_include_directories(
+  rtvm
+  PUBLIC "../../include"
+  PUBLIC "../../3rdparty/cnpy"
+  PUBLIC DLPACK_PATH
+  PUBLIC DMLC_PATH
+)
+
+if (BUILD_FOR_ANDROID AND USE_HEXAGON)
+  get_hexagon_sdk_property("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}"
+    DSPRPC_LIB DSPRPC_LIB_DIRS
+  )
+  if(DSPRPC_LIB_DIRS)
+    link_directories(${DSPRPC_LIB_DIRS})
+  else()
+    message(WARNING "Could not locate some Hexagon SDK components")
+  endif()
+  list(APPEND RTVM_LINKER_LIBS cdsprpc log)
+endif()
+
+if(USE_ETHOSN)
+  if (ETHOSN_RUNTIME_LIBRARY)
+    list(APPEND RTVM_LINKER_LIBS ${ETHOSN_RUNTIME_LIBRARY})
+  else()
+    message(WARNING "Could not locate Arm(R) Ethos(TM)-N runtime library components")
+  endif()
+endif()
+
+if(BUILD_STATIC_RUNTIME)
+  list(APPEND RTVM_LINKER_LIBS -Wl,--whole-archive tvm_runtime -Wl,--no-whole-archive z)
+else()
+  list(APPEND RTVM_LINKER_LIBS tvm_runtime z)
+endif()
+
+target_link_libraries(rtvm ${RTVM_LINKER_LIBS})
+
+# Build tvm_runner as an exportable lib
+target_include_directories(
+  tvm_runner_objs
+  PUBLIC "../../include"
+  PUBLIC "../../3rdparty/cnpy"
+  PUBLIC DLPACK_PATH
+  PUBLIC DMLC_PATH
+)
+target_link_libraries(tvm_runner ${RTVM_LINKER_LIBS})
diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md
new file mode 100644
index 000000000000..e6961532823d
--- /dev/null
+++ b/apps/cpp_rtvm/README.md
@@ -0,0 +1,354 @@
+
+
+
+
+
+
+# Native Inference Application for C++ Native Deployment
+
+The native inference tool ```rtvm``` helps in deploying TVM compiled models from a standalone C++ environment.
+The overall process starts with getting a model from a framework and goes all the way up to running it on a target device using the `rtvm` tool.
+
+### Models
+
+Models can be obtained from well known frameworks like TensorFlow, PyTorch, TFLite, ONNX, etc.
+scripts/download_models.py shows, as a reference, how to prepare the sample network ```resnet50``` from the Keras framework.
+
+```bash
+python3 scripts/download_models.py
+```
+
+### Auto Tuning
+The auto tuning process tunes the various operators of the given model for the respective target. Auto tuning for remote devices uses ```tvm_rpc```, so we need to set up the RPC environment before we invoke tuning.
+Please refer to the section [RPC setup](#rpc-setup) below for the same.
+
+Auto tuning is necessary to obtain the best performing kernels. We can skip this step if we already have a tuning log or if the tuning cache is available from tophub (implicit in the TVM compilation process).
+The below message indicates that some kernels are not optimized for the selected target. In this case we can proceed with tuning for the best performance.
+```One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.```
+
+With the below environment from [RPC setup](#rpc-setup)
+``` bash
+tvm tracker running on ```TVM_TRACKER_HOST```
+tracker port being ```TVM_TRACKER_PORT```
+rpc device access key being ```TVM_RPC_KEY```
+the model to be tuned being ```./model_data/keras-resnet50/resnet50.h5```
+```
+
+the below command generates the tuning cache into the file ```./model_data/keras-resnet50/keras-resnet50.log```
+
+```bash
+python3 -m tvm.driver.tvmc tune --target="opencl" --target-host="llvm -mtriple=aarch64-linux-gnu" \
+./model_data/keras-resnet50/resnet50.h5 -o ./model_data/keras-resnet50/keras-resnet50.log \
+--early-stopping 0 --repeat 30 --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --trials 1024 \
+--tuning-records ./model_data/keras-resnet50/keras-resnet50-records.log --tuner xgb
+```
+
+where
+```bash
+--target="opencl" refers to the opencl device on the Android device
+--target-host="llvm -mtriple=aarch64-linux-gnu" refers to the target_host being an ARM64 CPU
+Options --early-stopping, --repeat, --trials and --tuner are AutoTVM specific options.
+```
+Please refer to the AutoTVM documentation for more details [here](https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html?highlight=autotvm).
+
+### Compile the model
+
+The compilation step generates the TVM compiler output artifacts, which need to be taken to the target device for deployment.
+These artifacts form a compressed archive with the kernel shared lib, a json graph description and a params binary.
+
+The below command will generate the same
+
+
+```bash
+python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
+--target="opencl, llvm" --target-llvm-mtriple aarch64-linux-gnu -o keras-resnet50.tar ./model_data/keras-resnet50/resnet50.h5
+```
+
+where
+```
+--cross-compiler : Indicates the cross compiler path for kernel library generation
+--target="opencl, llvm" indicates the target and host devices
+```
+
+### Test Run via RPC
+
+At this stage we can verify the generated compiler output for execution correctness over the RPC interface.
+The below command runs the compiled output on the remote target device.
+
+with
+
+``` bash
+tvm tracker running on ```TVM_TRACKER_HOST```
+tracker port being ```TVM_TRACKER_PORT```
+rpc device access key being ```TVM_RPC_KEY```
+compilation output being keras-resnet50.tar
+```
+
+```bash
+python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --print-time
+```
+
+This feeds random inputs and validates the execution correctness of the compiled model.
+
+The ```tvmc``` tool has various options to input custom data, profile the model and benchmark the execution.
+
+
+### Deployment Run
+
+Now we will verify the deployment run of the compiled model using the ```rtvm``` tool on the target device, without any RPC or host based execution.
+
+We need to extract the tar archive on the target device.
+We can copy the extracted contents of ```keras-resnet50.tar``` under the Android temp folder at ```/data/local/tmp/keras-resnet50/```.
+Also copy the cross compiled tool ```rtvm``` and ```libtvm_runtime.so``` to ```/data/local/tmp/```, e.g. as sketched below.
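+
+A minimal sketch of these host side steps (an illustration only; the exact
+paths to the cross compiled ```rtvm``` and ```libtvm_runtime.so``` depend on
+your build tree, so ```path/to``` below is a placeholder):
+
+```bash
+# Extract the compiled archive and push everything to the device
+mkdir -p keras-resnet50 && tar -xvf keras-resnet50.tar -C keras-resnet50
+adb push keras-resnet50 /data/local/tmp/keras-resnet50
+adb push path/to/rtvm /data/local/tmp/
+adb push path/to/libtvm_runtime.so /data/local/tmp/
+```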
+
+```rtvm``` usage can be queried as below
+```bash
+Android:/data/local/tmp $ LD_LIBRARY_PATH=./ ./rtvm
+Command line usage
+--model        - The folder containing tvm artifacts(mod.so, mod.param, mod.json)
+--device       - The target device to use {llvm, opencl, cpu, cuda, metal, rocm, vpi, oneapi}
+--input        - Numpy file for the model input (optional and we use random if not given)
+--output       - Numpy file name to dump the model output as numpy
+--dump-meta    - Dump model meta information
+
+  Example
+  ./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
+  ./rtvm --model=keras-resnet50 --device="opencl" --input input.npz --output=output.npz
+```
+
+```rtvm``` can run the model with no inputs (just a dry run without any valid inputs) and also with a specific input supplied as a numpy npz format file.
+
+We can create an npz dump for all inputs by saving a dict object as shown below.
+
+With ```keras-resnet50``` having one input ```input_1``` with shape ```[1, 224, 224, 3]``` and dtype ```float32```
+
+```
+# Random initialization
+input1 = np.random.uniform(low=-1, high=1, size=(1, 224, 224, 3)).astype("float32")
+dataset = {"input_1": input1}
+np.savez("input.npz", **dataset)
+```
+
+Copy ```input.npz``` also to the target device as ```/data/local/tmp/input.npz```
+
+
+Now, in the Android shell, we can do a dry run as well as a run with specific inputs, as shown below.
+```bash
+# Query meta data information
+Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl --dump-meta
+. . . . . .
+Meta Information:keras-resnet50
+    Number of Inputs:183
+    Number of Outputs:1
+    Input MetaInfo:
+        Input:input_1
+            DType:float32
+            Shape:[1, 224, 224, 3]
+    Output MetaInfo:
+        Output:tvmgen_default_fused_nn_softmax
+            DType:float32
+            Shape:[1, 1000]
+. . . . . .
+
+# Dry run without any inputs
+Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl
+Model         = keras-resnet50
+Device        = opencl
+Input         =
+Output        =
+Dump Metadata = False
+TVMRunner Constructor:keras-resnet50 Devices:opencl
+TVMRunner Load:keras-resnet50
+TVMRunner::GetMetaInfo
+Executing dry run ...
+Set Random Input for :input_1
+TVMRunner::GetInputMemSize:input_1
+Random Input Size:602112 bytes
+TVMRunner::SetInput (Raw)
+TVMRunner::Run
+Get Output for :tvmgen_default_fused_nn_softmax
+TVMRunner::GetOutputMemSize:tvmgen_default_fused_nn_softmax
+TVMRunner::GetOutput (Raw)
+Output Size:4000 bytes
+
+
+# Run with input and dump output as npz file
+Android:/data/local/tmp/ $ LD_LIBRARY_PATH=./ ./rtvm --model=keras-resnet50 --device=opencl --input=input.npz --output=output.npz
+Model         = keras-resnet50
+Device        = opencl
+Input         = input.npz
+Output        = output.npz
+Dump Metadata = False
+TVMRunner Constructor:keras-resnet50 Devices:opencl
+TVMRunner Load:keras-resnet50
+TVMRunner::GetMetaInfo
+Executing with Input:input.npz Output:output.npz
+TVMRunner::SetInput (Numpy):input.npz
+Set Numpy Input for :input_1
+TVMRunner::Run
+TVMRunner::GetOutput (Numpy):output.npz
+Get Output for :tvmgen_default_fused_nn_softmax
+Output Size:4000 bytes
+```
+
+output.npz contains the model outputs. Below is a quick look at its contents.
+
+```bash
+tvm-host:~$ unzip -l output.npz
+Archive:  output.npz
+  Length      Date    Time    Name
+---------  ---------- -----   ----
+     4080  1980-00-00 00:00   tvmgen_default_fused_nn_softmax.npy
+---------                     -------
+     4080                     1 file
+
+```
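+
+The saved outputs can be loaded back with numpy for further inspection, e.g.
+with a small sketch like the below (the tensor name matches the output shown
+in the metadata dump above):
+
+```python
+import numpy as np
+
+out = np.load("output.npz")
+probs = out["tvmgen_default_fused_nn_softmax"]
+print(probs.shape, probs.argmax())  # (1, 1000) and the top-1 class index
+```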
+
+Building ```cpp_rtvm``` produces ```libtvm_runner.so```, a simplified interface that ```rtvm``` uses internally for loading and executing tvm compiled models from C/C++ environments.
+```tvm_runner.h``` describes the interface definition. Alternatively, pro users can use TVM's [c_native_api](https://github.com/apache/tvm/blob/main/include/tvm/runtime/c_runtime_api.h) interface for more access to TVM features.
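+
+For example, a minimal consumer of this interface could look like the sketch
+below (method names as per ```tvm_runner.h```; build and link details are
+left out):
+
+```cpp
+#include "tvm_runner.h"
+
+int main() {
+  // Load the compiled artifacts from ./keras-resnet50 and target the OpenCL device
+  tvm::runtime::TVMRunner runner("keras-resnet50", "opencl");
+  runner.Load();
+  runner.GetMetaInfo();            // populate input/output metadata first
+  runner.SetInput("input.npz");    // or SetInput(name, raw_buffer) for raw data
+  runner.Run();
+  runner.GetOutput("output.npz");  // dump all outputs into an npz file
+  return 0;
+}
+```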
+
+
+# RPC Setup
+
+For Android devices we require cross compiling tvm_rpc (and libtvm_runtime.so, which is a dependency) for the remote device.
+RPC setup involves running a tracker on the host device and running tvm_rpc on the target device.
+
+### Tracker
+
+The below command runs the tracker on the host over port ```9100```
+
+```bash
+python3 -m tvm.exec.rpc_tracker --host 127.0.0.1 --port 9100
+```
+### RPC on Target
+
+With ```abcd1234ef``` being the adb device id, and tvm_rpc (and libtvm_runtime.so) pushed to the target device at ```/data/local/tmp/tvm_rpc/```
+
+```bash
+export ANDROID_SERIAL=abcd1234ef
+# Below settings will reroute networking tcp connections on devices to host device via adb interface
+adb reverse tcp:9100 tcp:9100
+adb forward tcp:5000 tcp:5000
+# Run the tvm_rpc on device
+env adb shell "cd /data/local/tmp/tvm_rpc; killall -9 tvm_rpc; \
+LD_LIBRARY_PATH=/data/local/tmp/tvm_rpc/ ./tvm_rpc server --host=0.0.0.0 --port=5000 --port-end=5010 --tracker=127.0.0.1:9100 --key=android"
+```
+
+Now we have the rpc setup with ```TVM_TRACKER_HOST=127.0.0.1```, ```TVM_TRACKER_PORT=9100``` and ```TVM_RPC_KEY=android```.
+
+We can also check the connected and available devices on the tracker as shown below.
+
+```bash
+python3 -m tvm.exec.query_rpc_tracker --port ${TVM_TRACKER_PORT}
+Tracker address 127.0.0.1:9100
+
+Server List
+------------------------------
+server-address           key
+------------------------------
+       127.0.0.1:5000    server:android
+------------------------------
+
+Queue Status
+-------------------------------
+key       total  free  pending
+-------------------------------
+android   1      1     0
+-------------------------------
+```
+
+
+# Target Specific Configuration
+
+The below sections describe device/target specific settings to be used with the ```tvmc``` tool.
+
+### Adreno GPU
+
+Adreno GPU has a docker definition that eases setting up the development environment.
+
+We can build the docker image by using the below command from the TVM repo.
+
+```bash
+./docker/build.sh ci_adreno
+docker tag tvm.ci_adreno ci_adreno
+```
+
+The below command builds the host and target rpc components for Adreno and drops into an interactive shell.
+
+```bash
+./tests/scripts/ci.py adreno -i
+```
+
+Also, one can build with Adreno OpenCLML SDK support
+
+```bash
+export ADRENO_OPENCL=
+./tests/scripts/ci.py adreno -i
+```
+
+The above command produces
+```build-adreno``` which is the host build
+```build-adreno-target``` which contains the cross compiled tvm_rpc and libtvm_runtime.so
+
+
+The below options are to be used for Adreno GPU while working with tvmc
+
+* Tuning
+
+  ```
+  --target="opencl -device=adreno"
+  --target-host="llvm -mtriple=aarch64-linux-gnu"
+  ```
+
+* Compilation
+
+  ```
+  --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang
+  --target="opencl, llvm"
+  --target-opencl-device adreno
+  --target-llvm-mtriple aarch64-linux-gnu
+  ```
+
+  To enable CLML, just specify the below target option for compilation.
+  ```--target="opencl, clml, llvm"```
+
+
+* Running
+
+  ```--device="cl"```
+
+
+For example, with a model from keras ```./model_data/keras-resnet50/resnet50.h5```
+
+
+```bash
+# Tuning
+python3 -m tvm.driver.tvmc tune --desired-layout NCHW --target="opencl -device=adreno" --target-host="llvm -mtriple=aarch64-linux-gnu" \
+./model_data/keras-resnet50/resnet50.h5 -o ./model_data/keras-resnet50/keras-resnet50.log --early-stopping 0 --repeat 30 \
+--rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --trials 1024 --tuning-records ./model_data/keras-resnet50/keras-resnet50-records.log --tuner xgb

+# Tuning produces the tuning log ./model_data/keras-resnet50/keras-resnet50.log
+
+
+# Compilation
+python3 -m tvm.driver.tvmc compile --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
+--desired-layout NCHW --target="opencl, llvm" --target-opencl-device adreno --target-llvm-mtriple aarch64-linux-gnu \
+./model_data/keras-resnet50/resnet50.h5 -o keras-resnet50.tar
+
+# Compilation produces the target artifacts keras-resnet50.tar
+
+# Run on adreno device via RPC
+python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key ${TVM_RPC_KEY} --rpc-tracker ${TVM_TRACKER_HOST}:${TVM_TRACKER_PORT} --print-time
+
+```
diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc
new file mode 100644
index 000000000000..31019ee0c9cf
--- /dev/null
+++ b/apps/cpp_rtvm/main.cc
@@ -0,0 +1,264 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file main.cc
+ * \brief TVM runtime utility for native model deployment and run.
+ */
+#include
+#include
+#include
+#if defined(__linux__) || defined(__ANDROID__)
+#include
+#endif
+#include
+
+#include
+#include
+#include
+#include
+
+#include "../../src/support/socket.h"
+#include "../../src/support/utils.h"
+#include "tvm_runner.h"
+
+#if defined(_WIN32)
+#include "win32_process.h"
+#endif
+
+using namespace std;
+using namespace tvm::runtime;
+using namespace tvm::support;
+
+static const string kUsage =
+    "Command line usage\n"
+    "--model        - The folder containing tvm artifacts(mod.so, mod.param, mod.json) \n"
+    "--device       - The target device to use {llvm, opencl, cpu, cuda, metal, rocm, vpi, "
+    "oneapi}\n"
+    "--input        - Numpy file for the model input (optional and we use random if not given)\n"
+    "--output       - Numpy file name to dump the model output as numpy\n"
+    "--dump-meta    - Dump model meta information\n"
+    "\n"
+    "  Example\n"
+    "  ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
+    "  ./rtvm --model=keras-resnet50 --device=\"opencl\" --input input.npz --output=output.npz\n"
+    "\n";
+
+/*!
+ * \brief Tool Arguments.
+ * \arg model The tvm artifact to load & run
+ * \arg device The target device to use {llvm, cl, ...etc.}
+ * \arg input Numpy file for the model input
+ * \arg output Numpy file name to dump the model output as numpy
+ */
+struct ToolArgs {
+  string model;
+  string device;
+  string input;
+  string output;
+  bool dump_meta = false;
+};
+
+/*!
+ * \brief PrintArgs prints the contents of ToolArgs
+ * \param args ToolArgs structure
+ */
+void PrintArgs(const ToolArgs& args) {
+  LOG(INFO) << "Model         = " << args.model;
+  LOG(INFO) << "Device        = " << args.device;
+  LOG(INFO) << "Input         = " << args.input;
+  LOG(INFO) << "Output        = " << args.output;
+  LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
+}
+
+#if defined(__linux__) || defined(__ANDROID__)
+/*!
+ * \brief CtrlCHandler, exits if Ctrl+C is pressed
+ * \param s signal
+ */
+void CtrlCHandler(int s) {
+  LOG(INFO) << "\nUser pressed Ctrl+C, Exiting";
+  exit(1);
+}
+
+/*!
+ * \brief HandleCtrlC Register for handling Ctrl+C event.
+ */
+void HandleCtrlC() {
+  // Ctrl+C handler
+  struct sigaction sigIntHandler;
+  sigIntHandler.sa_handler = CtrlCHandler;
+  sigemptyset(&sigIntHandler.sa_mask);
+  sigIntHandler.sa_flags = 0;
+  sigaction(SIGINT, &sigIntHandler, nullptr);
+}
+#endif
+/*!
+ * \brief GetCmdOption Parse and find the command option.
+ * \param argc arg counter
+ * \param argv arg values
+ * \param option command line option to search for.
+ * \param key whether the option itself is the key
+ * \return value corresponding to option.
+ */
+string GetCmdOption(int argc, char* argv[], string option, bool key = false) {
+  string cmd;
+  for (int i = 1; i < argc; ++i) {
+    string arg = argv[i];
+    if (arg.find(option) == 0) {
+      if (key) {
+        cmd = argv[i];
+        return cmd;
+      }
+      // We assume "=" is the end of option.
+      ICHECK_EQ(*option.rbegin(), '=');
+      cmd = arg.substr(arg.find('=') + 1);
+      return cmd;
+    }
+  }
+  return cmd;
+}
+
+/*!
+ * \brief ParseCmdArgs parses the command line arguments.
+ * \param argc arg counter + * \param argv arg values + * \param args the output structure which holds the parsed values + */ +void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) { + const string model = GetCmdOption(argc, argv, "--model="); + if (!model.empty()) { + args.model = model; + } else { + LOG(INFO) << kUsage; + exit(0); + } + + const string device = GetCmdOption(argc, argv, "--device="); + if (!device.empty()) { + args.device = device; + } else { + LOG(INFO) << kUsage; + exit(0); + } + + const string input = GetCmdOption(argc, argv, "--input="); + if (!input.empty()) { + args.input = input; + } + + const string output = GetCmdOption(argc, argv, "--output="); + if (!output.empty()) { + args.output = output; + } + + const string pmeta = GetCmdOption(argc, argv, "--dump-meta", true); + if (!pmeta.empty()) { + args.dump_meta = true; + } +} + +/*! + * \brief Loads and Executes the model on given Target. + * \param args tool arguments + * \return result of operation. + */ +int ExecuteModel(ToolArgs& args) { +#if defined(__linux__) || defined(__ANDROID__) + // Ctrl+C handler + HandleCtrlC(); +#endif + + // Initialize TVM Runner + TVMRunner runner = TVMRunner(args.model, args.device); + + // Load the model + runner.Load(); + + // Query Model meta Information + TVMMetaInfo mInfo = runner.GetMetaInfo(); + + // Print Meta Information + if (args.dump_meta) runner.PrintMetaInfo(); + + if (args.input.empty() || args.output.empty()) { + LOG(INFO) << "Executing dry run ... "; + // Set random input for all inputs + for (auto& elem : mInfo.input_info) { + LOG(INFO) << "Set Random Input for :" << elem.first; + auto shape = elem.second.first; + size_t ssize = runner.GetInputMemSize(elem.first); + char* data = (char*)malloc(ssize); + LOG(INFO) << "Random Input Size:" << ssize << " bytes"; + runner.SetInput(elem.first, data); + free(data); + } + + // Run the model + runner.Run(); + + // Get Output and dump few values + for (auto& elem : mInfo.output_info) { + LOG(INFO) << "Get Output for :" << elem.first; + auto shape = elem.second.first; + size_t ssize = runner.GetOutputMemSize(elem.first); + char* data = (char*)malloc(ssize); + runner.GetOutput(elem.first, data); + LOG(INFO) << "Output Size:" << ssize << " bytes"; + free(data); + } + } else { + LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output; + + // Set Input from Numpy Input + runner.SetInput(args.input); + + // Run the model + runner.Run(); + + // Get Output as Numpy dump + runner.GetOutput(args.output); + } + + return 0; +} + +/*! + * \brief main The main function. + * \param argc arg counter + * \param argv arg values + * \return result of operation. + */ +int main(int argc, char* argv[]) { + if (argc <= 1) { + LOG(INFO) << kUsage; + return 0; + } + + ToolArgs args; + ParseCmdArgs(argc, argv, args); + PrintArgs(args); + + if (ExecuteModel(args)) { + PrintArgs(args); + LOG(INFO) << kUsage; + return -1; + } + return 0; +} diff --git a/apps/cpp_rtvm/scripts/download_models.py b/apps/cpp_rtvm/scripts/download_models.py new file mode 100644 index 000000000000..ef330cf765d8 --- /dev/null +++ b/apps/cpp_rtvm/scripts/download_models.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import os
+
+LOG = logging.getLogger(__name__)
+
+tmp_dir = "./model_data/"
+dload_models = []
+
+# Keras : Resnet50
+try:
+    from tensorflow.keras.applications.resnet50 import ResNet50
+
+    model_file_name = "{}/{}".format(tmp_dir + "keras-resnet50", "resnet50.h5")
+    # Make sure the destination folder exists before saving the model
+    os.makedirs(tmp_dir + "keras-resnet50", exist_ok=True)
+    model = ResNet50(include_top=True, weights="imagenet", input_shape=(224, 224, 3), classes=1000)
+    model.save(model_file_name)
+    dload_models.append(model_file_name)
+except ImportError:
+    LOG.warning("Keras is not installed, skipping Keras models")
+
+
+print("Models:", dload_models)
diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc
new file mode 100644
index 000000000000..74498e8170c4
--- /dev/null
+++ b/apps/cpp_rtvm/tvm_runner.cc
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm_runner.cc
+ * \brief TVM model runner implementation.
+ */
+
+#include "tvm_runner.h"
+
+#include
+
+#include
+#include
+#include
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Get the TVM device id corresponding to device string.
+ * \param device the target device in string format.
+ * \return dl_device corresponding to the device string.
+ */
+int GetTVMDevice(std::string device) {
+  if (!device.compare("cpu")) {
+    return static_cast<int>(kDLCPU);
+  } else if (!device.compare("llvm")) {
+    return static_cast<int>(kDLCPU);
+  } else if (!device.compare("cuda")) {
+    return static_cast<int>(kDLCUDA);
+  } else if (!device.compare("opencl")) {
+    return static_cast<int>(kDLOpenCL);
+  } else if (!device.compare("vulkan")) {
+    return static_cast<int>(kDLVulkan);
+  } else if (!device.compare("metal")) {
+    return static_cast<int>(kDLMetal);
+  } else if (!device.compare("vpi")) {
+    return static_cast<int>(kDLVPI);
+  } else if (!device.compare("rocm")) {
+    return static_cast<int>(kDLROCM);
+  } else if (!device.compare("oneapi")) {
+    return static_cast<int>(kDLOneAPI);
+  } else {
+    LOG(FATAL) << "TVMRunner : Unsupported device :" << device;
+  }
+}
+
+/*!
+ * \brief Constructor for TVMRunner.
+ * \param path where the tvm compiler artifacts are present.
+ * \param device the target device where we need to load the compiled model.
+ */
+TVMRunner::TVMRunner(std::string path, std::string device) : r_model_path(path), r_device(device) {
+  LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << r_device;
+}
+
+/*!
+ * \brief Load and set up the TVM graph runtime for the given model.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::Load(void) {
+  LOG(INFO) << "TVMRunner Load:" << r_model_path;
+  // Load the lib file
+  r_mod_handle = Module::LoadFromFile((r_model_path + "/mod.so").c_str(), "so");
+
+  // Read model json file
+  std::ifstream json_reader((r_model_path + "/mod.json").c_str());
+  CHECK(!json_reader.fail()) << "Failed to open json file:" << (r_model_path + "/mod.json").c_str();
+  std::string json_str((std::istreambuf_iterator<char>(json_reader)),
+                       std::istreambuf_iterator<char>());
+  json_reader.close();
+
+  // Get ref to the graph executor
+  auto f_handle = tvm::runtime::Registry::Get("tvm.graph_executor.create");
+
+  // Create the graph runtime
+  r_graph_handle = (*f_handle)(json_str, r_mod_handle, GetTVMDevice(r_device), 0);
+
+  // Read params binary file
+  std::ifstream params_reader((r_model_path + "/mod.params").c_str(), std::ios::binary);
+  CHECK(!params_reader.fail()) << "Failed to open params file:"
+                               << (r_model_path + "/mod.params").c_str();
+  const std::string params_str((std::istreambuf_iterator<char>(params_reader)),
+                               std::istreambuf_iterator<char>());
+  params_reader.close();
+  TVMByteArray params_arr;
+  params_arr.data = params_str.c_str();
+  params_arr.size = params_str.length();
+
+  // Load parameters
+  r_graph_handle.GetFunction("load_params")(params_arr);
+
+  return 0;
+}
+
+/*!
+ * \brief Calculate the memory size for the NDArray.
+ * \param narr The NDArray object.
+ * \return size of the memory.
+ */
+inline size_t GetMemSize(NDArray& narr) {
+  size_t size = 1;
+  for (tvm_index_t i = 0; i < narr->ndim; ++i) {
+    size *= static_cast<size_t>(narr->shape[i]);
+  }
+  size *= (narr->dtype.bits * narr->dtype.lanes + 7) / 8;
+  return size;
+}
+
+/*!
+ * \brief Get the input alloc mem size.
+ * \param input_id The input id to query the mem size.
+ * \return The memory size.
+ */
+size_t TVMRunner::GetInputMemSize(std::string input_id) {
+  LOG(INFO) << "TVMRunner::GetInputMemSize:" << input_id;
+
+  NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
+  auto ssize = GetMemSize(in_arr);
+
+  return ssize;
+}
+
+/*!
+ * \brief Get the output alloc mem size.
+ * \param output_id The output id to query the mem size.
+ * \return The memory size.
+ */
+size_t TVMRunner::GetOutputMemSize(std::string output_id) {
+  LOG(INFO) << "TVMRunner::GetOutputMemSize:" << output_id;
+
+  NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
+  auto ssize = GetMemSize(out_arr);
+
+  return ssize;
+}
+
+/*!
+ * \brief Set the model inputs from an npz file.
+ * \param inputfile the npz file from where we read input tensor data.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::SetInput(std::string inputfile) {
+  LOG(INFO) << "TVMRunner::SetInput (Numpy):" << inputfile;
+  cnpy::npz_t npz_input = cnpy::npz_load(inputfile);
+
+  for (auto& elem : mInfo.input_info) {
+    LOG(INFO) << "Set Numpy Input for :" << elem.first;
+    NDArray in_arr = r_graph_handle.GetFunction("get_input")(elem.first);
+    auto ssize = GetMemSize(in_arr);
+
+    if (npz_input.find(elem.first) != npz_input.end()) {
+      in_arr.CopyFromBytes(npz_input[elem.first].data(), ssize);
+    } else {
+      LOG(WARNING) << "Couldn't find input " << elem.first << " in npz input file";
+    }
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief Set the model input from the given binary buffer.
+ * \param input_id input node name.
+ * \param raw_input binary input buffer to copy over input NDArray.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::SetInput(std::string input_id, char* raw_input) {
+  LOG(INFO) << "TVMRunner::SetInput (Raw)";
+  NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
+  auto ssize = GetMemSize(in_arr);
+  in_arr.CopyFromBytes(raw_input, ssize);
+  return 0;
+}
+
+/*!
+ * \brief Get the model outputs and dump them to an npz file.
+ * \param outputfile the npz file to where we dump the output data.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::GetOutput(std::string outputfile) {
+  LOG(INFO) << "TVMRunner::GetOutput (Numpy):" << outputfile;
+
+  for (auto& elem : mInfo.output_info) {
+    LOG(INFO) << "Get Output for :" << elem.first;
+    NDArray out_arr = r_graph_handle.GetFunction("get_output")(elem.first);
+    auto ssize = GetMemSize(out_arr);
+    LOG(INFO) << "Output Size:" << ssize << " bytes";
+
+    // GetMemSize already returns the size in bytes, so allocate exactly that much
+    void* data = (void*)malloc(ssize);
+    out_arr.CopyToBytes(data, ssize);
+    std::vector<size_t> shape;
+
+    for (int j = 0; j < out_arr->ndim; ++j) shape.push_back(out_arr->shape[j]);
+    if (!elem.second.second.compare("float32")) {
+      cnpy::npz_save(outputfile, elem.first, (float*)data, shape, "a");
+    } else if (!elem.second.second.compare("int8")) {
+      cnpy::npz_save(outputfile, elem.first, (int8_t*)data, shape, "a");
+    } else {
+      LOG(WARNING) << "DType:" << elem.second.second << " is not supported for npy_save";
+    }
+    free(data);
+  }
+
+  return 0;
+}
+
+/*!
+ * \brief Get output of the model as a binary buffer.
+ * \param output_id output node name to read the data.
+ * \param raw_output the buffer to copy the data to.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::GetOutput(std::string output_id, char* raw_output) {
+  LOG(INFO) << "TVMRunner::GetOutput (Raw)";
+  NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
+  auto ssize = GetMemSize(out_arr);
+  out_arr.CopyToBytes(raw_output, ssize);
+  return 0;
+}
+
+/*!
+ * \brief Call one cycle of execution for the model.
+ * \return 0 on success else error code.
+ */
+int TVMRunner::Run(void) {
+  LOG(INFO) << "TVMRunner::Run";
+
+  r_graph_handle.GetFunction("run")();
+  return 0;
+}
+
+/*!
+ * \brief Query various metadata from the graph runtime.
+ * \return the meta information as a TVMMetaInfo structure.
+ */
+TVMMetaInfo TVMRunner::GetMetaInfo(void) {
+  LOG(INFO) << "TVMRunner::GetMetaInfo";
+
+  mInfo.n_inputs = r_graph_handle.GetFunction("get_num_inputs")();
+  mInfo.n_outputs = r_graph_handle.GetFunction("get_num_outputs")();
+
+  Map tvm_input_info = r_graph_handle.GetFunction("get_input_info")();
+  auto shape_info = GetRef>(tvm_input_info["shape"].as());
+  auto dtype_info = GetRef>(tvm_input_info["dtype"].as());
+  for (const auto& kv : shape_info) {
+    auto stuple = GetRef(kv.second.as());
+    std::vector vshape;
+    vshape.assign(stuple.begin(), stuple.end());
+    auto dtype = GetRef(dtype_info[kv.first].as());
+    std::pair, std::string> value = std::make_pair(vshape, dtype);
+    mInfo.input_info.insert({kv.first, value});
+  }
+
+  tvm_input_info = r_graph_handle.GetFunction("get_output_info")();
+  shape_info = GetRef>(tvm_input_info["shape"].as());
+  dtype_info = GetRef>(tvm_input_info["dtype"].as());
+  for (const auto& kv : shape_info) {
+    auto stuple = GetRef(kv.second.as());
+    std::vector vshape;
+    vshape.assign(stuple.begin(), stuple.end());
+    auto dtype = GetRef(dtype_info[kv.first].as());
+    std::pair, std::string> value = std::make_pair(vshape, dtype);
+    mInfo.output_info.insert({kv.first, value});
+  }
+
+  return mInfo;
+}
+
+/*!
+ * \brief Print the meta information.
+ */
+void TVMRunner::PrintMetaInfo(void) {
+  LOG(INFO) << "Meta Information:" << r_model_path;
+  LOG(INFO) << "    Number of Inputs:" << mInfo.n_inputs;
+  LOG(INFO) << "    Number of Outputs:" << mInfo.n_outputs;
+  LOG(INFO) << "    Input MetaInfo:";
+  for (auto& elem : mInfo.input_info) {
+    std::ostringstream stream;
+    stream << "[";
+    copy(elem.second.first.begin(), elem.second.first.end() - 1,
+         std::ostream_iterator(stream, ", "));
+    stream << elem.second.first.back() << "]";
+    LOG(INFO) << "        Input:" << elem.first;
+    LOG(INFO) << "            DType:" << elem.second.second;
+    LOG(INFO) << "            Shape:" << stream.str();
+  }
+  LOG(INFO) << "    Output MetaInfo:";
+  for (auto& elem : mInfo.output_info) {
+    std::ostringstream stream;
+    stream << "[";
+    copy(elem.second.first.begin(), elem.second.first.end() - 1,
+         std::ostream_iterator(stream, ", "));
+    stream << elem.second.first.back() << "]";
+    LOG(INFO) << "        Output:" << elem.first;
+    LOG(INFO) << "            DType:" << elem.second.second;
+    LOG(INFO) << "            Shape:" << stream.str();
+  }
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h
new file mode 100644
index 000000000000..37ba53606ee8
--- /dev/null
+++ b/apps/cpp_rtvm/tvm_runner.h
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm_runner.h
+ * \brief TVM model runner.
+ */
+#ifndef TVM_APPS_CPP_RTVM_RUNNER_H_
+#define TVM_APPS_CPP_RTVM_RUNNER_H_
+
+#include
+#include
+#include
+
+#include
+
+#include "tvm/runtime/c_runtime_api.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Various meta information related to the compiled TVM model.
+ */
+typedef struct {
+ public:
+  int n_inputs;
+  int n_outputs;
+  std::map, std::string>> input_info;
+  std::map, std::string>> output_info;
+} TVMMetaInfo;
+
+/*!
+ * \brief Encapsulates TVM graph runtime functionality with a simplified API interface.
+ */
+class TVMRunner {
+ public:
+  /*! \brief Constructor */
+  TVMRunner(std::string path, std::string device);
+
+  /*! \brief Initializes the graph runtime with the compiled model */
+  int Load(void);
+  /*! \brief Executes one inference cycle */
+  int Run(void);
+  /*! \brief To set the inputs from a given npz file */
+  int SetInput(std::string);
+  /*! \brief To set the input from binary data */
+  int SetInput(std::string, char*);
+  /*! \brief Save the model output into a given npz file */
+  int GetOutput(std::string);
+  /*! \brief Get the model output in binary format */
+  int GetOutput(std::string, char*);
+  /*! \brief To get the input mem size */
+  size_t GetInputMemSize(std::string);
+  /*! \brief To get the output mem size */
+  size_t GetOutputMemSize(std::string);
+  /*! \brief Populates various meta information from the graph runtime */
+  TVMMetaInfo GetMetaInfo(void);
+  /*! \brief Print function to show all meta information */
+  void PrintMetaInfo(void);
+
+ private:
+  /*! \brief Module handle for the shared object */
+  Module r_mod_handle;
+  /*! \brief Graph runtime module handle */
+  Module r_graph_handle;
+  /*! \brief The local model path from where we load the model */
+  std::string r_model_path;
+  /*! \brief The target device */
+  std::string r_device;
+  /*!
\brief Holds meta information queried from graph runtime */ + TVMMetaInfo mInfo; +}; + +} // namespace runtime +} // namespace tvm +#endif // TVM_APPS_CPP_RTVM_RUNNER_H_ diff --git a/cmake/config.cmake b/cmake/config.cmake index fba7fafe9316..19e8cbf5853f 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -110,6 +110,9 @@ set(USE_RPC ON) # Whether to build the C++ RPC server binary set(USE_CPP_RPC OFF) +# Whether to build the C++ native runtime tool binary +set(USE_CPP_RTVM OFF) + # Whether to build the iOS RPC server application set(USE_IOS_RPC OFF) diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index f73a294bd969..116c6e4fc72e 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -60,6 +60,7 @@ function(add_lib_info src_file) TVM_INFO_USE_CMSISNN="${USE_CMSISNN}" TVM_INFO_USE_COREML="${USE_COREML}" TVM_INFO_USE_CPP_RPC="${USE_CPP_RPC}" + TVM_INFO_USE_CPP_RTVM="${USE_CPP_RTVM}" TVM_INFO_USE_CUBLAS="${USE_CUBLAS}" TVM_INFO_USE_CUDA="${USE_CUDA}" TVM_INFO_USE_CUDNN="${USE_CUDNN}" diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py index a7087ea9239f..b5d04cdba7fa 100644 --- a/python/tvm/driver/tvmc/composite_target.py +++ b/python/tvm/driver/tvmc/composite_target.py @@ -28,6 +28,7 @@ from tvm.relay.op.contrib.ethosu import partition_for_ethosu from tvm.relay.op.contrib.bnns import partition_for_bnns from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai +from tvm.relay.op.contrib.clml import partition_for_clml from tvm.driver.tvmc import TVMCException @@ -71,6 +72,10 @@ "config_key": "relay.ext.vitis_ai.options", "pass_pipeline": partition_for_vitis_ai, }, + "clml": { + "config_key": None, + "pass_pipeline": partition_for_clml, + }, } diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index e6e535edc068..1ec9b298abe5 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -79,7 +79,7 @@ def transform_function( return RemoveDropout().visit(func) -def partition_for_clml(mod, params=None): +def partition_for_clml(mod, params=None, **opts): """Partition the graph greedily offloading supported operators to CLML Library. diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 5c8c678e8ce4..ca59c1f3b077 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -102,6 +102,14 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target int max_vthread_extent = 1; return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, max_threads_per_block, max_vthread_extent, warp_size); + } else if (target->GetAttr("device", "") == "adreno") { + int max_shared_memory_per_block = 32768; + int max_local_memory_per_block = 32768; + int max_threads_per_block = 256; + int warp_size = 1; + int max_vthread_extent = 1; + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else { // add other opencl target auto dev = Device{static_cast(device_type), 0}; diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index d805abfc658a..3c3d931df5d9 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -138,6 +138,28 @@ std::tuple GraphExecutor::Ge return std::make_tuple(shape_dict, dtype_dict); } +/*! 
+ * \brief Get the output info of Graph by parsing the output nodes. + * \return The shape and dtype tuple. + */ +std::tuple GraphExecutor::GetOutputInfo() + const { + GraphExecutor::ShapeInfo shape_dict; + GraphExecutor::DtypeInfo dtype_dict; + for (auto out : outputs_) { + uint32_t nid = out.node_id; + CHECK_LE(nid, nodes_.size()); + std::string name = nodes_[nid].name; + CHECK_LE(nid, attrs_.shape.size()); + auto shape = attrs_.shape[nid]; + shape_dict.Set(name, ShapeTuple(shape)); + CHECK_LE(nid, attrs_.dltype.size()); + auto dtype = attrs_.dltype[nid]; + dtype_dict.Set(name, String(dtype)); + } + return std::make_tuple(shape_dict, dtype_dict); +} + /*! * \brief Get the output index given the name of output. * \param name The name of the output. @@ -606,7 +628,19 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name, if (args.num_args == 2) { this->CopyOutputTo(args[0], args[1]); } else { - *rv = this->GetOutput(args[0]); + int out_idx = -1; + if (String::CanConvertFrom(args[0])) { + for (size_t i = 0; i < outputs_.size(); i++) { + std::string& name = nodes_[outputs_[i].node_id].name; + if (args[0].operator String() == name) { + out_idx = i; + } + } + CHECK(out_idx != -1) << "Invalid output node:" << args[0].operator String(); + } else { + out_idx = args[0]; + } + *rv = this->GetOutput(out_idx); } }); } else if (name == "get_input") { @@ -682,6 +716,14 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name, input_info.Set("dtype", dtype_info); *rv = input_info; }); + } else if (name == "get_output_info") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + auto [shape_info, dtype_info] = this->GetOutputInfo(); + Map input_info; + input_info.Set("shape", shape_info); + input_info.Set("dtype", dtype_info); + *rv = input_info; + }); } else { return PackedFunc(); } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index bbe94636b3a1..9fce154870cd 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -117,6 +117,12 @@ class TVM_DLL GraphExecutor : public ModuleNode { */ std::tuple GetInputInfo() const; + /*! + * \brief Get the output info of Graph by parsing the output nodes. + * \return The shape and dtype tuple. + */ + std::tuple GetOutputInfo() const; + /*! * \brief Get the output index given the name of output. * \param name The name of the output. 
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index 2c5a2e7a5a39..009c701d2a99 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -203,6 +203,10 @@
 #define TVM_INFO_USE_CPP_RPC "NOT-FOUND"
 #endif

+#ifndef TVM_INFO_USE_CPP_RTVM
+#define TVM_INFO_USE_CPP_RTVM "NOT-FOUND"
+#endif
+
 #ifndef TVM_INFO_USE_TFLITE
 #define TVM_INFO_USE_TFLITE "NOT-FOUND"
 #endif
@@ -273,6 +277,7 @@ TVM_DLL Map GetLibInfo() {
       {"USE_CMSISNN", TVM_INFO_USE_CMSISNN},
       {"USE_COREML", TVM_INFO_USE_COREML},
       {"USE_CPP_RPC", TVM_INFO_USE_CPP_RPC},
+      {"USE_CPP_RTVM", TVM_INFO_USE_CPP_RTVM},
       {"USE_CUBLAS", TVM_INFO_USE_CUBLAS},
       {"USE_CUDA", TVM_INFO_USE_CUDA},
       {"USE_CUDNN", TVM_INFO_USE_CUDNN},
diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh
index 187ca7f815df..f65794106ee3 100755
--- a/tests/scripts/task_build_adreno_bins.sh
+++ b/tests/scripts/task_build_adreno_bins.sh
@@ -37,6 +37,7 @@ echo set\(USE_OPENCL ON\) >> config.cmake
 fi
 echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_CPP_RPC ON\) >> config.cmake
+echo set\(USE_CPP_RTVM ON\) >> config.cmake
 echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake
 echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake
 echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake
@@ -56,4 +57,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.
 -DCMAKE_C_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" \
 -DMACHINE_NAME="aarch64-linux-gnu" ..

-make -j$(nproc) tvm_rpc
+make -j$(nproc) tvm_rpc rtvm

From 76a16c4704da9bdce9709b29a35b0b885f24590c Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Tue, 24 Jan 2023 06:54:29 -0800
Subject: [PATCH 217/286] [TVMScript] Introduce `PrinterConfig` (#13831)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR introduces `PrinterConfig`, a systematic way to configure the
TVMScript printer without having to set global flags. This PR enables more
customization of printer behavior.

More specifically, any TVM object in Python that inherits from `Scriptable`
now automatically gains two methods:
- `.script(tir_prefix=...)`
- `.show(...)`
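For illustration, a rough sketch of the resulting user-facing API (the
keyword argument shown is taken from this message; other `PrinterConfig`
fields can be passed the same way):

```python
# `func` can be any Scriptable object, e.g. a tvm.tir.PrimFunc or a tvm.IRModule
text = func.script(tir_prefix="T")  # render TVMScript to a str
func.show()                         # print the rendered script
```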
---
 include/tvm/ir/expr.h                              |  11 +-
 include/tvm/ir/module.h                            |  11 +-
 include/tvm/node/repr_printer.h                    |   1 +
 include/tvm/node/script_printer.h                  | 105 +++++++++
 include/tvm/script/printer/doc.h                   |   8 +
 include/tvm/script/printer/ir_docsifier.h          |   4 +-
 include/tvm/script/printer/printer.h               |  76 ------
 include/tvm/tir/function.h                         |  11 +-
 include/tvm/tir/stmt.h                             |  11 +-
 python/tvm/ir/expr.py                              |   4 +-
 python/tvm/ir/module.py                            |  80 +------
 python/tvm/ir/type.py                              |   3 +-
 python/tvm/runtime/__init__.py                     |   1 +
 python/tvm/runtime/script_printer.py               | 218 ++++++++++++++++++
 python/tvm/script/printer/__init__.py              |   1 -
 python/tvm/script/printer/default.py               |  83 -------
 python/tvm/script/printer/doc_printer.py           |  15 +-
 python/tvm/tir/buffer.py                           |  10 +-
 python/tvm/tir/expr.py                             |  83 +------
 python/tvm/tir/function.py                         |  79 +------
 python/tvm/tir/stmt.py                             |  83 +------
 src/node/script_printer.cc                         |  79 +++++++
 src/script/printer/doc_printer/base_doc_printer.cc |  19 +-
 .../printer/doc_printer/base_doc_printer.h         |  26 +--
 .../printer/doc_printer/python_doc_printer.cc      |  17 +-
 src/script/printer/ir/ir.cc                        |  31 ++-
 src/script/printer/ir/script_method.cc             |  34 ---
 src/script/printer/ir/utils.h                      |  19 +-
 src/script/printer/ir_docsifier.cc                 |   3 +-
 src/script/printer/printer.cc                      |  47 ----
 src/script/printer/tir/block.cc                    |  19 +-
 src/script/printer/tir/buffer.cc                   |   8 +-
 src/script/printer/tir/expr.cc                     |  49 ++--
 src/script/printer/tir/for_loop.cc                 |  12 +-
 src/script/printer/tir/function.cc                 |  10 +-
 src/script/printer/tir/ir.cc                       |  27 +--
 src/script/printer/tir/script_method.cc            |  59 -----
 src/script/printer/tir/stmt.cc                     |  20 +-
 src/script/printer/tir/utils.h                     |  17 +-
 src/script/printer/utils.h                         |  37 ++-
 .../unittest/test_tvmscript_printer_tir.py         |  20 +-
 41 files changed, 602 insertions(+), 849 deletions(-)
 create mode 100644 include/tvm/node/script_printer.h
 delete mode 100644 include/tvm/script/printer/printer.h
 create mode 100644 python/tvm/runtime/script_printer.py
 delete mode 100644 python/tvm/script/printer/default.py
 create mode 100644 src/node/script_printer.cc
 delete mode 100644 src/script/printer/ir/script_method.cc
 delete mode 100644 src/script/printer/printer.cc
 delete mode 100644 src/script/printer/tir/script_method.cc

diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index bfbaa7cddd4f..78c09e81b16f 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -100,16 +100,7 @@ class PrimExprNode : public BaseExprNode {
    */
   DataType dtype;

-  /*!
-   * \brief Returns the TVMScript format
-   * \param indent_spaces Number of spaces used for indentation
-   * \param print_line_numbers Whether to print line numbers
-   * \param num_context_lines Number of context lines to print around the underlined text
-   * \param path_to_underline Object path to be underlined
-   */
-  TVM_DLL std::string Script(int indent_spaces = 4, bool print_line_numbers = false,
-                             int num_context_lines = -1,
-                             Optional path_to_underline = NullOpt) const;
+  TVM_OBJECT_ENABLE_SCRIPT_PRINTER();

   static constexpr const char* _type_key = "PrimExpr";
   static constexpr const uint32_t _type_child_slots = 38;
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index 4cd357d4180b..0a5bac182fd9 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -328,16 +328,7 @@ class IRModuleNode : public Object {
    */
   TVM_DLL std::unordered_set Imports() const;

-  /*!
- * \brief Returns the TVMScript format - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - */ - TVM_DLL std::string Script(int indent_spaces = 4, bool print_line_numbers = false, - int num_context_lines = -1, - Optional path_to_underline = NullOpt) const; + TVM_OBJECT_ENABLE_SCRIPT_PRINTER(); static constexpr const char* _type_key = "IRModule"; static constexpr const bool _type_has_method_sequal_reduce = true; diff --git a/include/tvm/node/repr_printer.h b/include/tvm/node/repr_printer.h index e3f59fcc14a1..2a2d0bf3fb05 100644 --- a/include/tvm/node/repr_printer.h +++ b/include/tvm/node/repr_printer.h @@ -24,6 +24,7 @@ #define TVM_NODE_REPR_PRINTER_H_ #include +#include #include #include diff --git a/include/tvm/node/script_printer.h b/include/tvm/node/script_printer.h new file mode 100644 index 000000000000..af50aae71a43 --- /dev/null +++ b/include/tvm/node/script_printer.h @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file tvm/node/repr_printer.h + * \brief Printer class to print repr string of each AST/IR nodes. + */ +#ifndef TVM_NODE_SCRIPT_PRINTER_H_ +#define TVM_NODE_SCRIPT_PRINTER_H_ + +#include +#include +#include +#include + +#include +#include + +namespace tvm { + +class PrinterConfigNode : public Object { + public: + /*! \brief The prefix of IR nodes */ + std::string ir_prefix = "I"; + /*! \brief The prefix of TIR nodes */ + std::string tir_prefix = "T"; + /*! \brief The prefix of Relax nodes */ + std::string relax_prefix = "R"; + /*! \brief Default data type of TIR buffer */ + DataType buffer_dtype = DataType::Float(32); + /*! \brief Default data type of integer literals */ + DataType int_dtype = DataType::Int(32); + /*! + * \brief Default data type of float literals. Right now we always print out the explicit type + * of floating point values, so setting it to Void means we do not print without the + * T.float32/T.float64 wrapper. + */ + DataType float_dtype = DataType::Void(); + /*! \brief Whether or not to verbose print expressions. 
*/ + bool verbose_expr = false; + /* \brief Number of spaces used for indentation*/ + int indent_spaces = 4; + /* \brief Whether to print line numbers */ + bool print_line_numbers = false; + /* \brief Number of context lines to print around the underlined text */ + int num_context_lines = -1; + /* \brief Object path to be underlined */ + Optional path_to_underline = NullOpt; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("ir_prefix", &ir_prefix); + v->Visit("buffer_dtype", &buffer_dtype); + v->Visit("int_dtype", &int_dtype); + v->Visit("float_dtype", &float_dtype); + v->Visit("verbose_expr", &verbose_expr); + v->Visit("indent_spaces", &indent_spaces); + v->Visit("print_line_numbers", &print_line_numbers); + v->Visit("num_context_lines", &num_context_lines); + v->Visit("path_to_underline", &path_to_underline); + } + + static constexpr const char* _type_key = "node.PrinterConfig"; + TVM_DECLARE_FINAL_OBJECT_INFO(PrinterConfigNode, Object); +}; + +class PrinterConfig : public ObjectRef { + public: + explicit PrinterConfig(Map config_dict = Map()); + + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(PrinterConfig, runtime::ObjectRef, + PrinterConfigNode); +}; + +/*! \brief Legacy behavior of ReprPrinter. */ +class TVMScriptPrinter { + public: + /* Convert the object to TVMScript format */ + static std::string Script(const ObjectRef& node, const Optional& cfg); + // Allow registration to be printer. + using FType = NodeFunctor; + TVM_DLL static FType& vtable(); +}; + +#define TVM_OBJECT_ENABLE_SCRIPT_PRINTER() \ + std::string Script(const Optional& config = NullOpt) const { \ + return TVMScriptPrinter::Script(GetRef(this), config.value_or(PrinterConfig())); \ + } + +} // namespace tvm +#endif // TVM_NODE_SCRIPT_PRINTER_H_ diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h index 01f0fc1f4a91..6504e2c2843d 100644 --- a/include/tvm/script/printer/doc.h +++ b/include/tvm/script/printer/doc.h @@ -29,8 +29,16 @@ namespace tvm { namespace script { namespace printer { +// Forward declaration class Doc; +/*! + * \brief Convert Doc into Python script. + * \param doc Doc to be converted + * \param cfg The configuration of the printer + */ +String DocToPythonScript(Doc doc, const PrinterConfig& cfg); + /*! * \brief The base class of all Doc. * diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h index e0419b469505..67fa96ef8082 100644 --- a/include/tvm/script/printer/ir_docsifier.h +++ b/include/tvm/script/printer/ir_docsifier.h @@ -126,6 +126,8 @@ class IRDocsifierNode : public Object { /*! \brief The name of the variable */ Optional name; }; + /*! \brief The configuration of the printer */ + PrinterConfig cfg{nullptr}; /*! * \brief The stack of frames. * \sa FrameNode @@ -232,7 +234,7 @@ class IRDocsifier : public ObjectRef { public: using FType = IRDocsifierFunctor; /*! \brief Create a IRDocsifier. */ - IRDocsifier(); + explicit IRDocsifier(const PrinterConfig& cfg); /*! \brief The registration table for IRDocsifier. */ TVM_DLL static FType& vtable(); diff --git a/include/tvm/script/printer/printer.h b/include/tvm/script/printer/printer.h deleted file mode 100644 index b373a2be73fb..000000000000 --- a/include/tvm/script/printer/printer.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_SCRIPT_PRINTER_PRINTER_H_ -#define TVM_SCRIPT_PRINTER_PRINTER_H_ - -#include -#include - -#include -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -/*! \brief Default values in the TVMScript printer */ -struct Default { - /*! \brief The prefix of IR nodes */ - std::unordered_map ir_prefix = {{"ir", "I"}, {"tir", "T"}}; - /*! \brief Default data type of TIR buffer */ - DataType buffer_dtype = DataType::Float(32); - /*! \brief Default data type of integer literals */ - DataType int_dtype = DataType::Int(32); - /*! - * \brief Default data type of float literals. Right now we always print out the explicit type - * of floating point values, so setting it to Void means we do not print without the - * T.float32/T.float64 wrapper. - */ - DataType float_dtype = DataType::Void(); - /*! \brief Whether or not to verbose print expressions. */ - bool verbose_expr = false; - /*! \brief Returns a singleton of the configuration */ - static Default* Instance(); - static std::string& Prefix(const std::string& ir) { return Instance()->ir_prefix.at(ir); } - static DataType& BufferDType() { return Instance()->buffer_dtype; } - static DataType& IntDType() { return Instance()->int_dtype; } - static DataType& FloatDType() { return Instance()->float_dtype; } - static bool& VerboseExpr() { return Instance()->verbose_expr; } -}; - -/*! - * \brief Convert Doc into Python script. - * \param doc Doc to be converted - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - */ -String DocToPythonScript(Doc doc, // - int indent_spaces = 4, // - bool print_line_numbers = false, // - int num_context_lines = -1, // - Optional path_to_underline = NullOpt); - -} // namespace printer -} // namespace script -} // namespace tvm - -#endif // TVM_SCRIPT_PRINTER_PRINTER_H_ diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 17e7de930260..e135c261990b 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -132,16 +132,7 @@ class PrimFuncNode : public BaseFuncNode { */ TVM_DLL FuncType func_type_annotation() const; - /*! 
- * \brief Returns the TVMScript format - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - */ - std::string Script(int indent_spaces = 4, bool print_line_numbers = false, - int num_context_lines = -1, - Optional path_to_underline = NullOpt) const; + TVM_OBJECT_ENABLE_SCRIPT_PRINTER(); static constexpr const char* _type_key = "tir.PrimFunc"; TVM_DECLARE_FINAL_OBJECT_INFO(PrimFuncNode, BaseFuncNode); diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index e0b7bcc868b3..7a7ad2acedd7 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -46,16 +46,7 @@ class StmtNode : public Object { StmtNode() = default; explicit StmtNode(Span span) : span(span) {} - /*! - * \brief Returns the TVMScript format - * \param indent_spaces Number of spaces used for indentation - * \param print_line_numbers Whether to print line numbers - * \param num_context_lines Number of context lines to print around the underlined text - * \param path_to_underline Object path to be underlined - */ - std::string Script(int indent_spaces = 4, bool print_line_numbers = false, - int num_context_lines = -1, - Optional path_to_underline = NullOpt) const; + TVM_OBJECT_ENABLE_SCRIPT_PRINTER(); static constexpr const char* _type_key = "tir.Stmt"; static constexpr const bool _type_has_method_sequal_reduce = true; diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py index 52af8407b7a0..3c3fefb6d6c6 100644 --- a/python/tvm/ir/expr.py +++ b/python/tvm/ir/expr.py @@ -17,7 +17,7 @@ """Common expressions data structures in the IR.""" import tvm._ffi -from ..runtime import const, convert +from ..runtime import Scriptable, const, convert from . import _ffi_api from .base import Node @@ -121,7 +121,7 @@ def astext(self, show_meta_data=True, annotate=None): @tvm._ffi.register_object -class Range(Node): +class Range(Node, Scriptable): """Represent a range in TVM. You do not need to create a Range explicitly. diff --git a/python/tvm/ir/module.py b/python/tvm/ir/module.py index 51410049ec74..3daffb2640c5 100644 --- a/python/tvm/ir/module.py +++ b/python/tvm/ir/module.py @@ -15,10 +15,9 @@ # specific language governing permissions and limitations # under the License. """IRModule that holds the functions and type definitions.""" -from typing import Optional - import tvm._ffi from tvm._ffi.base import string_types +from tvm.runtime import Scriptable from . import _ffi_api from . import expr as _expr @@ -27,7 +26,7 @@ @tvm._ffi.register_object("IRModule") -class IRModule(Node): +class IRModule(Node, Scriptable): """IRModule that holds functions and type definitions. IRModule is the basic unit for all IR transformations across the stack. 
@@ -314,78 +313,3 @@ def astext(self, show_meta_data=True, annotate=None): from tvm.relay import astext # pylint: disable=import-outside-toplevel return astext(self, show_meta_data, annotate) - - def script( - self, - *, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> str: - """Print IRModule into TVMScript - - Parameters - ---------- - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - - Returns - ------- - script : str - The TVM Script of the IRModule - """ - if num_context_lines is None: - num_context_lines = -1 - return _ffi_api.Module_Script( # type: ignore # pylint: disable=no-member - self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline - ) - - def show( - self, - *, - style: Optional[str] = None, - black_format: bool = True, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> None: - """A sugar for print highlighted TVM script. - - Parameters - ---------- - style : str, optional - Pygmentize printing style, auto-detected if None. See - `tvm.script.highlight.cprint` for more details. - black_format: bool - If true (default), use the formatter Black to format the TVMScript - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - """ - from tvm.script.highlight import ( # pylint: disable=import-outside-toplevel - cprint, - ) - - cprint( - self.script( - indent_spaces=indent_spaces, - print_line_numbers=print_line_numbers, - num_context_lines=num_context_lines, - path_to_underline=path_to_underline, - ), - style=style, - black_format=black_format, - ) diff --git a/python/tvm/ir/type.py b/python/tvm/ir/type.py index ea06aeda2030..c83cef3f6cea 100644 --- a/python/tvm/ir/type.py +++ b/python/tvm/ir/type.py @@ -19,12 +19,13 @@ import tvm import tvm._ffi +from tvm.runtime import Scriptable from . import _ffi_api from .base import Node -class Type(Node): +class Type(Node, Scriptable): """The base class of all types.""" def __eq__(self, other): diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py index 502de7372154..71f71e6c8427 100644 --- a/python/tvm/runtime/__init__.py +++ b/python/tvm/runtime/__init__.py @@ -20,6 +20,7 @@ from .packed_func import PackedFunc from .object import Object from .object_path import ObjectPath, ObjectPathPair +from .script_printer import Scriptable from .object_generic import ObjectGeneric, ObjectTypes from .ndarray import NDArray, DataType, DataTypeCode, Device from .module import Module, num_threads diff --git a/python/tvm/runtime/script_printer.py b/python/tvm/runtime/script_printer.py new file mode 100644 index 000000000000..23144c47f1ee --- /dev/null +++ b/python/tvm/runtime/script_printer.py @@ -0,0 +1,218 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Configuration of TVMScript printer"""
+from typing import Optional
+
+from tvm._ffi import register_object
+from tvm.runtime import Object
+
+from . import _ffi_node_api
+from .object_path import ObjectPath
+
+
+@register_object("node.PrinterConfig")
+class PrinterConfig(Object):
+    """Configuration of TVMScript printer"""
+
+    ir_prefix: str
+    tir_prefix: str
+    relax_prefix: str
+    buffer_dtype: str
+    int_dtype: str
+    float_dtype: str
+    verbose_expr: bool
+    indent_spaces: int
+    print_line_numbers: bool
+    num_context_lines: int
+    path_to_underline: Optional[ObjectPath]
+
+    def __init__(
+        self,
+        *,
+        ir_prefix: str = "I",
+        tir_prefix: str = "T",
+        relax_prefix: str = "R",
+        buffer_dtype: str = "float32",
+        int_dtype: str = "int32",
+        float_dtype: str = "void",
+        verbose_expr: bool = False,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: Optional[int] = None,
+        path_to_underline: Optional[ObjectPath] = None,
+    ) -> None:
+        if num_context_lines is None:
+            num_context_lines = -1
+        self.__init_handle_by_constructor__(
+            _ffi_node_api.PrinterConfig,  # type: ignore  # pylint: disable=no-member
+            {
+                "ir_prefix": ir_prefix,
+                "tir_prefix": tir_prefix,
+                "relax_prefix": relax_prefix,
+                "buffer_dtype": buffer_dtype,
+                "int_dtype": int_dtype,
+                "float_dtype": float_dtype,
+                "verbose_expr": verbose_expr,
+                "indent_spaces": indent_spaces,
+                "print_line_numbers": print_line_numbers,
+                "num_context_lines": num_context_lines,
+                "path_to_underline": path_to_underline,
+            },
+        )
+
+
+def _script(obj: Object, config: PrinterConfig) -> str:
+    return _ffi_node_api.TVMScriptPrinterScript(obj, config)  # type: ignore  # pylint: disable=no-member
+
+
+class Scriptable:
+    """A base class that enables the script() and show() methods."""
+
+    def script(
+        self,
+        *,
+        ir_prefix: str = "I",
+        tir_prefix: str = "T",
+        relax_prefix: str = "R",
+        buffer_dtype: str = "float32",
+        int_dtype: str = "int32",
+        float_dtype: str = "void",
+        verbose_expr: bool = False,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: int = -1,
+        path_to_underline: Optional[ObjectPath] = None,
+    ) -> str:
+        """Print TVM IR into TVMScript text format
+
+        Parameters
+        ----------
+        ir_prefix : str = "I"
+            The prefix of AST nodes from tvm.ir
+        tir_prefix : str = "T"
+            The prefix of AST nodes from tvm.tir
+        relax_prefix : str = "R"
+            The prefix of AST nodes from tvm.relax
+        buffer_dtype : str = "float32"
+            The default data type of buffer
+        int_dtype : str = "int32"
+            The default data type of integer
+        float_dtype : str = "void"
+            The default data type of float
+        verbose_expr : bool = False
+            Whether to print the detailed definition of each variable in the expression
+        indent_spaces : int = 4
+            The number of spaces for indentation
+        print_line_numbers : bool = False
+            Whether to print line numbers
+        num_context_lines : int = -1
+            The number of lines of context to print before and after the line to underline.
+        path_to_underline : Optional[ObjectPath] = None
+            Object path to be underlined
+
+        Returns
+        -------
+        script : str
+            The TVM Script of the given TVM IR
+        """
+        return _script(
+            self,
+            PrinterConfig(
+                ir_prefix=ir_prefix,
+                tir_prefix=tir_prefix,
+                relax_prefix=relax_prefix,
+                buffer_dtype=buffer_dtype,
+                int_dtype=int_dtype,
+                float_dtype=float_dtype,
+                verbose_expr=verbose_expr,
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+        )
+
+    def show(
+        self,
+        style: Optional[str] = None,
+        black_format: bool = True,
+        *,
+        ir_prefix: str = "I",
+        tir_prefix: str = "T",
+        relax_prefix: str = "R",
+        buffer_dtype: str = "float32",
+        int_dtype: str = "int32",
+        float_dtype: str = "void",
+        verbose_expr: bool = False,
+        indent_spaces: int = 4,
+        print_line_numbers: bool = False,
+        num_context_lines: int = -1,
+        path_to_underline: Optional[ObjectPath] = None,
+    ) -> None:
+        """A sugar for printing highlighted TVM script.
+
+        Parameters
+        ----------
+        style : str, optional
+            Pygmentize printing style, auto-detected if None. See
+            `tvm.script.highlight.cprint` for more details.
+        black_format: bool
+            If true (default), use the formatter Black to format the TVMScript
+        ir_prefix : str = "I"
+            The prefix of AST nodes from tvm.ir
+        tir_prefix : str = "T"
+            The prefix of AST nodes from tvm.tir
+        relax_prefix : str = "R"
+            The prefix of AST nodes from tvm.relax
+        buffer_dtype : str = "float32"
+            The default data type of buffer
+        int_dtype : str = "int32"
+            The default data type of integer
+        float_dtype : str = "void"
+            The default data type of float
+        verbose_expr : bool = False
+            Whether to print the detailed definition of each variable in the expression
+        indent_spaces : int = 4
+            The number of spaces for indentation
+        print_line_numbers : bool = False
+            Whether to print line numbers
+        num_context_lines : int = -1
+            The number of lines of context to print before and after the line to underline.
+        path_to_underline : Optional[ObjectPath] = None
+            Object path to be underlined
+        """
+        from tvm.script.highlight import (  # pylint: disable=import-outside-toplevel
+            cprint,
+        )
+
+        cprint(
+            self.script(
+                ir_prefix=ir_prefix,
+                tir_prefix=tir_prefix,
+                relax_prefix=relax_prefix,
+                buffer_dtype=buffer_dtype,
+                int_dtype=int_dtype,
+                float_dtype=float_dtype,
+                verbose_expr=verbose_expr,
+                indent_spaces=indent_spaces,
+                print_line_numbers=print_line_numbers,
+                num_context_lines=num_context_lines,
+                path_to_underline=path_to_underline,
+            ),
+            style=style,
+            black_format=black_format,
+        )
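
To make the mixin concrete, a short usage sketch (assuming a build with this patch applied;
the te workload is illustrative only):

    import tvm
    from tvm import te

    # Build a trivial PrimFunc and wrap it in an IRModule. Both classes mix in
    # Scriptable below, so the same keyword surface works on either object.
    A = te.placeholder((8,), name="A")
    B = te.compute((8,), lambda i: A[i] + 1.0, name="B")
    mod = tvm.IRModule({"main": te.create_prim_func([A, B])})

    print(mod.script(indent_spaces=2, print_line_numbers=True))
    mod.show(style="ansi")  # highlighted variant; needs Pygments (and Black by default)
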
diff --git a/python/tvm/script/printer/__init__.py b/python/tvm/script/printer/__init__.py
index 01d89dacbf52..8d2f73bb2b8d 100644
--- a/python/tvm/script/printer/__init__.py
+++ b/python/tvm/script/printer/__init__.py
@@ -19,4 +19,3 @@
 This package provides a set of APIs to print supported TVM IR into TVMScript
 in a roundtrippable way.
 """
-from . import default
diff --git a/python/tvm/script/printer/default.py b/python/tvm/script/printer/default.py
deleted file mode 100644
index 33ca693ebf32..000000000000
--- a/python/tvm/script/printer/default.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""The printer configuration""" -from typing_extensions import Literal - -from . import _ffi_api - - -def ir_prefix( # pylint: disable=invalid-name - ir: Literal["ir", "tir"], - prefix: str, -) -> None: - """Set the prefix for the IR. If not set, the prefix for "tvm.ir" is "I", and for "tir" is "T. - - Parameters - ---------- - ir : str - The IR type, either "ir" or "tir". - - prefix : str - The prefix to use. - """ - _ffi_api.DefaultIRPrefix(ir, prefix) # type: ignore # pylint: disable=no-member - - -def buffer_dtype(dtype: str) -> None: - """Set the default dtype for buffer. If not set, it is "float32". - - Parameters - ---------- - dtype : str - The default dtype for buffer. - """ - _ffi_api.DefaultBufferDtype(dtype) # type: ignore # pylint: disable=no-member - - -def int_dtype(dtype: str) -> None: - """Set the default dtype for integers. If not set, it is "int32". - - Parameters - ---------- - dtype : str - The default dtype for buffer. - """ - _ffi_api.DefaultBufferDtype(dtype) # type: ignore # pylint: disable=no-member - - -def float_dtype(dtype: str) -> None: - """Set the default dtype for buffer. If not set, there is no default, - which means every floating point numbers will be wrapped with its precise dtype. - - Parameters - ---------- - dtype : str - The default dtype for buffer. - """ - _ffi_api.DefaultFloatDtype(dtype) # type: ignore # pylint: disable=no-member - - -def verbose_expr(verbose: bool) -> None: - """Whether or not to verbose print expressions. If not, the definition of every variable in an - expression will be printed as separate statements. Otherwise, the result will be a one-liner. - - Parameters - ---------- - dtype : str - The default dtype for buffer. - """ - _ffi_api.VerboseExpr(verbose) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/script/printer/doc_printer.py b/python/tvm/script/printer/doc_printer.py index 1791f46b00a2..137b71a77d9f 100644 --- a/python/tvm/script/printer/doc_printer.py +++ b/python/tvm/script/printer/doc_printer.py @@ -17,7 +17,10 @@ """Functions to print doc into text format""" from typing import Optional -from tvm.runtime.object_path import ObjectPath + +from tvm.runtime import ObjectPath +from tvm.runtime.script_printer import PrinterConfig + from . 
import _ffi_api from .doc import Doc @@ -49,8 +52,10 @@ def to_python_script( script : str The text representation of Doc in Python syntax """ - if num_context_lines is None: - num_context_lines = -1 - return _ffi_api.DocToPythonScript( # type: ignore - doc, indent_spaces, print_line_numbers, num_context_lines, path_to_underline + cfg = PrinterConfig( + indent_spaces=indent_spaces, + print_line_numbers=print_line_numbers, + num_context_lines=num_context_lines, + path_to_underline=path_to_underline, ) + return _ffi_api.DocToPythonScript(doc, cfg) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index c2c158c77f78..11db28e20a1c 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -20,13 +20,13 @@ import tvm._ffi from tvm._ffi.base import string_types from tvm.ir import PointerType, PrimExpr, PrimType, Range -from tvm.runtime import Object, convert +from tvm.runtime import Object, Scriptable, convert from . import _ffi_api @tvm._ffi.register_object("tir.Buffer") -class Buffer(Object): +class Buffer(Object, Scriptable): """Symbolic data buffer in TVM. Buffer provide a way to represent data layout @@ -179,7 +179,11 @@ def offset_of(self, indices): def __getitem__(self, indices): from ..arith import Analyzer # pylint: disable=import-outside-toplevel - from .expr import BufferLoad, Ramp, const # pylint: disable=import-outside-toplevel + from .expr import ( # pylint: disable=import-outside-toplevel + BufferLoad, + Ramp, + const, + ) from .stmt import BufferRegion # pylint: disable=import-outside-toplevel if not isinstance(indices, (tuple, list)): diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index dab7a175185d..cb4a892ac289 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -34,7 +34,7 @@ from tvm import ir from tvm.ir import Op, PrimExpr from tvm.ir.base import Span -from tvm.runtime import DataType, DataTypeCode, Object, ObjectGeneric, const +from tvm.runtime import DataType, DataTypeCode, Object, ObjectGeneric, Scriptable, const from . import _ffi_api from . 
import generic as _generic @@ -318,88 +318,13 @@ def asobject(self): return IntImm("int32", self.value, self.span) # type: ignore -class PrimExprWithOp(ExprOp, PrimExpr): +class PrimExprWithOp(ExprOp, PrimExpr, Scriptable): """Helper base class to inherit from PrimExpr.""" # In Python3, We have to explicitly tell interpreter to retain __hash__ if we overide __eq__ # https://docs.python.org/3.1/reference/datamodel.html#object.__hash__ __hash__ = PrimExpr.__hash__ - def script( - self, - *, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> str: - """Print IRModule into TVMScript - - Parameters - ---------- - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - - Returns - ------- - script : str - The TVM Script of the IRModule - """ - if num_context_lines is None: - num_context_lines = -1 - return _ffi_api.PrimExprScript( # type: ignore # pylint: disable=no-member - self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline - ) - - def show( - self, - *, - style: Optional[str] = None, - black_format: bool = True, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> None: - """A sugar for print highlighted TVM script. - - Parameters - ---------- - style : str, optional - Pygmentize printing style, auto-detected if None. See - `tvm.script.highlight.cprint` for more details. - black_format: bool - If true (default), use the formatter Black to format the TVMScript - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - """ - from tvm.script.highlight import ( # pylint: disable=import-outside-toplevel - cprint, - ) - - cprint( - self.script( - indent_spaces=indent_spaces, - print_line_numbers=print_line_numbers, - num_context_lines=num_context_lines, - path_to_underline=path_to_underline, - ), - style=style, - black_format=black_format, - ) - class ConstExpr(PrimExprWithOp): pass @@ -460,7 +385,7 @@ def __init__(self, name, dtype, span=None): @tvm._ffi.register_object("tir.IterVar") -class IterVar(Object, ExprOp): +class IterVar(Object, ExprOp, Scriptable): """Represent iteration variable. IterVar represents axis iterations in the computation. @@ -521,7 +446,7 @@ def __init__(self, dom, var, iter_type, thread_tag="", span=None): @tvm._ffi.register_object("tir.CommReducer") -class CommReducer(Object): +class CommReducer(Object, Scriptable): """Commutative reduce operator Parameters diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index fb5a37c5dc17..f854e56ad11a 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -24,7 +24,7 @@ import tvm._ffi import tvm.runtime from tvm.ir import BaseFunc, Range -from tvm.runtime import Object +from tvm.runtime import Object, Scriptable from ..runtime.ndarray import NDArray from . 
import _ffi_api @@ -33,7 +33,7 @@ @tvm._ffi.register_object("tir.PrimFunc") -class PrimFunc(BaseFunc): +class PrimFunc(BaseFunc, Scriptable): """A function declaration expression. Parameters @@ -170,81 +170,6 @@ def mem_copy_16_16(a: T.handle, b: T.handle) -> None: """ return _ffi_api.Specialize(self, param_map) # type: ignore - def script( - self, - *, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> str: - """Print IRModule into TVMScript - - Parameters - ---------- - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - - Returns - ------- - script : str - The TVM Script of the IRModule - """ - if num_context_lines is None: - num_context_lines = -1 - return _ffi_api.PrimFuncScript( # type: ignore # pylint: disable=no-member - self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline - ) - - def show( - self, - *, - style: Optional[str] = None, - black_format: bool = True, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> None: - """A sugar for print highlighted TVM script. - - Parameters - ---------- - style : str, optional - Pygmentize printing style, auto-detected if None. See - `tvm.script.highlight.cprint` for more details. - black_format: bool - If true (default), use the formatter Black to format the TVMScript - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - """ - from tvm.script.highlight import ( # pylint: disable=import-outside-toplevel - cprint, - ) - - cprint( - self.script( - indent_spaces=indent_spaces, - print_line_numbers=print_line_numbers, - num_context_lines=num_context_lines, - path_to_underline=path_to_underline, - ), - style=style, - black_format=black_format, - ) - @tvm._ffi.register_object("tir.TensorIntrin") class TensorIntrin(Object): diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 096c13653a94..d6cd06a1d915 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -31,91 +31,16 @@ import tvm._ffi from tvm.ir import PrimExpr, Range, Span -from tvm.runtime import Object, const +from tvm.runtime import Object, Scriptable, const from . 
import _ffi_api from .buffer import Buffer from .expr import IterVar -class Stmt(Object): +class Stmt(Object, Scriptable): """Base class of all the statements.""" - def script( - self, - *, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> str: - """Print IRModule into TVMScript - - Parameters - ---------- - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - - Returns - ------- - script : str - The TVM Script of the IRModule - """ - if num_context_lines is None: - num_context_lines = -1 - return _ffi_api.StmtScript( # type: ignore # pylint: disable=no-member - self, indent_spaces, print_line_numbers, num_context_lines, path_to_underline - ) - - def show( - self, - *, - style: Optional[str] = None, - black_format: bool = True, - indent_spaces: int = 4, - print_line_numbers: bool = False, - num_context_lines: Optional[int] = None, - path_to_underline=None, - ) -> None: - """A sugar for print highlighted TVM script. - - Parameters - ---------- - style : str, optional - Pygmentize printing style, auto-detected if None. See - `tvm.script.highlight.cprint` for more details. - black_format: bool - If true (default), use the formatter Black to format the TVMScript - indent_spaces : int - The number of indent spaces to use in the output - print_line_numbers: bool - Whether to print line numbers - num_context_lines : Optional[int] - Number of context lines to print around the underlined text - path_to_underline : Optional[ObjectPath] - Object path to be underlined - """ - from tvm.script.highlight import ( # pylint: disable=import-outside-toplevel - cprint, - ) - - cprint( - self.script( - indent_spaces=indent_spaces, - print_line_numbers=print_line_numbers, - num_context_lines=num_context_lines, - path_to_underline=path_to_underline, - ), - style=style, - black_format=black_format, - ) - @tvm._ffi.register_object("tir.LetStmt") class LetStmt(Stmt): @@ -623,7 +548,7 @@ def __init__(self, buffer, bounds, span=None): @tvm._ffi.register_object("tir.BufferRegion") -class BufferRegion(Object): +class BufferRegion(Object, Scriptable): """BufferRegion node. Parameters @@ -643,7 +568,7 @@ def __init__(self, buffer: Buffer, region: List[Range]): @tvm._ffi.register_object("tir.MatchBufferRegion") -class MatchBufferRegion(Object): +class MatchBufferRegion(Object, Scriptable): """MatchBufferRegion node. Parameters diff --git a/src/node/script_printer.cc b/src/node/script_printer.cc new file mode 100644 index 000000000000..605d5208462f --- /dev/null +++ b/src/node/script_printer.cc @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include + +namespace tvm { + +TVMScriptPrinter::FType& TVMScriptPrinter::vtable() { + static FType inst; + return inst; +} + +std::string TVMScriptPrinter::Script(const ObjectRef& node, const Optional& cfg) { + return TVMScriptPrinter::vtable()(node, cfg.value_or(PrinterConfig())); +} + +PrinterConfig::PrinterConfig(Map config_dict) { + runtime::ObjectPtr n = make_object(); + if (auto v = config_dict.Get("ir_prefix")) { + n->ir_prefix = Downcast(v); + } + if (auto v = config_dict.Get("tir_prefix")) { + n->tir_prefix = Downcast(v); + } + if (auto v = config_dict.Get("relax_prefix")) { + n->relax_prefix = Downcast(v); + } + if (auto v = config_dict.Get("buffer_dtype")) { + n->buffer_dtype = DataType(runtime::String2DLDataType(Downcast(v))); + } + if (auto v = config_dict.Get("int_dtype")) { + n->int_dtype = DataType(runtime::String2DLDataType(Downcast(v))); + } + if (auto v = config_dict.Get("float_dtype")) { + n->float_dtype = DataType(runtime::String2DLDataType(Downcast(v))); + } + if (auto v = config_dict.Get("verbose_expr")) { + n->verbose_expr = Downcast(v)->value; + } + if (auto v = config_dict.Get("indent_spaces")) { + n->indent_spaces = Downcast(v)->value; + } + if (auto v = config_dict.Get("print_line_numbers")) { + n->print_line_numbers = Downcast(v)->value; + } + if (auto v = config_dict.Get("num_context_lines")) { + n->num_context_lines = Downcast(v)->value; + } + if (auto v = config_dict.Get("path_to_underline")) { + n->path_to_underline = Downcast(v); + } + this->data_ = std::move(n); +} + +TVM_REGISTER_NODE_TYPE(PrinterConfigNode); +TVM_REGISTER_GLOBAL("node.PrinterConfig").set_body_typed([](Map config_dict) { + return PrinterConfig(config_dict); +}); +TVM_REGISTER_GLOBAL("node.TVMScriptPrinterScript").set_body_typed(TVMScriptPrinter::Script); + +} // namespace tvm diff --git a/src/script/printer/doc_printer/base_doc_printer.cc b/src/script/printer/doc_printer/base_doc_printer.cc index 38b8ef897740..a3a5c06ede0d 100644 --- a/src/script/printer/doc_printer/base_doc_printer.cc +++ b/src/script/printer/doc_printer/base_doc_printer.cc @@ -77,13 +77,13 @@ ByteSpan PopNextUnderline(UnderlineIter* next_underline, UnderlineIter end_under void PrintChunk(const std::pair& lines_range, const std::pair& underlines, const std::string& text, - const std::vector& line_starts, const DocPrinterOptions& options, + const std::vector& line_starts, const PrinterConfig& options, size_t line_number_width, std::string* out) { UnderlineIter next_underline = underlines.first; ByteSpan current_underline = PopNextUnderline(&next_underline, underlines.second); for (size_t line_idx = lines_range.first; line_idx < lines_range.second; ++line_idx) { - if (options.print_line_numbers) { + if (options->print_line_numbers) { std::string line_num_str = std::to_string(line_idx + 1); line_num_str.push_back(' '); for (size_t i = line_num_str.size(); i < line_number_width; ++i) { @@ -148,12 +148,12 @@ void PrintCut(size_t num_lines_skipped, std::string* out) { std::pair GetLinesForUnderline(const ByteSpan& underline, const std::vector& line_starts, - size_t 
num_lines, const DocPrinterOptions& options) { + size_t num_lines, const PrinterConfig& options) { size_t first_line_of_underline = GetLineIndex(underline.first, line_starts); - size_t first_line_of_chunk = MoveBack(first_line_of_underline, options.num_context_lines); + size_t first_line_of_chunk = MoveBack(first_line_of_underline, options->num_context_lines); size_t end_line_of_underline = GetLineIndex(underline.second - 1, line_starts) + 1; size_t end_line_of_chunk = - MoveForward(end_line_of_underline, options.num_context_lines, num_lines); + MoveForward(end_line_of_underline, options->num_context_lines, num_lines); return {first_line_of_chunk, end_line_of_chunk}; } @@ -181,8 +181,8 @@ size_t GetNumLines(const std::string& text, const std::vector& line_star } } -size_t GetLineNumberWidth(size_t num_lines, const DocPrinterOptions& options) { - if (options.print_line_numbers) { +size_t GetLineNumberWidth(size_t num_lines, const PrinterConfig& options) { + if (options->print_line_numbers) { return std::to_string(num_lines).size() + 1; } else { return 0; @@ -190,8 +190,7 @@ size_t GetLineNumberWidth(size_t num_lines, const DocPrinterOptions& options) { } std::string DecorateText(const std::string& text, const std::vector& line_starts, - const DocPrinterOptions& options, - const std::vector& underlines) { + const PrinterConfig& options, const std::vector& underlines) { size_t num_lines = GetNumLines(text, line_starts); size_t line_number_width = GetLineNumberWidth(num_lines, options); @@ -237,7 +236,7 @@ std::string DecorateText(const std::string& text, const std::vector& lin } // anonymous namespace -DocPrinter::DocPrinter(const DocPrinterOptions& options) : options_(options) { +DocPrinter::DocPrinter(const PrinterConfig& options) : options_(options) { line_starts_.push_back(0); } diff --git a/src/script/printer/doc_printer/base_doc_printer.h b/src/script/printer/doc_printer/base_doc_printer.h index db1d733d96ad..7851ce061b0d 100644 --- a/src/script/printer/doc_printer/base_doc_printer.h +++ b/src/script/printer/doc_printer/base_doc_printer.h @@ -35,23 +35,6 @@ namespace printer { /*! \brief Range of byte offsets in a string */ using ByteSpan = std::pair; -/*! \brief Options to customize DocPrinter's output */ -struct DocPrinterOptions { - /*! \brief Number of spaces for one level of indentation */ - int indent_spaces = 4; - - /*! \brief Whether to print the line numbers */ - bool print_line_numbers = false; - - /*! - * \brief Number of context lines to print around the underlined text. - * - * If set to a non-default value `n`, only print `n` context lines before and after - * the underlined pieces of text. - */ - size_t num_context_lines = std::numeric_limits::max(); -}; - /*! * \brief DocPrinter is responsible for printing Doc tree into text format * \details This is the base class for translating Doc into string. @@ -67,7 +50,8 @@ class DocPrinter { * * \param options the option for printer */ - explicit DocPrinter(const DocPrinterOptions& options); + explicit DocPrinter(const PrinterConfig& options); + virtual ~DocPrinter() = default; /*! @@ -224,13 +208,13 @@ class DocPrinter { * \brief Increase the indent level of any content to be * printed after this call */ - void IncreaseIndent() { indent_ += options_.indent_spaces; } + void IncreaseIndent() { indent_ += options_->indent_spaces; } /*! 
* \brief Decrease the indent level of any content to be * printed after this call */ - void DecreaseIndent() { indent_ -= options_.indent_spaces; } + void DecreaseIndent() { indent_ -= options_->indent_spaces; } /*! * \brief Add a new line into the output stream @@ -258,7 +242,7 @@ class DocPrinter { void MarkSpan(const ByteSpan& span, const ObjectPath& path); /*! \brief Options to customize certain aspects of the output */ - DocPrinterOptions options_; + PrinterConfig options_; /*! \brief the current level of indent */ int indent_ = 0; diff --git a/src/script/printer/doc_printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc index 8634236df5c3..ce6b8e7f423c 100644 --- a/src/script/printer/doc_printer/python_doc_printer.cc +++ b/src/script/printer/doc_printer/python_doc_printer.cc @@ -142,7 +142,7 @@ ExprPrecedence GetExprPrecedence(const ExprDoc& doc) { class PythonDocPrinter : public DocPrinter { public: - explicit PythonDocPrinter(const DocPrinterOptions& options) : DocPrinter(options) {} + explicit PythonDocPrinter(const PrinterConfig& options) : DocPrinter(options) {} protected: using DocPrinter::PrintDoc; @@ -642,17 +642,12 @@ void PythonDocPrinter::PrintTypedDoc(const ClassDoc& doc) { NewLineWithoutIndent(); } -String DocToPythonScript(Doc doc, int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) { - DocPrinterOptions options; - options.indent_spaces = indent_spaces; - options.print_line_numbers = print_line_numbers; - if (num_context_lines >= 0) { - options.num_context_lines = num_context_lines; +String DocToPythonScript(Doc doc, const PrinterConfig& cfg) { + if (cfg->num_context_lines < 0) { + cfg->num_context_lines = std::numeric_limits::max(); } - - PythonDocPrinter printer(options); - printer.Append(doc, path_to_underline); + PythonDocPrinter printer(cfg); + printer.Append(doc, cfg->path_to_underline); std::string result = printer.GetString(); int last_space = result.size(); while (last_space > 0 && std::isspace(result[last_space - 1])) { diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc index e438919f4b1b..4a246e169276 100644 --- a/src/script/printer/ir/ir.cc +++ b/src/script/printer/ir/ir.cc @@ -52,7 +52,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) BaseFunc func = kv.second; (*f)->stmts.push_back(d->AsDoc(func, p->Attr("functions")->MapValue(gv))); } - return ClassDoc(IdDoc("Module"), {IR("ir_module")}, (*f)->stmts); + return ClassDoc(IdDoc("Module"), {IR(d, "ir_module")}, (*f)->stmts); } }); @@ -63,43 +63,44 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](GlobalVar gv, ObjectPath p, IRDocsifier d) -> Doc { - return IR("GlobalVar")->Call({LiteralDoc::Str(gv->name_hint, p->Attr("name_hint"))}); + return IR(d, "GlobalVar")->Call({LiteralDoc::Str(gv->name_hint, p->Attr("name_hint"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Op op, ObjectPath p, IRDocsifier d) -> Doc { - return IR("Op")->Call({LiteralDoc::Str(op->name, p->Attr("name"))}); + return IR(d, "Op")->Call({LiteralDoc::Str(op->name, p->Attr("name"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](TypeVar var, ObjectPath p, IRDocsifier d) -> Doc { - return IR("TypeVar")->Call({LiteralDoc::Str(var->name_hint, p->Attr("name_hint")), // - LiteralDoc::Str(TypeKind2String(var->kind), p->Attr("kind"))}); + return IR(d, "TypeVar") + ->Call({LiteralDoc::Str(var->name_hint, p->Attr("name_hint")), // + 
LiteralDoc::Str(TypeKind2String(var->kind), p->Attr("kind"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](GlobalTypeVar var, ObjectPath p, IRDocsifier d) -> Doc { - return IR("GlobalTypeVar") + return IR(d, "GlobalTypeVar") ->Call({LiteralDoc::Str(var->name_hint, p->Attr("name_hint")), LiteralDoc::Str(TypeKind2String(var->kind), p->Attr("kind"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](RelayRefType ref, ObjectPath p, IRDocsifier d) -> Doc { - return IR("RelayRef")->Call({d->AsDoc(ref->value, p->Attr("value"))}); + return IR(d, "RelayRef")->Call({d->AsDoc(ref->value, p->Attr("value"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](TensorType type, ObjectPath p, IRDocsifier d) -> Doc { - return IR("TensorType") + return IR(d, "TensorType") ->Call({d->AsDoc(type->shape, p->Attr("shape")), LiteralDoc::DataType(type->dtype, p->Attr("dtype"))}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](FuncType func_type, ObjectPath p, IRDocsifier d) -> Doc { - return IR("FuncType") + return IR(d, "FuncType") ->Call({ d->AsDoc(func_type->type_params, p->Attr("type_params")), d->AsDoc(func_type->arg_types, p->Attr("arg_types")), @@ -109,19 +110,17 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](IncompleteType ty, ObjectPath p, IRDocsifier d) -> Doc { - return IR("IncompleteType")->Call({}); + return IR(d, "IncompleteType")->Call({}); }); -void ReprPrintIRModule(const ObjectRef& mod, ReprPrinter* p) { +std::string ReprPrintIRModule(const ObjectRef& mod, const PrinterConfig& cfg) { if (const auto* f = runtime::Registry::Get("relay.ir.PrintRelayModule")) { if (Optional s = (*f)(mod)) { - p->stream << s.value(); - return; + return s.value(); } } - std::string res = - DocToPythonScript(IRDocsifier()->AsDoc(Downcast(mod), ObjectPath::Root())); - p->stream << res; + Doc doc = IRDocsifier(cfg)->AsDoc(mod, ObjectPath::Root()); + return DocToPythonScript(doc, cfg); } TVM_SCRIPT_REPR(TypeVarNode, ReprPrintIR); diff --git a/src/script/printer/ir/script_method.cc b/src/script/printer/ir/script_method.cc deleted file mode 100644 index 01d3ede7ea6c..000000000000 --- a/src/script/printer/ir/script_method.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include - -#include "./utils.h" - -namespace tvm { - -std::string IRModuleNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) const { - using namespace tvm::script::printer; - return DocToPythonScript(IRDocsifier()->AsDoc(GetRef(this), ObjectPath::Root()), - indent_spaces, print_line_numbers, num_context_lines, path_to_underline); -} - -TVM_REGISTER_GLOBAL("ir.Module_Script").set_body_method(&IRModuleNode::Script); - -} // namespace tvm diff --git a/src/script/printer/ir/utils.h b/src/script/printer/ir/utils.h index 820fe13df3c6..d20756e6081a 100644 --- a/src/script/printer/ir/utils.h +++ b/src/script/printer/ir/utils.h @@ -23,9 +23,9 @@ #include #include #include -#include #include +#include #include #include "../utils.h" @@ -35,7 +35,9 @@ namespace script { namespace printer { /*! \brief Creates the IR common prefix, which is by default `I` */ -inline ExprDoc IR(const String& attr) { return IdDoc(Default::Prefix("ir"))->Attr(attr); } +inline ExprDoc IR(const IRDocsifier& d, const String& attr) { + return IdDoc(d->cfg->ir_prefix)->Attr(attr); +} class IRFrameNode : public FrameNode { public: @@ -57,15 +59,14 @@ class IRFrame : public Frame { TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRFrame, Frame, IRFrameNode); }; -inline void ReprPrintIR(const ObjectRef& obj, ReprPrinter* p) { - IRDocsifier d; +/*! \brief Redirected method for the ReprPrinter */ +inline std::string ReprPrintIR(const ObjectRef& obj, const PrinterConfig& cfg) { + IRDocsifier d(cfg); With f(d); (*f)->AddDispatchToken(d, "ir"); - try { - p->stream << DocToPythonScript(Docsify(obj, d, *f)); - } catch (const Error& e) { - HandleUnsupportedFallback(e, obj, p); - } + std::ostringstream oss; + oss << Docsify(obj, d, *f, cfg); + return oss.str(); } } // namespace printer diff --git a/src/script/printer/ir_docsifier.cc b/src/script/printer/ir_docsifier.cc index 4c52ce890c9d..5a0d2bd6bbe0 100644 --- a/src/script/printer/ir_docsifier.cc +++ b/src/script/printer/ir_docsifier.cc @@ -144,8 +144,9 @@ void IRDocsifierNode::SetCommonPrefix(const ObjectRef& root, this->common_prefix = std::move(visitor.common_prefix); } -IRDocsifier::IRDocsifier() { +IRDocsifier::IRDocsifier(const PrinterConfig& cfg) { auto n = make_object(); + n->cfg = cfg; n->dispatch_tokens.push_back(""); data_ = std::move(n); } diff --git a/src/script/printer/printer.cc b/src/script/printer/printer.cc deleted file mode 100644 index 878b380a3717..000000000000 --- a/src/script/printer/printer.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include -#include - -namespace tvm { -namespace script { -namespace printer { - -Default* Default::Instance() { - static Default inst; - return &inst; -} - -TVM_REGISTER_GLOBAL("script.printer.DefaultIRPrefix") - .set_body_typed([](std::string ir, std::string prefix) { Default::Prefix(ir) = prefix; }); -TVM_REGISTER_GLOBAL("script.printer.DefaultBufferDType") - .set_body_typed([](runtime::DataType dtype) { Default::BufferDType() = dtype; }); -TVM_REGISTER_GLOBAL("script.printer.DefaultIntDType").set_body_typed([](runtime::DataType dtype) { - Default::IntDType() = dtype; -}); -TVM_REGISTER_GLOBAL("script.printer.DefaultFloatDType").set_body_typed([](runtime::DataType dtype) { - Default::FloatDType() = dtype; -}); -TVM_REGISTER_GLOBAL("script.printer.VerboseExpr").set_body_typed([](bool verbose_expr) { - Default::VerboseExpr() = verbose_expr; -}); - -} // namespace printer -} // namespace script -} // namespace tvm diff --git a/src/script/printer/tir/block.cc b/src/script/printer/tir/block.cc index f78e7037c3e0..a5b8d6609622 100644 --- a/src/script/printer/tir/block.cc +++ b/src/script/printer/tir/block.cc @@ -68,7 +68,7 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // auto print_single_iter_var = [&](int i) { tir::IterVar iter_var = block->iter_vars[i]; ObjectPath iter_var_p = block_p->Attr("iter_var")->ArrayIndex(i); - ExprDoc rhs = TIR("axis"); + ExprDoc rhs = TIR(d, "axis"); if (iter_var->iter_type == tir::IterVarType::kDataPar) { rhs = rhs->Attr("spatial"); } else if (iter_var->iter_type == tir::IterVarType::kCommReduce) { @@ -128,7 +128,7 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // binding_paths.push_back(iter_var_p->Attr("iter_type")); binding_type += iter_var->iter_type == tir::IterVarType::kDataPar ? "S" : "R"; } - ExprDoc rhs = TIR("axis")->Attr("remap"); + ExprDoc rhs = TIR(d, "axis")->Attr("remap"); ExprDoc binding_str = LiteralDoc::Str(binding_type, NullOpt); binding_str->source_paths = std::move(binding_paths); rhs = rhs->Call({binding_str, ListDoc(loop_var_doc)}); @@ -151,8 +151,9 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // if (realize) { ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool()); if (!tir::is_one(realize->predicate)) { - (*frame)->stmts.push_back(ExprStmtDoc(TIR("where")->Call( - {d->AsDoc(realize->predicate, realize_p->Attr("predicate"))}))); + (*frame)->stmts.push_back(ExprStmtDoc( + TIR(d, "where") + ->Call({d->AsDoc(realize->predicate, realize_p->Attr("predicate"))}))); } } // Step 3. Handle block read/write regions @@ -161,17 +162,17 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // for (int i = 0, n = block->reads.size(); i < n; ++i) { reads.push_back(d->AsDoc(block->reads[i], block_p->Attr("reads")->ArrayIndex(i))); } - (*frame)->stmts.push_back(ExprStmtDoc(TIR("reads")->Call(reads))); + (*frame)->stmts.push_back(ExprStmtDoc(TIR(d, "reads")->Call(reads))); Array writes; for (int i = 0, n = block->writes.size(); i < n; ++i) { writes.push_back(d->AsDoc(block->writes[i], block_p->Attr("writes")->ArrayIndex(i))); } - (*frame)->stmts.push_back(ExprStmtDoc(TIR("writes")->Call(writes))); + (*frame)->stmts.push_back(ExprStmtDoc(TIR(d, "writes")->Call(writes))); } // Step 4. Handle block attributes if (!block->annotations.empty()) { (*frame)->stmts.push_back(ExprStmtDoc( - TIR("block_attr") + TIR(d, "block_attr") ->Call({d->AsDoc(block->annotations, block_p->Attr("annotations"))}))); } // Step 5. 
Handle `alloc_buffer` @@ -194,7 +195,7 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // tir::Stmt init = block->init.value(); With init_frame(d, init); AsDocBody(init, block_p->Attr("init"), init_frame->get(), d); - (*frame)->stmts.push_back(ScopeDoc(NullOpt, TIR("init")->Call({}), (*init_frame)->stmts)); + (*frame)->stmts.push_back(ScopeDoc(NullOpt, TIR(d, "init")->Call({}), (*init_frame)->stmts)); } // Step 8. Handle block body AsDocBody(block->body, block_p->Attr("body"), frame->get(), d); @@ -205,7 +206,7 @@ Doc PrintBlock(IRDocsifier d, tir::Block block, ObjectPath block_p, // kwargs_values.push_back(LiteralDoc::Boolean(true, NullOpt)); } return ScopeDoc(NullOpt, - TIR("block") // + TIR(d, "block") // ->Call({LiteralDoc::Str(block->name_hint, block_p->Attr("name_hint"))}, kwargs_keys, kwargs_values), (*frame)->stmts); diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc index b947039b58de..b4429dc9afc9 100644 --- a/src/script/printer/tir/buffer.cc +++ b/src/script/printer/tir/buffer.cc @@ -55,7 +55,7 @@ Map BufferAttrs(const tir::Buffer& buffer, const ObjectPath& p, // Step 1. Handle `buffer.shape` array_out_line_var_def(buffer->shape, p->Attr("shape"), "shape"); // Step 2. Handle `buffer.dtype` - if (buffer->dtype != Default::BufferDType()) { + if (buffer->dtype != d->cfg->buffer_dtype) { kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype, p->Attr("dtype"))); } // Step 3. Handle `buffer.data` @@ -123,7 +123,7 @@ ExprDoc BufferCall(const ExprDoc& prefix, const Map& attrs, Arr ExprDoc BufferDecl(const tir::Buffer& buffer, const String& method, const Array& args, const ObjectPath& p, const Frame& frame, const IRDocsifier& d) { - return BufferCall(/*prefix=*/TIR(method), + return BufferCall(/*prefix=*/TIR(d, method), /*attrs=*/BufferAttrs(buffer, p, frame, d), /*args=*/args); } @@ -134,7 +134,7 @@ ExprDoc BufferAttn(const tir::Buffer& buffer, const ObjectPath& p, const Frame& ExprDoc shape = attrs.Get("shape").value(); ExprDoc dtype = attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype, p->Attr("dtype"))); - return TIR("Buffer")->Call({shape, dtype}, {}, {}); + return TIR(d, "Buffer")->Call({shape, dtype}, {}, {}); } Array BufferIndices(const Array& indices, const ObjectPath& p, @@ -251,7 +251,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) "", [](tir::ProducerRealize stmt, ObjectPath p, IRDocsifier d) -> Doc { ExprDoc prefix = IdDoc(stmt->producer->GetNameHint()); prefix = prefix[BufferSlices(stmt->bounds, p->Attr("bounds"), d)]; - prefix = TIR("ProducerRealize") + prefix = TIR(d, "ProducerRealize") ->Call({prefix, d->AsDoc(stmt->condition, p->Attr("condition"))}); With f(d, stmt); AsDocBody(stmt->body, p->Attr("body"), f->get(), d); diff --git a/src/script/printer/tir/expr.cc b/src/script/printer/tir/expr.cc index 6e0cfd420262..ab91764b6a0b 100644 --- a/src/script/printer/tir/expr.cc +++ b/src/script/printer/tir/expr.cc @@ -34,7 +34,7 @@ Doc PrintVar(const tir::Var& var, const ObjectPath& var_p, const IRDocsifier& d) ExprDoc rhs = d->AsDoc(type, var_p->Attr("type_annotation")); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } else { - ExprDoc rhs = TIR("var")->Call({LiteralDoc::DataType(var->dtype, var_p->Attr("dtype"))}); + ExprDoc rhs = TIR(d, "var")->Call({LiteralDoc::DataType(var->dtype, var_p->Attr("dtype"))}); opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt)); } } @@ -57,7 +57,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", 
[](tir::IterVar var, ObjectPath var_p, IRDocsifier d) -> Doc { - return TIR("iter_var") + return TIR(d, "iter_var") ->Call({ d->AsDoc(var->var, var_p->Attr("var")), d->AsDoc(var->dom, var_p->Attr("dom")), @@ -70,7 +70,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Not node, ObjectPath p, IRDocsifier d) -> Doc { ExprDoc a = d->AsDoc(node->a, p->Attr("a")); if (a->IsInstance()) { - return TIR("Not")->Call({a}); + return TIR(d, "Not")->Call({a}); } return OperationDoc(OperationDocNode::Kind::kNot, {a}); }); @@ -84,21 +84,22 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Cast cast, ObjectPath p, IRDocsifier d) -> Doc { ExprDoc dtype = LiteralDoc::DataType(cast->dtype, p->Attr("dtype")); ExprDoc value = d->AsDoc(cast->value, p->Attr("value")); - return TIR("Cast")->Call({dtype, value}); + return TIR(d, "Cast")->Call({dtype, value}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Select select, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("Select")->Call({ - d->AsDoc(select->condition, p->Attr("condition")), - d->AsDoc(select->true_value, p->Attr("true_value")), - d->AsDoc(select->false_value, p->Attr("false_value")), - }); + return TIR(d, "Select") + ->Call({ + d->AsDoc(select->condition, p->Attr("condition")), + d->AsDoc(select->true_value, p->Attr("true_value")), + d->AsDoc(select->false_value, p->Attr("false_value")), + }); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Ramp ramp, ObjectPath ramp_p, IRDocsifier d) -> Doc { - return TIR("Ramp")->Call({ + return TIR(d, "Ramp")->Call({ d->AsDoc(ramp->base, ramp_p->Attr("base")), d->AsDoc(ramp->stride, ramp_p->Attr("stride")), LiteralDoc::Int(ramp->lanes, ramp_p->Attr("lanes")), @@ -107,7 +108,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Broadcast bc, ObjectPath bc_p, IRDocsifier d) -> Doc { - return TIR("Broadcast") + return TIR(d, "Broadcast") ->Call({ d->AsDoc(bc->value, bc_p->Attr("value")), LiteralDoc::Int(bc->lanes, bc_p->Attr("lanes")), @@ -117,10 +118,11 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::Shuffle shuffle, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("Shuffle")->Call({ - d->AsDoc(shuffle->vectors, p->Attr("vectors")), - d->AsDoc(shuffle->indices, p->Attr("indices")), - }); + return TIR(d, "Shuffle") + ->Call({ + d->AsDoc(shuffle->vectors, p->Attr("vectors")), + d->AsDoc(shuffle->indices, p->Attr("indices")), + }); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -152,12 +154,12 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } } ExprDoc id = d->AsDoc(r->identity_element, p->Attr("identity_element")); - return TIR("comm_reducer")->Call({lambda, id}); + return TIR(d, "comm_reducer")->Call({lambda, id}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Let let, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("let")->Call({ + return TIR(d, "let")->Call({ d->AsDoc(let->var, p->Attr("var")), d->AsDoc(let->value, p->Attr("value")), d->AsDoc(let->body, p->Attr("body")), @@ -194,7 +196,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (op_names.count(GetRef(op)) == 0) { LOG(WARNING) << "No TScriptPrinterName attribute for " << op->name; } - prefix = TIR(name); + prefix = TIR(d, name); } else if (const auto* gv = call->op.as()) { prefix = LiteralDoc::Str(gv->name_hint, call_p->Attr("op")); } else { @@ -217,7 +219,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, 
vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](tir::Any any, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("Any")->Call({}); + return TIR(d, "Any")->Call({}); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -228,8 +230,9 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) ExprDoc axis = d->AsDoc(r->axis, p->Attr("axis")); ExprDoc condition = d->AsDoc(r->condition, p->Attr("condition")); ExprDoc value_index = LiteralDoc::Int(r->value_index, p->Attr("value_index")); - return TIR("reduce")->Call({combiner}, {"source", "init", "axis", "condition", "value_index"}, - {source, init, axis, condition, value_index}); + return TIR(d, "reduce") + ->Call({combiner}, {"source", "init", "axis", "condition", "value_index"}, + {source, init, axis, condition, value_index}); LOG(FATAL) << "ValueError: Reduce should never exist in TIR: " << r; }); @@ -244,7 +247,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) [](tir::NodeType node, ObjectPath p, IRDocsifier d) -> Doc { \ ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ - return TIR(OpString)->Call({a, b}); \ + return TIR(d, OpString)->Call({a, b}); \ }); #define TVM_SCRIPT_PRINTER_DEF_BINARY_WITH_SUGAR(NodeType, OpString, OpKind) \ @@ -254,7 +257,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) ExprDoc a = d->AsDoc(node->a, p->Attr("a")); \ ExprDoc b = d->AsDoc(node->b, p->Attr("b")); \ if (a->IsInstance() && b->IsInstance()) { \ - return TIR(OpString)->Call({a, b}); \ + return TIR(d, OpString)->Call({a, b}); \ } \ return OperationDoc(OperationDocNode::Kind::OpKind, {a, b}); \ }); diff --git a/src/script/printer/tir/for_loop.cc b/src/script/printer/tir/for_loop.cc index 2a81c37061c6..7d21de27a1a2 100644 --- a/src/script/printer/tir/for_loop.cc +++ b/src/script/printer/tir/for_loop.cc @@ -59,7 +59,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) loop_p = loop_p->Attr("body"); } AsDocBody(grid.back()->body, loop_p, (*f).get(), d); - return ForDoc(TupleDoc(lhs), TIR("grid")->Call(rhs), (*f)->stmts); + return ForDoc(TupleDoc(lhs), TIR(d, "grid")->Call(rhs), (*f)->stmts); } // Step 3. If not `T.grid`, print loop kind accordingly ExprDoc lhs = DefineVar(loop->loop_var, *f, d); @@ -81,16 +81,16 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (loop->annotations.empty()) { prefix = IdDoc("range"); } else { - prefix = TIR("serial"); + prefix = TIR(d, "serial"); } } else if (loop->kind == tir::ForKind::kParallel) { - prefix = TIR("parallel"); + prefix = TIR(d, "parallel"); } else if (loop->kind == tir::ForKind::kUnrolled) { - prefix = TIR("unroll"); + prefix = TIR(d, "unroll"); } else if (loop->kind == tir::ForKind::kVectorized) { - prefix = TIR("vectorized"); + prefix = TIR(d, "vectorized"); } else if (loop->kind == tir::ForKind::kThreadBinding) { - prefix = TIR("thread_binding"); + prefix = TIR(d, "thread_binding"); thread = LiteralDoc::Str(loop->thread_binding.value()->thread_tag, loop_p->Attr("thread_binding")); } else { diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index ea7d56e1656d..fbcc2fca3b4b 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -111,7 +111,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) // Step 2. Handle `func->attrs` if (func->attrs.defined() && !func->attrs->dict.empty()) { (*frame)->stmts.push_back( - ExprStmtDoc(TIR("func_attr") // + ExprStmtDoc(TIR(d, "func_attr") // ->Call({d->AsDoc(func->attrs, p->Attr("attrs"))}))); } // Step 3. 
Handle `func->buffer_map` @@ -175,14 +175,14 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) return FunctionDoc( /*name=*/IdDoc(FindFunctionName(d, func)), /*args=*/args, - /*decorators=*/{TIR("prim_func")}, + /*decorators=*/{TIR(d, "prim_func")}, /*return_type=*/ret_type, /*body=*/(*frame)->stmts); }); -void ReprPrintPrimFunc(const ObjectRef& obj, ReprPrinter* p) { - std::string res = DocToPythonScript(IRDocsifier()->AsDoc(obj, ObjectPath::Root())); - p->stream << res; +std::string ReprPrintPrimFunc(const ObjectRef& obj, const PrinterConfig& cfg) { + Doc doc = IRDocsifier(cfg)->AsDoc(obj, ObjectPath::Root()); + return DocToPythonScript(doc, cfg); } TVM_SCRIPT_REPR(tir::PrimFuncNode, ReprPrintPrimFunc); diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc index 1214f822610c..76d3680fec81 100644 --- a/src/script/printer/tir/ir.cc +++ b/src/script/printer/tir/ir.cc @@ -29,12 +29,12 @@ TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](IntImm imm, ObjectPath imm_p, IRDocsifier d) -> Doc { DataType dtype = imm->dtype; - if (dtype == Default::IntDType()) { + if (dtype == d->cfg->int_dtype) { return LiteralDoc::Int(imm->value, imm_p->Attr("value")); } else if (dtype == DataType::Bool()) { return LiteralDoc::Boolean(imm->value, imm_p->Attr("value")); } else { - return TIR(runtime::DLDataType2String(dtype)) // + return TIR(d, runtime::DLDataType2String(dtype)) // ->Call({LiteralDoc::Int(imm->value, imm_p->Attr("value"))}); } }); @@ -42,26 +42,27 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](FloatImm imm, ObjectPath imm_p, IRDocsifier d) -> Doc { DataType dtype = imm->dtype; - if (dtype == Default::FloatDType()) { + if (dtype == d->cfg->float_dtype) { return LiteralDoc::Float(imm->value, imm_p->Attr("value")); } else { - return TIR(runtime::DLDataType2String(dtype)) // + return TIR(d, runtime::DLDataType2String(dtype)) // ->Call({LiteralDoc::Float(imm->value, imm_p->Attr("value"))}); } }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Range range, ObjectPath p, IRDocsifier d) -> Doc { - return TIR("Range")->Call({ - d->AsDoc(range->min, p->Attr("min")), - d->AsDoc(range->extent, p->Attr("extent")), - }); + return TIR(d, "Range") + ->Call({ + d->AsDoc(range->min, p->Attr("min")), + d->AsDoc(range->extent, p->Attr("extent")), + }); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](PrimType ty, ObjectPath p, IRDocsifier d) -> Doc { std::string dtype = ty->dtype.is_void() ? 
"void" : runtime::DLDataType2String(ty->dtype); - return TIR(dtype); + return TIR(d, dtype); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -74,9 +75,9 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) element_type = d->AsDoc(ty->element_type, ty_p->Attr("element_type")); } if (ty->storage_scope == "") { - return TIR("Ptr")->Call({element_type}); + return TIR(d, "Ptr")->Call({element_type}); } else { - return TIR("Ptr")->Call( + return TIR(d, "Ptr")->Call( {element_type, LiteralDoc::Str(ty->storage_scope, ty_p->Attr("storage_scope"))}); } }); @@ -86,13 +87,13 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (ty->fields.empty()) { return LiteralDoc::None(p); } - return TIR("Tuple")->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); + return TIR(d, "Tuple")->Call(d->AsDoc(ty->fields, p->Attr("fields"))->elements); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch("", [](Target target, ObjectPath p, IRDocsifier d) -> Doc { Map config = target->Export(); - return TIR("target")->Call({d->AsDoc(config, p)}); + return TIR(d, "target")->Call({d->AsDoc(config, p)}); }); TVM_SCRIPT_REPR(IntImmNode, ReprPrintTIR); diff --git a/src/script/printer/tir/script_method.cc b/src/script/printer/tir/script_method.cc deleted file mode 100644 index 5cda9a9626db..000000000000 --- a/src/script/printer/tir/script_method.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -#include - -#include "./utils.h" - -namespace tvm { - -std::string PrimExprNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) const { - using namespace tvm::script::printer; - IRDocsifier d; - ObjectRef obj = GetRef(this); - With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); - return DocToPythonScript(Docsify(obj, d, *f), indent_spaces, print_line_numbers, - num_context_lines, path_to_underline); -} - -namespace tir { - -std::string StmtNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) const { - using namespace tvm::script::printer; - IRDocsifier d; - ObjectRef obj = GetRef(this); - With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); - return DocToPythonScript(Docsify(obj, d, *f), indent_spaces, print_line_numbers, - num_context_lines, path_to_underline); -} - -std::string PrimFuncNode::Script(int indent_spaces, bool print_line_numbers, int num_context_lines, - Optional path_to_underline) const { - using namespace tvm::script::printer; - return DocToPythonScript(IRDocsifier()->AsDoc(GetRef(this), ObjectPath::Root()), - indent_spaces, print_line_numbers, num_context_lines, path_to_underline); -} - -TVM_REGISTER_GLOBAL("tir.PrimFuncScript").set_body_method(&PrimFuncNode::Script); -TVM_REGISTER_GLOBAL("tir.StmtScript").set_body_method(&StmtNode::Script); -TVM_REGISTER_GLOBAL("tir.PrimExprScript").set_body_method(&PrimExprNode::Script); - -} // namespace tir -} // namespace tvm diff --git a/src/script/printer/tir/stmt.cc b/src/script/printer/tir/stmt.cc index acdfd7da472b..2820f9ba6384 100644 --- a/src/script/printer/tir/stmt.cc +++ b/src/script/printer/tir/stmt.cc @@ -51,7 +51,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (eval->value->IsInstance()) { return ExprStmtDoc(value); } - return ExprStmtDoc(TIR("evaluate")->Call({value})); + return ExprStmtDoc(TIR(d, "evaluate")->Call({value})); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -75,7 +75,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) stmts->insert(stmts->begin(), AssignDoc(lhs, rhs, type_doc)); return StmtBlockDoc(*stmts); } else { - rhs = TIR("let")->Call({lhs, rhs}); + rhs = TIR(d, "let")->Call({lhs, rhs}); return ScopeDoc(NullOpt, rhs, *stmts); } }); @@ -93,7 +93,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) stmts->insert(stmts->begin(), AssertDoc(cond, msg)); return StmtBlockDoc(*stmts); } - return ScopeDoc(NullOpt, TIR("Assert")->Call({cond, msg}), (*f)->stmts); + return ScopeDoc(NullOpt, TIR(d, "Assert")->Call({cond, msg}), (*f)->stmts); }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -145,7 +145,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) .set_dispatch( // "", [](tir::Prefetch stmt, ObjectPath p, IRDocsifier d) -> Doc { - return ExprStmtDoc(TIR("prefetch") + return ExprStmtDoc(TIR(d, "prefetch") ->Call({ d->AsDoc(stmt->buffer, p->Attr("buffer")), d->AsDoc(stmt->bounds, p->Attr("bounds")), @@ -198,7 +198,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } ExprDoc lhs = DefineVar(stmt->buffer_var, d->frames.back(), d); With f(d, stmt); - ExprDoc rhs = TIR("allocate")->Call(args, kwargs_keys, kwargs_values); + ExprDoc rhs = TIR(d, "allocate")->Call(args, kwargs_keys, kwargs_values); AsDocBody(stmt->body, stmt_p->Attr("body"), f->get(), d); return DoConciseScoping(lhs, rhs, &(*f)->stmts, concise); }); @@ -277,7 +277,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) args.push_back(data_doc); args.push_back(LiteralDoc::DataType(stmt->dtype, 
stmt_p->Attr("dtype"))); args.push_back(d->AsDoc(stmt->extents, stmt_p->Attr("extents"))); - ExprDoc rhs = TIR("allocate_const")->Call(args, kwargs_keys, kwargs_values); + ExprDoc rhs = TIR(d, "allocate_const")->Call(args, kwargs_keys, kwargs_values); With f(d, stmt); ExprDoc lhs = DefineVar(stmt->buffer_var, *f, d); AsDocBody(stmt->body, stmt_p->Attr("body"), f->get(), d); @@ -310,7 +310,7 @@ ExprDoc DocsifyBufferRealize(const tir::BufferRealizeNode* stmt, OptionalAsDoc(stmt->condition, p->Attr("condition"))); } - return TIR("realize")->Call(args, kwargs_keys, kwargs_values); + return TIR(d, "realize")->Call(args, kwargs_keys, kwargs_values); } TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) @@ -351,12 +351,12 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) DefineVar(iter_var->var, f, d); f->stmts.push_back( AssignDoc(d->AsDoc(iter_var->var, iter_var_p->Attr("var")), - TIR("env_thread") + TIR(d, "env_thread") ->Call({LiteralDoc::Str(iter_var->thread_tag, iter_var_p->Attr("thread_tag"))}), // NullOpt)); } - rhs = TIR("launch_thread") + rhs = TIR(d, "launch_thread") ->Call({ d->AsDoc(iter_var->var, stmt_p->Attr("node")), d->AsDoc(stmt->value, stmt_p->Attr("value")), @@ -364,7 +364,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) } } if (!rhs.defined()) { - rhs = TIR("attr")->Call({ + rhs = TIR(d, "attr")->Call({ d->AsDoc(stmt->node, stmt_p->Attr("node")), LiteralDoc::Str(stmt->attr_key, stmt_p->Attr("attr_key")), d->AsDoc(stmt->value, stmt_p->Attr("value")), diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h index e1ffe135229e..88094ee816ca 100644 --- a/src/script/printer/tir/utils.h +++ b/src/script/printer/tir/utils.h @@ -20,7 +20,6 @@ #define TVM_SCRIPT_PRINTER_TIR_UTILS_H_ #include -#include #include #include #include @@ -74,7 +73,9 @@ class TIRFrame : public Frame { }; /*! \brief Creates the TIR common prefix, which is by default `T` */ -inline ExprDoc TIR(const String& attr) { return IdDoc(Default::Prefix("tir"))->Attr(attr); } +inline ExprDoc TIR(const IRDocsifier& d, const String& attr) { + return IdDoc(d->cfg->tir_prefix)->Attr(attr); +} /*! * \brief Defines a variable in the IRDocsifier at the given frame, @@ -187,14 +188,12 @@ inline TIRFrame MakeDispatchFrame(const IRDocsifier& d, const ObjectRef& root, } /*! \brief Redirected method for the ReprPrinter */ -inline void ReprPrintTIR(const ObjectRef& obj, ReprPrinter* p) { - IRDocsifier d; +inline std::string ReprPrintTIR(const ObjectRef& obj, const PrinterConfig& cfg) { + IRDocsifier d(cfg); With f(MakeDispatchFrame(d, obj, ObjectRef(nullptr))); - try { - p->stream << DocToPythonScript(Docsify(obj, d, *f)); - } catch (const tvm::Error& e) { - HandleUnsupportedFallback(e, obj, p); - } + std::ostringstream oss; + oss << Docsify(obj, d, *f, cfg); + return oss.str(); } /*! 
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h index 9f9a7d8299c4..5161f1f9a268 100644 --- a/src/script/printer/utils.h +++ b/src/script/printer/utils.h @@ -20,13 +20,6 @@ #define TVM_SCRIPT_PRINTER_UTILS_H_ #include -#include -#include -#include -#include -#include -#include -#include #include #include @@ -37,13 +30,26 @@ namespace tvm { namespace script { namespace printer { -#define TVM_SCRIPT_REPR(ObjectType, Method) \ - TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable).set_dispatch(Method); +#define TVM_SCRIPT_REPR(ObjectType, Method) \ + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) \ + .set_dispatch(RedirectedReprPrinterMethod); \ + TVM_STATIC_IR_FUNCTOR(TVMScriptPrinter, vtable).set_dispatch(Method); -inline StmtBlockDoc Docsify(const ObjectRef& obj, const IRDocsifier& d, const Frame& f) { +inline void RedirectedReprPrinterMethod(const ObjectRef& obj, ReprPrinter* p) { + try { + p->stream << TVMScriptPrinter::Script(obj, NullOpt); + } catch (const tvm::Error& e) { + LOG(WARNING) << "TVMScript printer falls back to the legacy ReprPrinter with the error:\n" + << e.what(); + p->stream << AsLegacyRepr(obj); + } +} + +inline std::string Docsify(const ObjectRef& obj, const IRDocsifier& d, const Frame& f, + const PrinterConfig& cfg) { Doc doc = d->AsDoc(obj, ObjectPath::Root()); if (const auto* expr_doc = doc.as()) { - if (!Default::VerboseExpr()) { + if (!cfg->verbose_expr) { f->stmts.clear(); } f->stmts.push_back(ExprStmtDoc(GetRef(expr_doc))); @@ -56,14 +62,7 @@ inline StmtBlockDoc Docsify(const ObjectRef& obj, const IRDocsifier& d, const Fr } else { LOG(FATAL) << "TypeError: Unexpected doc type: " << doc->GetTypeKey(); } - return StmtBlockDoc(f->stmts); -} - -inline void HandleUnsupportedFallback(const tvm::Error& error, const ObjectRef& obj, - ReprPrinter* p) { - LOG(WARNING) << "TVMScript printer falls back to the legacy ReprPrinter with the error:\n" - << error.what(); - p->stream << AsLegacyRepr(obj); + return DocToPythonScript(StmtBlockDoc(f->stmts), cfg); } } // namespace printer diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index c73ae291930c..71da86bff763 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -15,31 +15,15 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-docstring -from contextlib import contextmanager - +import tvm.testing from tvm import ir, tir from tvm.ir import Range from tvm.script.ir_builder import IRBuilder from tvm.script.ir_builder import tir as T -from tvm.script.printer import default -import tvm.testing - - -@contextmanager -def verbose_expr(): - try: - default.verbose_expr(True) - yield - finally: - default.verbose_expr(False) def _assert_print(obj, expected): - with verbose_expr(): - if isinstance(obj, (tir.PrimFunc, tir.PrimExpr, tir.Stmt)): - assert obj.script().strip() == expected.strip() - assert str(obj).strip() == expected.strip() - assert repr(obj).strip() == expected.strip() + assert obj.script(verbose_expr=True).strip() == expected.strip() def test_prim_func(): From 66ef2a3cc400c3dee8b4bc7a777f5a0c89e74b86 Mon Sep 17 00:00:00 2001 From: joshherr-quic <95375797+joshherr-quic@users.noreply.github.com> Date: Tue, 24 Jan 2023 13:52:32 -0600 Subject: [PATCH 218/286] [Hexagon]Float and quantized dense operators with schedules (#12873) This PR implements dense operators for float types and quantized types. 
The quantized implementation uses floating-point numbers for its intermediate
compute type; fixed-point arithmetic will be investigated in the future.

float16 accuracy is questionable and needs further investigation in an actual
model (not just a unit test).
---
 python/tvm/topi/hexagon/qnn/__init__.py       |  17 +-
 python/tvm/topi/hexagon/qnn/qdense.py         | 193 ++++++++++++
 python/tvm/topi/hexagon/slice_ops/__init__.py |   1 +
 python/tvm/topi/hexagon/slice_ops/dense.py    | 144 +++++++++
 python/tvm/topi/hexagon/utils.py              |  26 +-
 .../contrib/test_hexagon/infrastructure.py    |   6 +
 .../topi/slice_op/test_dense_slice.py         | 282 ++++++++++++++++++
 7 files changed, 654 insertions(+), 15 deletions(-)
 create mode 100644 python/tvm/topi/hexagon/qnn/qdense.py
 create mode 100644 python/tvm/topi/hexagon/slice_ops/dense.py
 create mode 100644 tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py

diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py
index b8cdc7a26d96..022a552c9d54 100644
--- a/python/tvm/topi/hexagon/qnn/__init__.py
+++ b/python/tvm/topi/hexagon/qnn/__init__.py
@@ -17,16 +17,13 @@

 """ Computes and schedules for Hexagon quantized ops """

+from .adaptive_avg_pool1d import *
 from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule
-from .qadd_qsub_qmul import *
-from .dequantize import (
-    dequantize_compute,
-    dequantize_schedule,
-)
-
-from .quantize import quantize_compute, tir_quantize_schedule
+from .conv2d_alter_op import *
+from .dequantize import dequantize_compute, dequantize_schedule
+from .global_avg_pool2d import *
 from .nn import *
+from .qadd_qsub_qmul import *
+from .qdense import *
 from .qdepthwise_conv2d_slice import qdepthwise_conv2d_compute, qdepthwise_conv2d_schedule
-from .adaptive_avg_pool1d import *
-from .global_avg_pool2d import *
-from .conv2d_alter_op import *
+from .quantize import quantize_compute, tir_quantize_schedule
diff --git a/python/tvm/topi/hexagon/qnn/qdense.py b/python/tvm/topi/hexagon/qnn/qdense.py
new file mode 100644
index 000000000000..53f9077e56ba
--- /dev/null
+++ b/python/tvm/topi/hexagon/qnn/qdense.py
@@ -0,0 +1,193 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Schedule for dense operator"""
+
+from tvm import te, tir
+from tvm.topi import tag
+from ..utils import get_layout_transform_fn
+
+
+def qdense_compute(
+    tensor_a,
+    tensor_b,
+    zero_a,
+    scale_a,
+    zero_b,
+    scale_b,
+    zero_out=None,
+    scale_out=None,
+    bias=None,
+    q_dtype=None,
+):
+    """Hexagon's implementation of a sliced dense operator in Topi.
+    Uses matmul.
+
+    Parameters
+    ----------
+    tensor_a : tvm.te.Tensor
+        data 2-D with shape [batch, in_dim]
+
+    tensor_b : tvm.te.Tensor
+        weight 2-D with shape [in_dim, out_dim]
+
+    zero_a : integer
+        quantization zero point for tensor a.
+ + scale_a : float + quantization scale for tensor a. + + zero_b : integer + quantization zero point for tensor b. + + scale_b : float + quantization scale for tensor b. + + zero_out : Optional[integer] + quantization zero point for output. + + scale_out : Optional[float] + quantization scale for output. + + bias : Optional[tvm.te.Tensor] + 1-D with shape [out_dim] + + q_dtype : Optional[str] + The output type. + + Returns + ------- + mat : tvm.te.Tensor + 2-D with shape [batch, out_dim] + + """ + if bias is not None: + assert len(bias.shape) == 1 + if q_dtype is None: + q_dtype = tensor_a.dtype + + batch, in_dim = tensor_a.shape + out_dim, red_dim = tensor_b.shape + + # cmp should be done by values + assert int(in_dim) == int(red_dim) + + k = te.reduce_axis((0, in_dim), name="k") + compute_lambda = lambda n, m: te.sum( + scale_a + * (tensor_a[n, k].astype("float32") - zero_a) + * scale_b + * (tensor_b[k, m].astype("float32") - zero_b), + axis=k, + ) + compute_name = "qmatmul_sliced" + + out = te.compute( + (batch, out_dim), + compute_lambda, + name=compute_name, + attrs={"layout_free_placeholders": [tensor_b]}, + ) + + if bias is not None: + out = te.compute( + (batch, out_dim), + lambda i, j: out[i, j] + bias[j], + tag=tag.BROADCAST, + name="bias", + ) + + # Requantization of dense + if scale_out is not None: + out = te.compute( + (batch, out_dim), + lambda *i: (out[i] / scale_out + zero_out).astype(q_dtype), + name="requantize", + ) + + return out + + +def qdense_schedule(outs, ins, output_layout: str, input_layout: str): + """Schedule for dense op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of dense in the format + of an array of tensors. + + ins: Array of Tensor + Input tensors into graph. + + output_layout: str + Descriptor string for physical layout + + input_layout: str + Descriptor string for physical layout + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + if not isinstance(ins, list): + ins = [ins] + if not isinstance(outs, list): + outs = [outs] + + func = te.create_prim_func([*ins, *outs]) + s = tir.Schedule(func) + + matmul = s.get_block("qmatmul_sliced") + try: + requantize = s.get_block("requantize") + except tir.schedule.schedule.ScheduleError: + requantize = None + try: + bias = s.get_block("bias") + except tir.schedule.schedule.ScheduleError: + bias = None + + input_transform_fn = get_layout_transform_fn(input_layout) + output_transform_fn = get_layout_transform_fn(output_layout) + + # Transform input and output buffer + s.transform_layout(matmul, ("read", 0), input_transform_fn) + if requantize is not None: + s.transform_layout(requantize, ("write", 0), output_transform_fn) + elif bias is not None: + s.transform_layout(bias, ("write", 0), output_transform_fn) + else: + s.transform_layout(matmul, ("write", 0), output_transform_fn) + + # Vectorize + _, matmul_c, _ = s.get_loops(matmul) + _, matmul_c_inner = s.split(matmul_c, [None, 128]) + s.vectorize(matmul_c_inner) + + # Compute everything inline + if bias is not None and requantize is not None: + _, bias_c = s.get_loops(bias) + s.compute_at(matmul, bias_c) + _, out_c = s.get_loops(requantize) + s.compute_at(bias, out_c) + elif bias is not None and requantize is None: + _, out_c = s.get_loops(bias) + s.compute_at(matmul, out_c) + + return s diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index 6b17b64489a9..46ae0c53200f 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -37,3 +37,4 @@ from .dwconv2d import * from .depth_to_space import d2s_compute, d2s_schedule from .global_avg_pool2d import * +from .dense import * diff --git a/python/tvm/topi/hexagon/slice_ops/dense.py b/python/tvm/topi/hexagon/slice_ops/dense.py new file mode 100644 index 000000000000..a298ff4bc98e --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/dense.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Schedule for dense operator""" + +from tvm import te, tir +from tvm.topi import tag +from ..utils import get_layout_transform_fn + + +def dense_compute(tensor_a, tensor_b, bias=None, out_dtype=None): + """Hexagon's implementation of a sliced dense operator in Topi. + Uses matmul. + + Parameters + ---------- + tensor_a : tvm.te.Tensor + data 2-D with shape [batch, in_dim] + + tensor_b : tvm.te.Tensor + weight 2-D with shape [in_dim, out_dim] + + bias : Optional[tvm.te.Tensor] + 1-D with shape [out_dim] + + out_dtype : Optional[str] + The output type. This is used for mixed precision. 
+
+    Returns
+    -------
+    output : tvm.te.Tensor
+        2-D with shape [batch, out_dim]
+
+    """
+    if bias is not None:
+        assert len(bias.shape) == 1
+    if out_dtype is None:
+        out_dtype = tensor_a.dtype
+
+    batch, in_dim = tensor_a.shape
+    out_dim, red_dim = tensor_b.shape
+
+    # cmp should be done by values
+    assert int(in_dim) == int(red_dim)
+
+    k = te.reduce_axis((0, in_dim), name="k")
+    compute_lambda = lambda n, m: te.sum(
+        tensor_a[n, k].astype(out_dtype) * tensor_b[k, m].astype(out_dtype), axis=k
+    )
+    compute_name = "matmul_sliced"
+    compute_tag = "matmul"
+
+    mat = te.compute(
+        (batch, out_dim),
+        compute_lambda,
+        name=compute_name,
+        tag=compute_tag,
+        attrs={"layout_free_placeholders": [tensor_b]},
+    )
+
+    if bias is not None:
+        mat = te.compute(
+            (batch, out_dim),
+            lambda i, j: mat[i, j] + bias[j],
+            tag=tag.BROADCAST,
+            name="bias",
+        )
+
+    return mat
+
+
+def dense_schedule(outs, ins, output_layout: str, input_layout: str):
+    """Schedule for dense op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of dense in the format
+        of an array of tensors.
+
+    ins: Array of Tensor
+        Input tensors into graph.
+
+    output_layout: str
+        Descriptor string for physical layout
+
+    input_layout: str
+        Descriptor string for physical layout
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    if not isinstance(ins, list):
+        ins = [ins]
+    if not isinstance(outs, list):
+        outs = [outs]
+
+    func = te.create_prim_func([*ins, *outs])
+    s = tir.Schedule(func)
+
+    matmul = s.get_block("matmul_sliced")
+    try:
+        bias = s.get_block("bias")
+    except tir.schedule.schedule.ScheduleError:
+        bias = None
+
+    input_transform_fn = get_layout_transform_fn(input_layout)
+    output_transform_fn = get_layout_transform_fn(output_layout)
+
+    # No bias
+    if bias is None:
+        s.transform_layout(matmul, ("read", 0), input_transform_fn)
+        # s.transform_layout(matmul, ("read", 1), input_transform_fn)
+        s.transform_layout(matmul, ("write", 0), output_transform_fn)
+    else:
+        s.transform_layout(matmul, ("read", 0), input_transform_fn)
+        s.transform_layout(bias, ("write", 0), output_transform_fn)
+
+    _, matmul_c, _ = s.get_loops(matmul)
+    _, matmul_c_inner = s.split(matmul_c, [None, 64])
+    s.vectorize(matmul_c_inner)
+
+    if bias is not None:
+        _, bias_c = s.get_loops(bias)
+        _, bias_c_inner = s.split(bias_c, [None, 64])
+        s.vectorize(bias_c_inner)
+
+    return s
diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py
index 78ed21e8a13b..86aa87adf319 100644
--- a/python/tvm/topi/hexagon/utils.py
+++ b/python/tvm/topi/hexagon/utils.py
@@ -75,6 +75,21 @@ def nc_1024c_2d(n, c):
     return [n, c // 1024, te.AXIS_SEPARATOR, c % 1024]


+def nc_2048c_1d(n, c):
+    """Return index map for nc_2048c 1d layout"""
+    return [n, c // 2048, c % 2048]
+
+
+def nc_2048c_2d(n, c):
+    """Return index map for nc_2048c 2d layout"""
+    return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048]
+
+
+def nc_1024c_1d(n, c):
+    """Return index map for nc_1024c 1d layout"""
+    return [n, c // 1024, c % 1024]
+
+
 def nhwc_4h2w32c2w_2d(n, h, w, c):
     """Return index map for nhwc_4h2w32c2w 2d layout"""
     return [n, h // 4, w // 4, c // 32, te.AXIS_SEPARATOR, h % 4, (w % 4) // 2, c % 32, w % 2]
@@ -100,11 +115,6 @@ def nc_2048_2d(n, c):
     return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048]


-def nc_2048c_2d(n, c):
-    """Return index map for nc_2048 2d layout"""
-    return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048]
-
-
 def nhwc_8h8w32c_2d(n, h, w, c):
     """Return index map for nhwc_8h8w32c 2d layout"""
return [n, h // 8, w // 8, c // 32, te.AXIS_SEPARATOR, h % 8, w % 8, c % 32] @@ -170,8 +180,14 @@ def get_layout_transform_fn(layout): return nc_512c_1d if layout == "nhwc-4h2w32c2w-2d": return nhwc_4h2w32c2w_2d + if layout == "nc-2048c-1d": + return nc_2048c_1d + if layout == "nc-2048c-2d": + return nc_2048c_2d if layout == "nc-1024c-2d": return nc_1024c_2d + if layout == "nc-1024c-1d": + return nc_1024c_1d if layout == "iohw-16i32o2i-1d": return iohw_16i32o2i_1d if layout == "nhwc-2048c-2d": diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index e81c24694ef9..735b3f2b94b5 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -253,8 +253,14 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): if current_layout == "nc": n, c = arr_np.shape + if new_layout in ["nc-2048c-1d"]: + return arr_np.reshape([n, c // 2048, 2048]) + if new_layout in ["nc-2048c-2d"]: + return arr_np.reshape([n, c // 2048, 2048]) if new_layout in ["nc-1024c-2d"]: return arr_np.reshape([n, c // 1024, 1024]) + if new_layout in ["nc-1024c-1d"]: + return arr_np.reshape([n, c // 1024, 1024]) if new_layout in ["nc-512c-2d"]: return arr_np.reshape([n, c // 512, 512]) if new_layout in ["nc-2048c-2d"]: diff --git a/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py b/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py new file mode 100644 index 000000000000..e616c384fb40 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/slice_op/test_dense_slice.py @@ -0,0 +1,282 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
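# --- Editor's aside (not part of the patch): what the nc_2048c index maps
# added above describe. A logical [n, c] tensor is reblocked so that groups of
# 2048 channels form the innermost contiguous axis; in the "2d" variants,
# te.AXIS_SEPARATOR marks where the physical allocation is split. A quick
# NumPy check of the equivalent reshape used by transform_numpy:
import numpy as np

arr = np.arange(2 * 4096).reshape(2, 4096)        # logical "nc" layout
blocked = arr.reshape(2, 4096 // 2048, 2048)      # "nc-2048c" layout
assert (blocked[:, 0, :] == arr[:, :2048]).all()  # block 0 holds the first 2048 channels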
+ +import pytest +import numpy as np + +from tvm import te, topi + +import tvm.testing +from tvm.topi import testing +from tvm.contrib.hexagon.build import HexagonLauncher +from tvm.contrib.hexagon.session import Session +import tvm.topi.hexagon.qnn as qnn +import tvm.topi.hexagon.slice_ops as sl +from ...infrastructure import transform_numpy, quantize_np +from tvm.contrib.hexagon import allocate_hexagon_array + + +@tvm.testing.fixture +def input_np(input_shape, dtype): + if "int" in dtype: + data = np.random.random(input_shape).astype("float32") + elif "float" in dtype: + data = np.random.random(input_shape).astype(dtype) + return data + + +@tvm.testing.fixture +def weight_np(weight_shape, dtype): + if "int" in dtype: + weight = np.random.random(weight_shape).astype("float32") + elif "float" in dtype: + weight = np.random.random(weight_shape).astype(dtype) + return weight + + +@tvm.testing.fixture +def input_quant(input_np, dtype): + if "float" in dtype: + return None + quant, scale, zp = quantize_np(input_np, dtype) + return {"zero": zp, "scale": scale, "data": quant} + + +@tvm.testing.fixture +def weight_quant(weight_np, dtype): + if "float" in dtype: + return None + quant, scale, zp = quantize_np(weight_np, "int8") + return {"zero": zp, "scale": scale, "data": quant} + + +@tvm.testing.fixture +def bias_np(bias_shape, bias, dtype): + if bias: + if "int" in dtype: + data = np.random.randint(-128, 127, size=bias_shape).astype("int32") + elif "float" in dtype: + data = np.random.random(bias_shape).astype(dtype) + return data + else: + return None + + +@tvm.testing.fixture +def quant_arr(input_quant, weight_quant): + if input_quant is None: + return None + arr = np.empty((6,), dtype="float32") + arr[0] = input_quant["zero"] + arr[1] = input_quant["scale"] + arr[2] = weight_quant["zero"] + arr[3] = weight_quant["scale"] + return arr + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, layout): + return transform_numpy(expected_output_np, "nc", layout) + + +@tvm.testing.fixture +def transformed_input_np(input_np, layout): + return transform_numpy(input_np, "nc", layout) + + +@tvm.testing.fixture +def transformed_input_quant(input_quant, layout): + if input_quant is None: + return None + input_quant["data"] = transform_numpy(input_quant["data"], "nc", layout) + return input_quant + + +class TestDenseSlice: + (input_shape, output_shape, layout, bias, dtype,) = tvm.testing.parameters( + ( # Float 16 + [1, 1024], + [1, 1024], + "nc-1024c-2d", + False, + "float16", + ), + ( + [1, 2048], + [1, 2048], + "nc-1024c-2d", + True, + "float16", + ), + ( # Uint 8 + [1, 2048], + [1, 2048], + "nc-2048c-2d", + False, + "uint8", + ), + ( + [1, 4096], + [1, 4096], + "nc-2048c-2d", + True, + "uint8", + ), + ) + + @tvm.testing.fixture + def expected_output_np(self, input_np, weight_np, bias_np, bias): + ref_np = tvm.topi.testing.dense( + np.reshape(input_np, (input_np.shape[0], input_np.shape[-1])), + weight_np.T, # Function expects [in_dim, out_dim] + bias_np, + use_bias=bias, + out_dtype="float32" if "int" in str(input_np.dtype) else input_np.dtype, + ) + return ref_np + + @tvm.testing.fixture + def weight_shape(self, input_shape, output_shape): + return (output_shape[-1], input_shape[-1]) + + @tvm.testing.fixture + def bias_shape(self, output_shape): + return (output_shape[-1],) + + @tvm.testing.requires_hexagon + def test_dense_slice( + self, + dtype, + bias_np, + layout, + output_shape, + input_shape, + input_np, + input_quant, + transformed_input_np, + transformed_input_quant, + 
weight_np,
+        # transformed_weight_np,
+        weight_quant,
+        # transformed_weight_quant,
+        transformed_expected_output_np,
+        expected_output_np,
+        quant_arr,
+        hexagon_session: Session,
+    ):
+
+        target_hexagon = tvm.target.hexagon("v69")
+        A = te.placeholder(input_shape, name="A", dtype=dtype)
+        W = te.placeholder(
+            (output_shape[-1], input_shape[-1]),
+            name="W",
+            dtype="int8" if dtype == "uint8" else dtype,
+        )
+        args = [A, W]
+        tensors = [A, W]
+
+        # If quantized, append the quantization params
+        if "int" in dtype:
+            args.append(quant_arr[0].astype("int32"))
+            args.append(quant_arr[1])
+            args.append(quant_arr[2].astype("int32"))
+            args.append(quant_arr[3])
+
+        if bias_np is not None:
+            B = te.placeholder((output_shape[-1],), name="B", dtype=str(bias_np.dtype))
+            args.append(B)
+            tensors.append(B)
+        else:
+            B = None
+
+        # Different compute and schedule for quant and float
+        if "float" in dtype:
+            M = sl.dense_compute(*args)
+            tir_schedule = sl.dense_schedule([M], tensors, layout, layout)
+        elif "int" in dtype:
+            M = qnn.qdense_compute(*args, bias=B)
+            tir_schedule = qnn.qdense_schedule([M], tensors, layout, layout)
+        else:
+            print("Unsupported dtype {}".format(dtype))
+            exit(-1)
+
+        sch = tir_schedule.mod
+
+        input_axis_separator = [2]
+        output_axis_separator = [2]
+
+        with tvm.transform.PassContext(opt_level=3):
+            func = tvm.build(
+                sch,
+                args,
+                target=tvm.target.Target(target_hexagon, host=target_hexagon),
+                name="dense",
+            )
+            func.save("dense.s" if bias_np is None else "dense_bias.s")
+
+        input_arr = allocate_hexagon_array(
+            hexagon_session.device,
+            data=transformed_input_np if "float" in dtype else transformed_input_quant["data"],
+            axis_separators=input_axis_separator,
+            mem_scope="global.vtcm",
+        )
+        weight_arr = allocate_hexagon_array(
+            hexagon_session.device,
+            data=weight_np if "float" in dtype else weight_quant["data"],
+            axis_separators=None,
+            mem_scope="global",
+        )
+        output_arr = allocate_hexagon_array(
+            hexagon_session.device,
+            transformed_expected_output_np.shape,
+            "float32" if "int" in dtype else dtype,
+            axis_separators=output_axis_separator,
+            mem_scope="global.vtcm",
+        )
+        arrs = [input_arr, weight_arr]
+
+        if bias_np is not None:
+            bias_arr = allocate_hexagon_array(
+                hexagon_session.device,
+                data=bias_np,
+                axis_separators=None,
+                mem_scope="global.vtcm",
+            )
+            arrs.append(bias_arr)
+
+        arrs.append(output_arr)
+
+        mod = hexagon_session.load_module(func)
+        mod(*arrs)
+
+        # Reshape for comparison
+        b, c = output_shape
+        if layout == "nc-1024c-2d":
+            output_np = output_arr.numpy().reshape([b, c // 1024, 1024])
+        elif layout == "nc-2048c-2d":
+            output_np = output_arr.numpy().reshape([b, c // 2048, 2048])
+        else:
+            raise RuntimeError(f"Unexpected layout '{layout}'")
+
+        if "int" in dtype:
+            np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-2, atol=0)
+        elif "float" in dtype:
+            np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-1, atol=0)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

From 1e35674b98b07a879db770f70a2c0e57970097df Mon Sep 17 00:00:00 2001
From: Andrey Malyshev
Date: Tue, 24 Jan 2023 23:06:30 +0200
Subject: [PATCH 219/286] [RUNTIME] Fix determination of big/little cores
 domains (#13832)

In the case of three or more frequency domains, we now start to treat the
non-little cores as big ones and offload work to them as well by default.
This fixes modern schemes such as the 1-3-4 layout (1 huge, 3 big, and 4
little cores) and causes TVM to use the huge and big cores for inference.
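To make the classification concrete, here is a rough Python sketch of the
approach (an illustration only; the actual implementation is the C++ change to
threading_backend.cc below, and the helper name here is hypothetical):

import pathlib

def classify_cores(num_cpus):
    # Use cpuinfo_max_freq: unlike scaling_max_freq, it is not changed
    # dynamically by governor policy, so it identifies cores persistently.
    freqs = {}
    for i in range(num_cpus):
        path = pathlib.Path(f"/sys/devices/system/cpu/cpu{i}/cpufreq/cpuinfo_max_freq")
        freqs[i] = int(path.read_text()) if path.exists() else 0
    little_freq = min(freqs.values())
    little = [i for i, f in freqs.items() if f == little_freq]
    # Every core above the lowest-frequency domain counts as "big", so the
    # huge and big domains are both used for inference.
    big = [i for i, f in freqs.items() if f != little_freq]
    return big, little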
---
 src/runtime/threading_backend.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 9f7f2cd8d98a..b6e12a25cca8 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -306,7 +306,11 @@ class ThreadGroup::Impl {
     int64_t cur_freq = 0;
 #if defined(__linux__) || defined(__ANDROID__)
     std::ostringstream filepath;
-    filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/scaling_max_freq";
+    // according to https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt
+    // it's better to use cpuinfo_max_freq instead of scaling_max_freq for our
+    // purposes, since scaling values can be changed dynamically according to "policy limits"
+    // while we are looking for a persistent definition of the cores
+    filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/cpuinfo_max_freq";
     std::ifstream ifs(filepath.str());
     if (!ifs.fail()) {
       if (!(ifs >> cur_freq)) {
@@ -335,7 +339,8 @@ class ThreadGroup::Impl {
       }
     }
     if (big_count_ + little_count_ != static_cast<int>(sorted_order_.size())) {
-      LOG(WARNING) << "more than two frequencies detected!";
+      big_count_ = static_cast<int>(sorted_order_.size()) - little_count_;
+      LOG(WARNING) << "more than two frequencies detected! Forced big_count_ to " << big_count_;
     }
   }

From ea2cdc889198039be1dca58bec048413d1bb8a93 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Tue, 24 Jan 2023 15:29:42 -0800
Subject: [PATCH 220/286] [docker][microTVM]Fix Zephyr 0.15.2 SDK installation
 and separate Zephyr python environment (#13829)

This PR fixes the SDK installation and separates the SDK installation from
the ZephyrProject installation. It also moves the Zephyr Python environment
to Python 3.8, which is the required version and is kept separate from the
TVM virtual env.
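As a rough illustration of the release-based interpreter selection added to
ubuntu_install_zephyr.sh below (a hypothetical Python rendering of the shell
logic, not part of the patch):

import subprocess

def zephyr_python():
    # Mirror the codename check in the install script: Zephyr's tooling is
    # pinned to a known-good Python, kept separate from TVM's environment.
    release = subprocess.check_output(["lsb_release", "-sc"], text=True).strip()
    if release == "bionic":
        return "python3"
    if release == "focal":
        return "python3.8"
    raise RuntimeError(f"Don't know which version of python to use for Zephyr on '{release}'.")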
--- docker/Dockerfile.ci_cortexm | 7 +++-- docker/Dockerfile.ci_riscv | 8 ++++-- docker/install/ubuntu_install_zephyr.sh | 30 +++++++++++++++------ docker/install/ubuntu_install_zephyr_sdk.sh | 13 ++++++--- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index 50062d9dea35..a6ea27cf4181 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -76,12 +76,15 @@ COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh RUN bash /install/ubuntu_install_sccache.sh ENV PATH /opt/sccache:$PATH -# Zephyr SDK deps +# Zephyr Project COPY install/ubuntu_install_zephyr.sh /install/ubuntu_install_zephyr.sh COPY install/ubuntu_init_zephyr_project.sh /install/ubuntu_init_zephyr_project.sh -COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr + +#Zephyr SDK +COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh +RUN bash /install/ubuntu_install_zephyr_sdk.sh /opt/zephyr-sdk ENV PATH /opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH # NRF diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index a640e996c7be..c115df423fc3 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -80,13 +80,17 @@ COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh RUN bash /install/ubuntu_install_sccache.sh ENV PATH /opt/sccache:$PATH -# Zephyr SDK deps +# Zephyr Project COPY install/ubuntu_install_zephyr.sh /install/ubuntu_install_zephyr.sh COPY install/ubuntu_init_zephyr_project.sh /install/ubuntu_init_zephyr_project.sh -COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr +#Zephyr SDK +COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh +RUN bash /install/ubuntu_install_zephyr_sdk.sh /opt/zephyr-sdk +ENV PATH /opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH + # Download RISC-V gcc toolchain (linux) COPY install/ubuntu_download_xuantie_gcc_linux.sh /install/ubuntu_download_xuantie_gcc_linux.sh RUN bash /install/ubuntu_download_xuantie_gcc_linux.sh /opt/riscv/riscv64-unknown-linux-gnu diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index 6eaa5e5aa16c..8c36171f8d5a 100755 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -38,7 +38,27 @@ sudo apt-get update sudo apt-install-and-clear -y cmake -pip3 install west +# Find release version +apt-get update +apt-install-and-clear -y \ + lsb-core + +release=$(lsb_release -sc) +if [ "${release}" == "bionic" ]; then + python_cmd="python3" +elif [ "${release}" == "focal" ]; then + python_cmd="python3.8" +else + echo "Don't know which version of python to use for Zephyr." + exit 2 +fi + +# Current Zephyr version is compatible with python3.8. +# We use a different python env for Zephyr to test the +# real world scenario where TVM and Zephyr could be in different +# python environments. +# TODO: use virtual env for Zephyr. 
+$python_cmd -m pip install west

 # Init ZephyrProject
 ZEPHYR_PROJECT_PATH=/opt/zephyrproject
@@ -58,10 +78,4 @@
 chmod -R o+w ${ZEPHYR_PROJECT_PATH}
 mkdir zephyr/.cache
 chmod o+rwx zephyr/.cache

-pip3 install -r /opt/zephyrproject/zephyr/scripts/requirements.txt
-
-# the requirements above overwrite junintparser with an older version, but it is not
-# used so overwrite it again with the correct version
-pip3 install junitparser==2.4.2
-
-bash /install/ubuntu_install_zephyr_sdk.sh /opt/zephyr-sdk
+$python_cmd -m pip install -r /opt/zephyrproject/zephyr/scripts/requirements.txt
diff --git a/docker/install/ubuntu_install_zephyr_sdk.sh b/docker/install/ubuntu_install_zephyr_sdk.sh
index 228baf732120..2c1e926e68da 100755
--- a/docker/install/ubuntu_install_zephyr_sdk.sh
+++ b/docker/install/ubuntu_install_zephyr_sdk.sh
@@ -43,9 +43,14 @@ INSTALLATION_PATH=$1
 shift

 ZEPHYR_SDK_FILE_SHA=8e3572fbca9f9ba18a4436c00d680af34a85e239f7fe66c7988da85571a0d23d
+ZEPHYR_SDK_FILE_NAME=zephyr-sdk-0.15.2_linux-x86_64.tar.gz
 wget https://github.com/zephyrproject-rtos/sdk-ng/releases/download/v0.15.2/zephyr-sdk-0.15.2_linux-x86_64.tar.gz
-echo "$ZEPHYR_SDK_FILE_SHA zephyr-sdk-0.15.2_linux-x86_64.tar.gz" | sha256sum --check
+echo "$ZEPHYR_SDK_FILE_SHA ${ZEPHYR_SDK_FILE_NAME}" | sha256sum --check

-tar xvf zephyr-sdk-0.15.2_linux-x86_64.tar.gz
-mv zephyr-sdk-0.15.2 zephyr-sdk
-rm zephyr-sdk-0.15.2_linux-x86_64.tar.gz
+mkdir ${INSTALLATION_PATH}
+tar -xvf ${ZEPHYR_SDK_FILE_NAME} -C "${INSTALLATION_PATH}" --strip-components=1
+rm ${ZEPHYR_SDK_FILE_NAME}
+
+# Setup SDK
+cd ${INSTALLATION_PATH}
+./setup.sh -h

From 72189865bd211bd3f52907bd7fb3fa89bf9cc125 Mon Sep 17 00:00:00 2001
From: Andrey Malyshev
Date: Wed, 25 Jan 2023 03:21:44 +0200
Subject: [PATCH 221/286] [Adreno] Optimize reduction schedule (#13781)

* [Adreno] Optimize reduction schedule

1. Extend reduction through rfactor for reductions over large numbers of
   elements; otherwise, use the previous straightforward approach.
2. 
Inject reduction post-ops into the main reduction kernel

* Address PR comment
---
 python/tvm/topi/adreno/reduction.py           | 103 ++++++++++----
 python/tvm/topi/cuda/reduction.py             |   6 +-
 .../transforms/annotate_texture_storage.cc    |   2 +-
 .../opencl_texture/test_reduction_texture.py  | 126 ++++++++++++++++++
 4 files changed, 211 insertions(+), 26 deletions(-)

diff --git a/python/tvm/topi/adreno/reduction.py b/python/tvm/topi/adreno/reduction.py
index b95832c60f2a..4ff519fd7c1c 100644
--- a/python/tvm/topi/adreno/reduction.py
+++ b/python/tvm/topi/adreno/reduction.py
@@ -25,45 +25,102 @@


 def _schedule_reduce_adreno(op, sch, is_idx_reduce=False):
-    if is_idx_reduce:
-        real_output = op.output(0)
+    sch_output = sch.outputs[0].output(0)
+    use_rfactor = False
+    if not is_idx_reduce:
+        rdomain = 1
+        whole_rop_output = op.output(0)
+        for axis in sch[whole_rop_output].op.reduce_axis:
+            rdomain = rdomain * axis.dom.extent
+        if rdomain > 50:
+            use_rfactor = True
+            # shared gives better perf, but works only for the rfactor flow
+            scope = "shared"
+        else:
+            # in case of direct scheduling, shared fails to compile
+            scope = "local"
+        if op in sch.outputs:
+            whole_rop_output = sch.cache_write(sch_output, scope)
+        else:
+            # no change to the whole_rop_output def, but we need to set the proper scope
+            sch[whole_rop_output].set_scope(scope)
+    else:
         temp_idx_input = op.input_tensors[0].op.output(0)
         temp_val_input = op.input_tensors[0].op.output(1)
-    else:
-        real_output = op.output(0)
-    shape = get_const_tuple(real_output.shape)
+        sch[temp_idx_input].set_scope("local")
+        sch[temp_val_input].set_scope("local")
+
+    shape = get_const_tuple(sch_output.shape)
     latest4 = shape[-1] == 4
     div4 = numpy.prod(shape) % 4 == 0

     # Fuse and split the axis
     if latest4:
-        fused_outer = sch[real_output].fuse(
-            *[sch[real_output].op.axis[i] for i in range(len(sch[real_output].op.axis) - 1)]
+        fused_outer = sch[sch_output].fuse(
+            *[sch[sch_output].op.axis[i] for i in range(len(sch[sch_output].op.axis) - 1)]
         )
     else:
-        fused_outer = sch[real_output].fuse(
-            *[sch[real_output].op.axis[i] for i in range(len(sch[real_output].op.axis))]
+        fused_outer = sch[sch_output].fuse(
+            *[sch[sch_output].op.axis[i] for i in range(len(sch[sch_output].op.axis))]
         )

     ftc = numpy.prod(shape)
     a = fused_outer
-    if latest4:
-        sch[real_output].vectorize(sch[real_output].op.axis[-1])
-    elif div4 and not is_idx_reduce:
-        a, b = sch[real_output].split(fused_outer, factor=4)
-        sch[real_output].vectorize(b)
-        ftc = ftc / 4
-    num_thread = get_div(ftc, 128)
+    if not is_idx_reduce:
+        if use_rfactor:
+            # the values below were selected empirically, assuming that each thread should
+            # have some work (currently 25-49 elements) and that the number of threads should
+            # not exceed a threshold, chosen as 256 after performance experiments on Adreno 660
+            max_threads = rdomain.value // 25 if rdomain > 25 else 1
+            max_threads = 256 if max_threads > 256 else max_threads
+            num_thread = get_div(rdomain, max_threads)

-    bx, outer_in = sch[real_output].split(a, factor=num_thread)
+            fused_reduce = sch[whole_rop_output].fuse(*sch[whole_rop_output].op.reduce_axis)
+            thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
+            _, ki = sch[whole_rop_output].split(fused_reduce, factor=num_thread)
+            data_out_rf = sch.rfactor(whole_rop_output, ki)
+            sch[data_out_rf].compute_at(
+                sch[whole_rop_output], sch[whole_rop_output].op.reduce_axis[0]
+            )
+            sch[whole_rop_output].bind(sch[whole_rop_output].op.reduce_axis[0], thread_y)

-    sch[real_output].bind(bx, te.thread_axis("blockIdx.x"))
-    
sch[real_output].bind(outer_in, te.thread_axis("threadIdx.y")) - if is_idx_reduce: - sch[temp_idx_input].compute_at(sch[real_output], outer_in) - sch[temp_val_input].compute_at(sch[real_output], outer_in) + if div4: + if latest4: + b = sch[sch_output].op.axis[-1] + else: + a, b = sch[sch_output].split(fused_outer, factor=4) + sch[sch_output].vectorize(b) + if not use_rfactor: + if is_idx_reduce: + sch[temp_idx_input].compute_at(sch[sch_output], b) + sch[temp_val_input].compute_at(sch[sch_output], b) + else: + sch[whole_rop_output].compute_at(sch[sch_output], b) + + if not use_rfactor: + num_thread = get_div(ftc, 128) + bx, outer_in = sch[sch_output].split(a, factor=num_thread) + sch[sch_output].bind(bx, te.thread_axis("blockIdx.x")) + sch[sch_output].bind(outer_in, te.thread_axis("threadIdx.x")) + + if not div4: + if is_idx_reduce: + sch[temp_idx_input].compute_at(sch[sch_output], outer_in) + sch[temp_val_input].compute_at(sch[sch_output], outer_in) + else: + sch[whole_rop_output].compute_at(sch[sch_output], outer_in) + else: + sch[sch_output].bind(a, te.thread_axis("blockIdx.x")) + if not div4 or use_rfactor: + if is_idx_reduce: + sch[temp_idx_input].compute_at(sch[sch_output], a) + sch[temp_val_input].compute_at(sch[sch_output], a) + else: + sch[whole_rop_output].compute_at(sch[sch_output], a) def schedule_reduce(outs): - return schedule_reduce_impl(outs, _schedule_reduce_adreno, schedule_injective_from_existing) + return schedule_reduce_impl( + outs, _schedule_reduce_adreno, schedule_injective_from_existing, True + ) diff --git a/python/tvm/topi/cuda/reduction.py b/python/tvm/topi/cuda/reduction.py index 318d72b1e5d0..e4234a9cce3f 100644 --- a/python/tvm/topi/cuda/reduction.py +++ b/python/tvm/topi/cuda/reduction.py @@ -116,7 +116,9 @@ def is_scheduled(stage): return True -def schedule_reduce_impl(outs, schedule_reduce_stage, schedule_injective_stage): +def schedule_reduce_impl( + outs, schedule_reduce_stage, schedule_injective_stage, inline_postops=False +): """Schedule for inject->reduce->bcast ops. Traverse over the stages in the schedule and schedule separate stages depending on the position of the stage. 
Injecteve post-ops of reduction will be scheduled using @@ -160,7 +162,7 @@ def traverse_before_reduce(operator): def traverse_after_reduce(operator): """Internal traverse function""" if tag.is_broadcast(operator.tag): - if operator not in scheduled_ops: + if operator not in scheduled_ops and not inline_postops: schedule_injective_stage(sch, operator.output(0)) for tensor in operator.input_tensors: if tensor.op not in scheduled_ops: diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index 39f065ea8c11..d3748449adaa 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -404,7 +404,7 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { } else if (const OpNode* opnode = call->op.as()) { auto fpattern = Op::GetAttrMap("TOpPattern"); auto pattern = fpattern[GetRef(opnode)]; - if (pattern <= kInjective) { + if (pattern <= kCommReduce) { if (const auto* ttype = call->checked_type().as()) { if (ttype->shape.size() == 5) { supports_texture_storage = true; diff --git a/tests/python/relay/opencl_texture/test_reduction_texture.py b/tests/python/relay/opencl_texture/test_reduction_texture.py index 9dc8a8992d27..cc2dbff173e0 100644 --- a/tests/python/relay/opencl_texture/test_reduction_texture.py +++ b/tests/python/relay/opencl_texture/test_reduction_texture.py @@ -51,5 +51,131 @@ def test_argmax(remote, target, dtype): build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_reduction_max(remote, target, dtype): + # NCHW + input_shape = (1, 3, 720, 1280) + A = relay.var("data", shape=input_shape, dtype=dtype) + argmax = relay.op.max(A, axis=[1]) + mod = relay.Function([A], argmax) + + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_mean_nd4(remote, target, dtype): + # NCHW + input_shape = (1, 3, 729, 729) + A = relay.var("data", shape=input_shape, dtype=dtype) + mean = relay.mean(A, axis=1, keepdims=True) + mod = relay.Function([A], mean) + + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_argmax_nd4(remote, target, dtype): + # NCHW + input_shape = (1, 3, 729, 729) + A = relay.var("data", shape=input_shape, dtype=dtype) + argmax = relay.op.argmax(A, axis=[1]) + mod = relay.Function([A], argmax) + + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_reduction_max_nd4(remote, target, dtype): + # NCHW + input_shape = (1, 3, 729, 729) + A = relay.var("data", shape=input_shape, dtype=dtype) + argmax = relay.op.max(A, axis=[1]) + mod = relay.Function([A], argmax) + + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_mean_b4(remote, target, dtype): + # NCHW + input_shape = (1, 3, 720, 320, 4) + A = relay.var("data", shape=input_shape, dtype=dtype) + mean = relay.mean(A, axis=1, keepdims=True) + mod = relay.Function([A], mean) + + build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target) 
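# --- Editor's aside (not part of the patch): a minimal, standalone sketch of
# the te.rfactor strategy this PR applies when the reduction domain is large.
# The reduction axis is split, rfactor creates a stage of partial sums that
# run on parallel threads, and the final stage combines them; the factor of
# 16 here is illustrative only.
import tvm
from tvm import te

n = te.var("n")
A = te.placeholder((n,), name="A", dtype="float32")
k = te.reduce_axis((0, n), name="k")
B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name="B")
s = te.create_schedule(B.op)
ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
BF = s.rfactor(B, ki)  # new stage computing 16 partial sums
s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.y"))
s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
print(tvm.lower(s, [A, B], simple_mode=True))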
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_argmax_b4(remote, target, dtype):
+    # NCHW
+    input_shape = (1, 3, 720, 320, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    argmax = relay.op.argmax(A, axis=[1])
+    mod = relay.Function([A], argmax)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_reduction_max_b4(remote, target, dtype):
+    # NCHW
+    input_shape = (1, 3, 720, 320, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    argmax = relay.op.max(A, axis=[1])
+    mod = relay.Function([A], argmax)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_mean_global_pooling(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c global pooling with big spatial values
+    """
+    input_shape = (1, 160, 160, 32)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    mean = relay.mean(A, axis=[1, 2], keepdims=True)
+    mod = relay.Function([A], mean)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_mean_global_pooling_block4(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c global pooling with big spatial values
+    """
+    input_shape = (1, 160, 160, 8, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    mean = relay.mean(A, axis=[1, 2], keepdims=True)
+    mod = relay.Function([A], mean)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
+@tvm.testing.requires_opencl
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_max_global_pooling_block4(remote, target, dtype):
+    """
+    Use case of blocked NCHW4c global pooling with big spatial values
+    """
+    input_shape = (1, 160, 160, 8, 4)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    mean = relay.max(A, axis=[1, 2], keepdims=True)
+    mod = relay.Function([A], mean)
+
+    build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": dtype}, target)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From a9831a2298eb3e6a07c9f871b1e9f20aa3300496 Mon Sep 17 00:00:00 2001
From: Junru Shao
Date: Tue, 24 Jan 2023 21:24:20 -0800
Subject: [PATCH 222/286] [TVMScript] Default to T.Buffer rather than
 T.buffer_decl (#13838)

TVMScript parser supports both `T.Buffer` and `T.buffer_decl`
interchangeably, which share the same semantics in TIR AST. However,
`T.buffer_decl` is usually confused with `T.decl_buffer`. To clarify the
semantics, we decide to print `T.Buffer` instead.

Note that this PR is backward compatible with the previous behavior, i.e. the
parser still parses TVMScript with `T.buffer_decl`, and the only difference
is that the printer now produces `T.Buffer` instead by default.
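A minimal sketch of the change in behavior (illustrative only; the function
name is made up):

from tvm.script import tir as T

@T.prim_func
def func():
    # `T.buffer_decl` still parses after this patch...
    A = T.buffer_decl([128], "float32")
    T.evaluate(A[0])

# ...but the script printer now renders the declaration as
# `A = T.Buffer([128], "float32")` by default.
print(func.script())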
---
 python/tvm/script/ir_builder/tir/__init__.py  |   1 +
 python/tvm/script/parser/tir/entry.py         |   4 +-
 src/script/printer/tir/buffer.cc              |   3 +-
 src/script/printer/tir/ir.cc                  |   8 +-
 src/script/printer/utils.h                    |   4 +
 .../test_copy_compute_reordering.py           | 146 ++++++------
 .../test_ethosu/test_encode_constants.py      | 130 +++++------
 .../test_ethosu/test_hoist_allocates.py       |  72 +++---
 .../test_ethosu/test_merge_constants.py       | 216 +++++++++---------
 .../test_ethosu/test_remove_concatenates.py   |  24 +-
 .../test_ethosu/test_replace_conv2d.py        | 130 +++++------
 .../contrib/test_ethosu/test_replace_copy.py  |  20 +-
 .../contrib/test_ethosu/test_scheduler.py     |  16 +-
 .../test_ethosu/test_tir_to_cs_translator.py  |  64 +++---
 .../relay/aot/test_pass_aot_lower_main.py     |   4 +-
 tests/python/unittest/test_lower_build.py     |  12 +-
 tests/python/unittest/test_tir_renew_defs.py  |   2 +-
 .../test_tir_schedule_cache_read_write.py     |   8 +-
 .../test_tir_transform_common_subexpr_elim.py |   8 +-
 .../test_tir_transform_extract_constants.py   |   6 +-
 .../test_tir_transform_flatten_buffer.py      |  52 ++---
 ...est_tir_transform_inject_rolling_buffer.py |   4 +-
 ...est_tir_transform_inject_virtual_thread.py |   8 +-
 .../test_tir_transform_loop_partition.py      |  32 +--
 ...tir_transform_renormalize_split_pattern.py |  18 +-
 .../test_tir_transform_storage_rewrite.py     |  20 +-
 .../test_tir_transform_thread_sync.py         |   2 +-
 ...orm_convert_pool_allocations_to_offsets.py |  48 ++--
 .../unittest/test_tvmscript_ir_builder_tir.py |  18 +-
 .../unittest/test_tvmscript_printer_tir.py    |  14 +-
 .../unittest/test_tvmscript_roundtrip.py      | 132 +++++------
 .../unittest/test_tvmscript_syntax_sugar.py   |  12 -
 32 files changed, 607 insertions(+), 631 deletions(-)

diff --git a/python/tvm/script/ir_builder/tir/__init__.py b/python/tvm/script/ir_builder/tir/__init__.py
index 0a71af4db7e6..563ac56f7b10 100644
--- a/python/tvm/script/ir_builder/tir/__init__.py
+++ b/python/tvm/script/ir_builder/tir/__init__.py
@@ -17,3 +17,4 @@
 """Package tvm.script.ir_builder.tir"""
 from .ir import *  # pylint: disable=wildcard-import,redefined-builtin
 from .ir import boolean as bool  # pylint: disable=redefined-builtin
+from .ir import buffer_decl as Buffer
diff --git a/python/tvm/script/parser/tir/entry.py b/python/tvm/script/parser/tir/entry.py
index a5c134a8594c..e7ec7cf886d4 100644
--- a/python/tvm/script/parser/tir/entry.py
+++ b/python/tvm/script/parser/tir/entry.py
@@ -55,7 +55,7 @@ class BufferProxy:
     def __call__(
         self,
         shape,
-        dtype=None,
+        dtype="float32",
         data=None,
         strides=None,
         elem_offset=None,
@@ -65,8 +65,6 @@ def __call__(
         buffer_type="",
         axis_separators=None,
     ) -> Buffer:
-        if dtype is None:
-            raise ValueError("Data type must be specified when constructing buffer")
         return buffer_decl(
             shape,
             dtype=dtype,
diff --git a/src/script/printer/tir/buffer.cc b/src/script/printer/tir/buffer.cc
index b4429dc9afc9..19f3dc7ef577 100644
--- a/src/script/printer/tir/buffer.cc
+++ b/src/script/printer/tir/buffer.cc
@@ -209,8 +209,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)  //
   if (!d->IsVarDefined(buffer)) {
     if (Optional<Frame> opt_f = FindLowestVarDef(buffer, d)) {
       ExprDoc lhs = DefineBuffer(buffer, opt_f.value(), d);
-      ExprDoc rhs = BufferDecl(buffer, "buffer_decl",  // TODO(@junrushao): name confusing
-                               {}, p, opt_f.value(), d);
+      ExprDoc rhs = BufferDecl(buffer, "Buffer", {}, p, opt_f.value(), d);
       opt_f.value()->stmts.push_back(AssignDoc(lhs, rhs, NullOpt));
     }
   }
diff --git a/src/script/printer/tir/ir.cc b/src/script/printer/tir/ir.cc
index 76d3680fec81..ce10ff6816d7 100644
--- a/src/script/printer/tir/ir.cc
+++ b/src/script/printer/tir/ir.cc
@@ -34,8 +34,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     } else if (dtype == DataType::Bool()) {
       return LiteralDoc::Boolean(imm->value, imm_p->Attr("value"));
     } else {
-      return TIR(d, runtime::DLDataType2String(dtype))  //
-          ->Call({LiteralDoc::Int(imm->value, imm_p->Attr("value"))});
+      return TIR(d, DType2Str(dtype))->Call({LiteralDoc::Int(imm->value, imm_p->Attr("value"))});
     }
   });
@@ -45,7 +44,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     if (dtype == d->cfg->float_dtype) {
       return LiteralDoc::Float(imm->value, imm_p->Attr("value"));
     } else {
-      return TIR(d, runtime::DLDataType2String(dtype))  //
+      return TIR(d, DType2Str(dtype))
           ->Call({LiteralDoc::Float(imm->value, imm_p->Attr("value"))});
     }
   });
@@ -61,8 +60,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<PrimType>("", [](PrimType ty, ObjectPath p, IRDocsifier d) -> Doc {
-      std::string dtype = ty->dtype.is_void() ? "void" : runtime::DLDataType2String(ty->dtype);
-      return TIR(d, dtype);
+      return TIR(d, DType2Str(ty->dtype));
     });
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h
index 5161f1f9a268..cb20eb363ddd 100644
--- a/src/script/printer/utils.h
+++ b/src/script/printer/utils.h
@@ -65,6 +65,10 @@ inline std::string Docsify(const ObjectRef& obj, const IRDocsifier& d, const Fra
   return DocToPythonScript(StmtBlockDoc(f->stmts), cfg);
 }
 
+inline std::string DType2Str(const runtime::DataType& dtype) {
+  return dtype.is_void() ? "void" : runtime::DLDataType2String(dtype);
+}
+
 }  // namespace printer
 }  // namespace script
 }  // namespace tvm
diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
index 586b8b380e22..02b5f9f7f122 100644
--- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
+++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
@@ -29,16 +29,16 @@ class AllOperatorsWithWeights:
     def main() -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer1 = T.buffer_decl([8192], "int8")
-        buffer2 = T.buffer_decl([128], "uint8")
-        buffer3 = T.buffer_decl([32], "uint8")
-        buffer4 = T.buffer_decl([112], "uint8")
-        buffer5 = T.buffer_decl([32], "uint8")
-        buffer6 = T.buffer_decl([112], "uint8")
-        buffer7 = T.buffer_decl([32], "uint8")
-        buffer8 = T.buffer_decl([112], "uint8")
-        buffer9 = T.buffer_decl([32], "uint8")
-        buffer10 = T.buffer_decl([2048], "int8")
+        buffer1 = T.Buffer([8192], "int8")
+        buffer2 = T.Buffer([128], "uint8")
+        buffer3 = T.Buffer([32], "uint8")
+        buffer4 = T.Buffer([112], "uint8")
+        buffer5 = T.Buffer([32], "uint8")
+        buffer6 = T.Buffer([112], "uint8")
+        buffer7 = T.Buffer([32], "uint8")
+        buffer8 = T.Buffer([112], "uint8")
+        buffer9 = T.Buffer([32], "uint8")
+        buffer10 = T.Buffer([2048], "int8")
         # body
         p1 = T.decl_buffer([128], "uint8")
         p2 = T.decl_buffer([112], "uint8")
@@ -77,16 +77,16 @@ class ReferenceModule:
     def main() -> None:
         # function attr dict
         T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
-        buffer1 = T.buffer_decl([8192], "int8")
-        buffer2 = T.buffer_decl([128], "uint8")
-        buffer3 = T.buffer_decl([32], "uint8")
-        buffer4 = T.buffer_decl([112], "uint8")
-        buffer5 = T.buffer_decl([32], "uint8")
-        buffer6 = T.buffer_decl([112], "uint8")
-        buffer7 = T.buffer_decl([32], "uint8")
-        buffer8 =
T.buffer_decl([112], "uint8") - buffer9 = T.buffer_decl([32], "uint8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer2 = T.Buffer([128], "uint8") + buffer3 = T.Buffer([32], "uint8") + buffer4 = T.Buffer([112], "uint8") + buffer5 = T.Buffer([32], "uint8") + buffer6 = T.Buffer([112], "uint8") + buffer7 = T.Buffer([32], "uint8") + buffer8 = T.Buffer([112], "uint8") + buffer9 = T.Buffer([32], "uint8") + buffer10 = T.Buffer([2048], "int8") # body p1 = T.decl_buffer([128], "uint8") p2 = T.decl_buffer([112], "uint8") @@ -123,16 +123,16 @@ class ReferenceModule: def main() -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer2 = T.buffer_decl([128], "uint8") - buffer3 = T.buffer_decl([32], "uint8") - buffer4 = T.buffer_decl([112], "uint8") - buffer5 = T.buffer_decl([32], "uint8") - buffer6 = T.buffer_decl([112], "uint8") - buffer7 = T.buffer_decl([32], "uint8") - buffer8 = T.buffer_decl([112], "uint8") - buffer9 = T.buffer_decl([32], "uint8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer2 = T.Buffer([128], "uint8") + buffer3 = T.Buffer([32], "uint8") + buffer4 = T.Buffer([112], "uint8") + buffer5 = T.Buffer([32], "uint8") + buffer6 = T.Buffer([112], "uint8") + buffer7 = T.Buffer([32], "uint8") + buffer8 = T.Buffer([112], "uint8") + buffer9 = T.Buffer([32], "uint8") + buffer10 = T.Buffer([2048], "int8") # body p1 = T.decl_buffer([128], "uint8") p2 = T.decl_buffer([112], "uint8") @@ -167,8 +167,8 @@ class AllOperatorsWithoutWeights: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([36], "int8") - buffer2 = T.buffer_decl([9], "int8") + buffer1 = T.Buffer([36], "int8") + buffer2 = T.Buffer([9], "int8") # body p1 = T.decl_buffer([96], "int8") T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -189,11 +189,11 @@ class OperatorsWithAndWithoutWeights: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([97156], "int8") - buffer2 = T.buffer_decl([80], "uint8") - buffer3 = T.buffer_decl([64], "uint8") - buffer4 = T.buffer_decl([96], "uint8") - buffer5 = T.buffer_decl([32], "uint8") + buffer1 = T.Buffer([97156], "int8") + buffer2 = T.Buffer([80], "uint8") + buffer3 = T.Buffer([64], "uint8") + buffer4 = T.Buffer([96], "uint8") + buffer5 = T.Buffer([32], "uint8") # body p1 = T.decl_buffer([390336], "int8") p2 = T.decl_buffer([80], "uint8") @@ -224,11 +224,11 @@ class ReferenceModule: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([97156], "int8") - buffer2 = T.buffer_decl([80], "uint8") - buffer3 = T.buffer_decl([64], "uint8") - buffer4 = T.buffer_decl([96], "uint8") - buffer5 = T.buffer_decl([32], "uint8") + buffer1 = T.Buffer([97156], "int8") + buffer2 = T.Buffer([80], "uint8") + buffer3 = T.Buffer([64], "uint8") + buffer4 = T.Buffer([96], "uint8") + buffer5 = T.Buffer([32], "uint8") # body p1 = T.decl_buffer([390336], "int8") p2 = T.decl_buffer([80], "uint8") 
@@ -257,11 +257,11 @@ class ReferenceModule: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([97156], "int8") - buffer2 = T.buffer_decl([80], "uint8") - buffer3 = T.buffer_decl([64], "uint8") - buffer4 = T.buffer_decl([96], "uint8") - buffer5 = T.buffer_decl([32], "uint8") + buffer1 = T.Buffer([97156], "int8") + buffer2 = T.Buffer([80], "uint8") + buffer3 = T.Buffer([64], "uint8") + buffer4 = T.Buffer([96], "uint8") + buffer5 = T.Buffer([32], "uint8") # body p1 = T.decl_buffer([390336], "int8") p2 = T.decl_buffer([80], "uint8") @@ -289,14 +289,14 @@ class CopyToBufferWithLocalScope: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([64], "uint8") - buffer2 = T.buffer_decl([48], "uint8") - buffer3 = T.buffer_decl([48], "uint8") - buffer4 = T.buffer_decl([256], "uint8") - buffer5 = T.buffer_decl([16], "uint8") - buffer6 = T.buffer_decl([48], "uint8") - buffer7 = T.buffer_decl([256], "uint8") - buffer8 = T.buffer_decl([64], "uint8") + buffer1 = T.Buffer([64], "uint8") + buffer2 = T.Buffer([48], "uint8") + buffer3 = T.Buffer([48], "uint8") + buffer4 = T.Buffer([256], "uint8") + buffer5 = T.Buffer([16], "uint8") + buffer6 = T.Buffer([48], "uint8") + buffer7 = T.Buffer([256], "uint8") + buffer8 = T.Buffer([64], "uint8") # body p1 = T.decl_buffer([48], "uint8") p2 = T.decl_buffer([48], "uint8") @@ -330,14 +330,14 @@ class ReferenceModule: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([64], "uint8") - buffer2 = T.buffer_decl([48], "uint8") - buffer3 = T.buffer_decl([48], "uint8") - buffer4 = T.buffer_decl([256], "uint8") - buffer5 = T.buffer_decl([16], "uint8") - buffer6 = T.buffer_decl([48], "uint8") - buffer7 = T.buffer_decl([256], "uint8") - buffer8 = T.buffer_decl([64], "uint8") + buffer1 = T.Buffer([64], "uint8") + buffer2 = T.Buffer([48], "uint8") + buffer3 = T.Buffer([48], "uint8") + buffer4 = T.Buffer([256], "uint8") + buffer5 = T.Buffer([16], "uint8") + buffer6 = T.Buffer([48], "uint8") + buffer7 = T.Buffer([256], "uint8") + buffer8 = T.Buffer([64], "uint8") # body p1 = T.decl_buffer([48], "uint8") p2 = T.decl_buffer([48], "uint8") @@ -406,11 +406,11 @@ class ReferenceModule: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([97156], "int8") - buffer2 = T.buffer_decl([80], "uint8") - buffer3 = T.buffer_decl([64], "uint8") - buffer4 = T.buffer_decl([96], "uint8") - buffer5 = T.buffer_decl([32], "uint8") + buffer1 = T.Buffer([97156], "int8") + buffer2 = T.Buffer([80], "uint8") + buffer3 = T.Buffer([64], "uint8") + buffer4 = T.Buffer([96], "uint8") + buffer5 = T.Buffer([32], "uint8") # body p1 = T.decl_buffer([390336], "int8") p2 = T.decl_buffer([80], "uint8") @@ -439,11 +439,11 @@ class ReferenceModule: @T.prim_func def main() -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([97156], "int8") - buffer2 = T.buffer_decl([80], "uint8") - buffer3 = T.buffer_decl([64], "uint8") - buffer4 = T.buffer_decl([96], "uint8") - buffer5 = T.buffer_decl([32], "uint8") + buffer1 = T.Buffer([97156], "int8") + buffer2 = T.Buffer([80], "uint8") + buffer3 = T.Buffer([64], "uint8") + buffer4 = T.Buffer([96], "uint8") + buffer5 = 
T.Buffer([32], "uint8") # body p1 = T.decl_buffer([390336], "int8") p2 = T.decl_buffer([80], "uint8") diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 0728840ee96b..871c7e29df20 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -39,19 +39,19 @@ class WeightStreamOnlyU55: def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data) - buffer1 = T.buffer_decl([160], "uint8") - buffer3 = T.buffer_decl([144], "uint8") - buffer5 = T.buffer_decl([144], "uint8") - buffer7 = T.buffer_decl([144], "uint8") - buffer8 = T.buffer_decl([32], "uint8") + placeholder = T.Buffer([8192], "int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data) + buffer1 = T.Buffer([160], "uint8") + buffer3 = T.Buffer([144], "uint8") + buffer5 = T.Buffer([144], "uint8") + buffer7 = T.Buffer([144], "uint8") + buffer8 = T.Buffer([32], "uint8") # body p1_data = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([160], "uint8", data=p1_data) + p1 = T.Buffer([160], "uint8", data=p1_data) p2_data = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([144], "uint8", data=p2_data) - buffer9 = T.buffer_decl([144], "uint8", data=p1.data) + p2 = T.Buffer([144], "uint8", data=p2_data) + buffer9 = T.Buffer([144], "uint8", data=p1.data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 160, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 144, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p1[128], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -70,18 +70,18 @@ def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_writ # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) - buffer_encoded_1 = T.buffer_decl([192], dtype="uint8") - buffer_encoded_2_1 = T.buffer_decl([192], dtype="uint8") - buffer_encoded_4_1 = T.buffer_decl([208], dtype="uint8") - buffer_encoded_6_1 = T.buffer_decl([192], dtype="uint8") + placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) + buffer_encoded_1 = T.Buffer([192], dtype="uint8") + buffer_encoded_2_1 = T.Buffer([192], dtype="uint8") + buffer_encoded_4_1 = T.Buffer([208], dtype="uint8") + buffer_encoded_6_1 = T.Buffer([192], dtype="uint8") # body p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([208], 
"uint8", data=p1_data) + p1 = T.Buffer([208], "uint8", data=p1_data) p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([192], "uint8", data=p2_data) - p3 = T.buffer_decl([192], dtype="uint8", data=p1.data) + p2 = T.Buffer([192], "uint8", data=p2_data) + p3 = T.Buffer([192], dtype="uint8", data=p1.data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 80, p3[80], 80, 12, p3[160], 16, p3[176], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -157,14 +157,14 @@ class RereadWeightsU55: def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([384], "uint8") - placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data) + buffer1 = T.Buffer([384], "uint8") + placeholder = T.Buffer([8192], "int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data) # body p1_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([384], "uint8", data=p1_data) + p1 = T.Buffer([384], "uint8", data=p1_data) p2_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([384], "uint8", data=p2_data) + p2 = T.Buffer([384], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p1[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -179,14 +179,14 @@ def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_writ # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) - placeholder_encoded_1 = T.buffer_decl([464], "uint8") + placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) + placeholder_encoded_1 = T.Buffer([464], "uint8") # body p1_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([464], "uint8", data=p1_data) + p1 = T.Buffer([464], "uint8", data=p1_data) p2_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([464], "uint8", data=p2_data) + p2 = T.Buffer([464], "uint8", 
data=p2_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -259,15 +259,15 @@ class DirectReadOnlyU55: def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([592], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - buffer_2 = T.buffer_decl([160], "uint8") - buffer_3 = T.buffer_decl([80], "uint8") - placeholder = T.buffer_decl([8192], "int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data) + buffer = T.Buffer([592], "uint8") + buffer_1 = T.Buffer([160], "uint8") + buffer_2 = T.Buffer([160], "uint8") + buffer_3 = T.Buffer([80], "uint8") + placeholder = T.Buffer([8192], "int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data) # body ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) + ethosu_write_1 = T.Buffer([4096], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 160, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -280,15 +280,15 @@ def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_writ # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder_encoded = T.buffer_decl([608], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8") - placeholder_encoded_2 = T.buffer_decl([208], dtype="uint8") - placeholder_encoded_3 = T.buffer_decl([96], dtype="uint8") - placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) + placeholder_encoded = T.Buffer([608], dtype="uint8") + placeholder_encoded_1 = T.Buffer([160], dtype="uint8") + placeholder_encoded_2 = T.Buffer([208], dtype="uint8") + placeholder_encoded_3 = T.Buffer([96], dtype="uint8") + placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], 
dtype="int8", data=input_ethosu_write.data) # body ethosu_write_2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_2 = T.buffer_decl([4096], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([4096], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_2[0], 112, placeholder_encoded_2[112], 96, 12, placeholder_encoded_3[0], 48, placeholder_encoded_3[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -357,21 +357,21 @@ class MixedReadU55: def main(input_ifm: T.Buffer[(1,16,16,32), "int8"], input_ethosu_write: T.Buffer[(1,16,16,8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([112], "uint8") - buffer3 = T.buffer_decl([112], "uint8") - buffer5 = T.buffer_decl([112], "uint8") - buffer7 = T.buffer_decl([112], "uint8") - buffer9 = T.buffer_decl([592], "uint8") - buffer10 = T.buffer_decl([160], "uint8") - ifm = T.buffer_decl([8192], "int8", data=input_ifm.data) - ethosu_write = T.buffer_decl([2048], "int8", data=input_ethosu_write.data) + buffer1 = T.Buffer([112], "uint8") + buffer3 = T.Buffer([112], "uint8") + buffer5 = T.Buffer([112], "uint8") + buffer7 = T.Buffer([112], "uint8") + buffer9 = T.Buffer([592], "uint8") + buffer10 = T.Buffer([160], "uint8") + ifm = T.Buffer([8192], "int8", data=input_ifm.data) + ethosu_write = T.Buffer([2048], "int8", data=input_ethosu_write.data) # body p1_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([112], "uint8", data=p1_data) + p1 = T.Buffer([112], "uint8", data=p1_data) p3_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.buffer_decl([4096], "int8", data=p3_data) + p3 = T.Buffer([4096], "int8", data=p3_data) p2_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([112], "uint8", data=p2_data) + p2 = T.Buffer([112], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 112, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p2[0], dtype="handle")) @@ -391,20 +391,20 @@ def main(input_ifm: T.Buffer[(1,16,16,32), "int8"], input_ethosu_write: T.Buffer # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", 
"tir.noalias": True}) # buffer definition - ifm = T.buffer_decl([8192], dtype="int8", data=input_ifm.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) - buffer1 = T.buffer_decl([128], dtype="uint8") - buffer2 = T.buffer_decl([128], dtype="uint8") - buffer3 = T.buffer_decl([128], dtype="uint8") - buffer4 = T.buffer_decl([608], dtype="uint8") - buffer5 = T.buffer_decl([160], dtype="uint8") - buffer6 = T.buffer_decl([128], dtype="uint8") + ifm = T.Buffer([8192], dtype="int8", data=input_ifm.data) + ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) + buffer1 = T.Buffer([128], dtype="uint8") + buffer2 = T.Buffer([128], dtype="uint8") + buffer3 = T.Buffer([128], dtype="uint8") + buffer4 = T.Buffer([608], dtype="uint8") + buffer5 = T.Buffer([160], dtype="uint8") + buffer6 = T.Buffer([128], dtype="uint8") p1_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([128], "uint8", data=p1_data) + p1 = T.Buffer([128], "uint8", data=p1_data) p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([4096], "int8", data=p2_data) + p2 = T.Buffer([4096], "int8", data=p2_data) p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.buffer_decl([128], "uint8", data=p3_data) + p3 = T.Buffer([128], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, ifm[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_hoist_allocates.py b/tests/python/contrib/test_ethosu/test_hoist_allocates.py index 1508aa441c3b..ea1cae50e6eb 100644 --- a/tests/python/contrib/test_ethosu/test_hoist_allocates.py +++ b/tests/python/contrib/test_ethosu/test_hoist_allocates.py @@ -109,27 +109,27 @@ class Module: def main(input_placeholder: T.Buffer[(1, 27, 42, 3), "int8"], input_placeholder_encoded: T.Buffer[(3, 3, 2, 3), "uint8"], input_placeholder_encoded_1: T.Buffer[(3, 10), "uint8"], input_placeholder_encoded_2: T.Buffer[(3, 3, 2, 3), "uint8"], input_placeholder_encoded_3: T.Buffer[(3, 10), "uint8"], input_ethosu_write: T.Buffer[(1, 27, 42, 3), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([3402], dtype="int8", data=input_placeholder.data) - placeholder_encoded = T.buffer_decl([128], dtype="int8", data=input_placeholder_encoded.data) - placeholder_encoded_1 = T.buffer_decl([32], dtype="uint8", data=input_placeholder_encoded_1.data) - placeholder_encoded_2 = T.buffer_decl([128], dtype="int8", data=input_placeholder_encoded_2.data) - placeholder_encoded_3 = T.buffer_decl([32], dtype="uint8", data=input_placeholder_encoded_3.data) - ethosu_write = T.buffer_decl([3402], dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer([3402], dtype="int8", data=input_placeholder.data) + placeholder_encoded = T.Buffer([128], dtype="int8", data=input_placeholder_encoded.data) + placeholder_encoded_1 = T.Buffer([32], 
dtype="uint8", data=input_placeholder_encoded_1.data) + placeholder_encoded_2 = T.Buffer([128], dtype="int8", data=input_placeholder_encoded_2.data) + placeholder_encoded_3 = T.Buffer([32], dtype="uint8", data=input_placeholder_encoded_3.data) + ethosu_write = T.Buffer([3402], dtype="int8", data=input_ethosu_write.data) # body placeholder_global_data = T.allocate([128], "uint8", "global") - placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data) + placeholder_global = T.Buffer([128], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 128, placeholder_global[0], dtype="handle")) placeholder_d_global_data = T.allocate([32], "uint8", "global") - placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) + placeholder_d_global = T.Buffer([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) ethosu_write_2_data = T.allocate([18144], "int8", "global") - ethosu_write_2 = T.buffer_decl([18144], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([18144], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, placeholder[0], 0, 0, 0, T.float32(0.0039215646684169769), -128, "NHWC", 126, 3, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, 2, 3, 1, 1, 1, 2, placeholder_global[0], 128, 0, placeholder_d_global[0], 32, 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) placeholder_d_global_1_data = T.allocate([128], "uint8", "global") - placeholder_d_global_1 = T.buffer_decl([128], "uint8", data=placeholder_d_global_1_data) + placeholder_d_global_1 = T.Buffer([128], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_global_1[0], dtype="handle")) placeholder_d_global_2_data = T.allocate([32], "uint8", "global") - placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) + placeholder_d_global_2 = T.Buffer([32], "uint8", data=placeholder_d_global_2_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write[0], 0, 0, 0, T.float32(0.23604340851306915), -128, "NHWC", 126, 3, 1, 2, 3, 1, 1, 1, 2, placeholder_d_global_1[0], 128, 0, placeholder_d_global_2[0], 32, 2, 0, 2, 1, "CLIP", -128, 127, "TFL", "NONE", dtype="handle")) # fmt: on @@ -153,20 +153,20 @@ class Module: def main(input_placeholder: T.Buffer[(1, 2, 3, 4), "int8"], T_concat: T.Buffer[(24,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([24], dtype="int8", data=input_placeholder.data) + placeholder = T.Buffer([24], dtype="int8", data=input_placeholder.data) # body ethosu_write_data = T.allocate([12], "int8", "global") - ethosu_write = T.buffer_decl([12], "int8", data=ethosu_write_data) + ethosu_write = T.Buffer([12], "int8", data=ethosu_write_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, 
T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) ethosu_write_1_data = T.allocate([12], "int8", "global") - ethosu_write_1 = T.buffer_decl([12], "int8", data=ethosu_write_1_data) + ethosu_write_1 = T.Buffer([12], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) ethosu_write_2_data = T.allocate([12], "int8", "global") - ethosu_write_2 = T.buffer_decl([12], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([12], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) ethosu_write_3_data = T.allocate([12], "int8", "global") - ethosu_write_3 = T.buffer_decl([12], "int8", data=ethosu_write_3_data) + ethosu_write_3 = T.Buffer([12], "int8", data=ethosu_write_3_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) # fmt: on @@ -190,35 +190,35 @@ class Module: def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) # body with T.allocate([128], "uint8", "global") as placeholder_global_data: - placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data) + placeholder_global = T.Buffer([128], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle")) placeholder_d_global_data = 
T.allocate([32], "uint8", "global") - placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) + placeholder_d_global = T.Buffer([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) with T.allocate([112], "uint8", "global") as placeholder_global_1_data: - placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data) + placeholder_global_1 = T.Buffer([112], "uint8", data=placeholder_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 112, placeholder_global_1[0], dtype="handle")) placeholder_d_global_1_data = T.allocate([32], "uint8", "global") - placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data) + placeholder_d_global_1 = T.Buffer([32], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 112, 12, placeholder_d_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) with T.allocate([112], "uint8", "global") as placeholder_global_2_data: - placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data) + placeholder_global_2 = T.Buffer([112], "uint8", data=placeholder_global_2_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle")) placeholder_d_global_2_data = T.allocate([32], "uint8", "global") - placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) + placeholder_d_global_2 = T.Buffer([32], "uint8", data=placeholder_d_global_2_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) placeholder_global_3_data = T.allocate([112], "uint8", "global") - placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data) + placeholder_global_3 = T.Buffer([112], "uint8", data=placeholder_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle")) placeholder_d_global_3_data = T.allocate([32], "uint8", "global") - placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data) + placeholder_d_global_3 = T.Buffer([32], "uint8", data=placeholder_d_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) 
T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 112, 12, placeholder_d_global_3[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -240,23 +240,23 @@ class Module: def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write: T.Buffer[(1, 16, 16, 8), "int8"], buffer_encoded: T.Buffer[(128,), "uint8"], buffer_encoded_1: T.Buffer[(32,), "uint8"], buffer_encoded_2: T.Buffer[(112,), "uint8"], buffer_encoded_3: T.Buffer[(32,), "uint8"], buffer_encoded_4: T.Buffer[(112,), "uint8"], buffer_encoded_5: T.Buffer[(32,), "uint8"], buffer_encoded_6: T.Buffer[(112,), "uint8"], buffer_encoded_7: T.Buffer[(32,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([8192], dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer([8192], dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer([2048], dtype="int8", data=input_ethosu_write.data) # body placeholder_global_data = T.allocate([128], "uint8", "global") - placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data) + placeholder_global = T.Buffer([128], "uint8", data=placeholder_global_data) placeholder_global_1_data = T.allocate([112], "uint8", "global") - placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data) + placeholder_global_1 = T.Buffer([112], "uint8", data=placeholder_global_1_data) placeholder_global_2_data = T.allocate([112], "uint8", "global") - placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data) + placeholder_global_2 = T.Buffer([112], "uint8", data=placeholder_global_2_data) placeholder_d_global_data = T.allocate([32], "uint8", "global") - placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) + placeholder_d_global = T.Buffer([32], "uint8", data=placeholder_d_global_data) placeholder_d_global_1_data = T.allocate([32], "uint8", "global") - placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data) + placeholder_d_global_1 = T.Buffer([32], "uint8", data=placeholder_d_global_1_data) placeholder_d_global_2_data = T.allocate([32], "uint8", "global") - placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) + placeholder_d_global_2 = T.Buffer([32], "uint8", data=placeholder_d_global_2_data) placeholder_global_3_data = T.allocate([112], "uint8", "global") - placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data) + placeholder_global_3 = T.Buffer([112], "uint8", data=placeholder_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 
0, 0, 0, dtype="handle")) @@ -266,7 +266,7 @@ def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_writ T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) placeholder_d_global_3_data = T.allocate([32], "uint8", "global") - placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data) + placeholder_d_global_3 = T.Buffer([32], "uint8", data=placeholder_d_global_3_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py index ed1927b849d6..7465e220787c 100644 --- a/tests/python/contrib/test_ethosu/test_merge_constants.py +++ b/tests/python/contrib/test_ethosu/test_merge_constants.py @@ -41,13 +41,13 @@ class InputModule: def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p1_data = T.allocate([128], "uint8", "global") - p1 = T.buffer_decl([128], "uint8", data=p1_data) + p1 = T.Buffer([128], "uint8", data=p1_data) p4_data = T.allocate([32], "uint8", "global") - p4 = T.buffer_decl([32], "uint8", data=p4_data) + p4 = T.Buffer([32], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -59,11 +59,11 @@ class ReferenceModule: def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p4_data = T.allocate([160], "uint8", "global") - p4 = T.buffer_decl([160], "uint8", data=p4_data) + p4 = T.Buffer([160], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 
0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -86,25 +86,25 @@ class InputModule: def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], buffer4: T.Buffer[(112,), "uint8"], buffer5: T.Buffer[(32,), "uint8"], buffer6: T.Buffer[(112,), "uint8"], buffer7: T.Buffer[(32,), "uint8"], buffer8: T.Buffer[(112,), "uint8"], buffer9: T.Buffer[(32,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p1_data = T.allocate([128], "uint8", "global") - p1 = T.buffer_decl([128], "uint8", data=p1_data) + p1 = T.Buffer([128], "uint8", data=p1_data) p2_data = T.allocate([112], "uint8", "global") - p2 = T.buffer_decl([112], "uint8", data=p2_data) + p2 = T.Buffer([112], "uint8", data=p2_data) p3_data = T.allocate([112], "uint8", "global") - p3 = T.buffer_decl([112], "uint8", data=p3_data) + p3 = T.Buffer([112], "uint8", data=p3_data) p4_data = T.allocate([32], "uint8", "global") - p4 = T.buffer_decl([32], "uint8", data=p4_data) + p4 = T.Buffer([32], "uint8", data=p4_data) p5_data = T.allocate([32], "uint8", "global") - p5 = T.buffer_decl([32], "uint8", data=p5_data) + p5 = T.Buffer([32], "uint8", data=p5_data) p6_data = T.allocate([32], "uint8", "global") - p6 = T.buffer_decl([32], "uint8", data=p6_data) + p6 = T.Buffer([32], "uint8", data=p6_data) p7_data = T.allocate([112], "uint8", "global") - p7 = T.buffer_decl([112], "uint8", data=p7_data) + p7 = T.Buffer([112], "uint8", data=p7_data) p8_data = T.allocate([3], "uint8", "global") - p8 = T.buffer_decl([3], "uint8", data=p8_data) + p8 = T.Buffer([3], "uint8", data=p8_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) @@ -125,17 +125,17 @@ class ReferenceModule: def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], buffer6: T.Buffer[(144,), "uint8"], buffer8: T.Buffer[(144,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p4_data = T.allocate([160], "uint8", "global") - p4 = T.buffer_decl([160], "uint8", data=p4_data) + p4 = T.Buffer([160], "uint8", data=p4_data) p7_data = T.allocate([144], "uint8", "global") - p7 = T.buffer_decl([144], "uint8", data=p7_data) + p7 = T.Buffer([144], "uint8", data=p7_data) p10_data = T.allocate([144], "uint8", "global") - p10 = T.buffer_decl([144], "uint8", data=p10_data) + p10 = T.Buffer([144], "uint8", data=p10_data) p11_data = T.allocate([144], "uint8", "global") - p11 = T.buffer_decl([144], "uint8", data=p11_data) + p11 = T.Buffer([144], "uint8", data=p11_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 144, p7[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, 
p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -175,14 +175,14 @@ class InputModule: @T.prim_func def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer0 = T.buffer_decl([390336], "int8") - buffer1 = T.buffer_decl([97156], "int8") - buffer6 = T.buffer_decl([390336], "int8") + buffer0 = T.Buffer([390336], "int8") + buffer1 = T.Buffer([97156], "int8") + buffer6 = T.Buffer([390336], "int8") # body p2_data = T.allocate([80], "uint8", "global") - p2 = T.buffer_decl([80], "uint8", data=p2_data) + p2 = T.Buffer([80], "uint8", data=p2_data) p3_data = T.allocate([64], "uint8", "global") - p3 = T.buffer_decl([64], "uint8", data=p3_data) + p3 = T.Buffer([64], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) @@ -194,12 +194,12 @@ class ReferenceModule: @T.prim_func def main(buffer2: T.Buffer[(144,), "uint8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer0 = T.buffer_decl([390336], "int8") - buffer1 = T.buffer_decl([97156], "int8") - buffer6 = T.buffer_decl([390336], "int8") + buffer0 = T.Buffer([390336], "int8") + buffer1 = T.Buffer([97156], "int8") + buffer6 = T.Buffer([390336], "int8") # body p3_data = T.allocate([144], "uint8", "global") - p3 = T.buffer_decl([144], "uint8", data=p3_data) + p3 = T.Buffer([144], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 144, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, buffer6[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p3[0], 80, 0, p3[80], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -234,17 +234,17 @@ def main(buffer1: T.Buffer[(64,), "uint8"], T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # body p1_data = T.allocate([48], "uint8", "global") - p1 = T.buffer_decl([48], "uint8", data=p1_data) + p1 = T.Buffer([48], "uint8", data=p1_data) p2_data = T.allocate([48], "uint8", "global") - p2 = T.buffer_decl([48], "uint8", data=p2_data) + p2 = T.Buffer([48], "uint8", data=p2_data) p3_data = T.allocate([256], "int8", "local") - p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local") + p3 = T.Buffer([256], "int8", data=p3_data, scope="local") p5_data = T.allocate([16], "uint8", "global") - p5 = T.buffer_decl([16], "uint8", data=p5_data) + p5 = T.Buffer([16], "uint8", data=p5_data) p6_data = T.allocate([48], "uint8", "global") - p6 = T.buffer_decl([48], 
"uint8", data=p6_data) + p6 = T.Buffer([48], "uint8", data=p6_data) p7_data = T.allocate([256], "int8", "local") - p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local") + p7 = T.Buffer([256], "int8", data=p7_data, scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local @@ -269,13 +269,13 @@ def main(buffer1: T.Buffer[(64,), "uint8"], T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # body p1_data = T.allocate([96], "uint8", "global") - p1 = T.buffer_decl([96], "uint8", data=p1_data) + p1 = T.Buffer([96], "uint8", data=p1_data) p2_data = T.allocate([64], "uint8", "global") - p2 = T.buffer_decl([64], "uint8", data=p2_data) + p2 = T.Buffer([64], "uint8", data=p2_data) p3_data = T.allocate([256], "int8", "local") - p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local") + p3 = T.Buffer([256], "int8", data=p3_data, scope="local") p7_data = T.allocate([256], "int8", "local") - p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local") + p7 = T.Buffer([256], "int8", data=p7_data, scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 64, p2[0], dtype="handle")) @@ -312,11 +312,11 @@ class InputModule: def main() -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([20], "int8") - ethosu_write = T.buffer_decl([16], "int8") + placeholder = T.Buffer([20], "int8") + ethosu_write = T.Buffer([16], "int8") # body ethosu_write_4_data = T.allocate([16], "int8", "global") - ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data) + ethosu_write_4 = T.Buffer([16], "int8", data=ethosu_write_4_data) T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -326,11 +326,11 @@ class ReferenceModule: def main() -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([20], "int8") - ethosu_write = T.buffer_decl([16], "int8") + placeholder = T.Buffer([20], "int8") + ethosu_write = T.Buffer([16], "int8") # body ethosu_write_4_data = T.allocate([16], "int8", "global") - ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data) + ethosu_write_4 = T.Buffer([16], "int8", data=ethosu_write_4_data) T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, 
T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -351,13 +351,13 @@ class InputModule: def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p1_data = T.allocate([128], "uint8", "global") - p1 = T.buffer_decl([128], "uint8", data=p1_data) + p1 = T.Buffer([128], "uint8", data=p1_data) p4_data = T.allocate([32], "uint8", "global") - p4 = T.buffer_decl([32], "uint8", data=p4_data) + p4 = T.Buffer([32], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -372,11 +372,11 @@ class ReferenceModule: def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p5_data = T.allocate([160], "uint8", "global") - p5 = T.buffer_decl([160], "uint8", data=p5_data) + p5 = T.Buffer([160], "uint8", data=p5_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5[0], 128, 12, p5[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) @@ -403,13 +403,13 @@ def main(input_placeholder: T.Buffer[(1, 16, 16, 32), "int8"], buffer1: T.Buffer # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([368], "uint8", "global") - p1 = T.buffer_decl([368], "uint8", data=p1_data) + p1 = T.Buffer([368], "uint8", data=p1_data) p2_data = T.allocate([96], "uint8", "global") - p2 = T.buffer_decl([96], "uint8", data=p2_data) + p2 = 
T.Buffer([96], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -423,11 +423,11 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(4 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([464], "uint8", "global") - p1 = T.buffer_decl([464], "uint8", data=p1_data) + p1 = T.Buffer([464], "uint8", data=p1_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -453,17 +453,17 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(3 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([368], "uint8", "global") - p1 = T.buffer_decl([368], "uint8", data=p1_data) + p1 = T.Buffer([368], "uint8", data=p1_data) p2_data = T.allocate([96], "uint8", "global") - p2 = T.buffer_decl([96], "uint8", data=p2_data) + p2 = T.Buffer([96], "uint8", data=p2_data) p3_data = T.allocate([368], "uint8", "global") - p3 = T.buffer_decl([368], "uint8", data=p3_data) + p3 = T.Buffer([368], "uint8", data=p3_data) p4_data = T.allocate([96], "uint8", "global") - p4 = T.buffer_decl([96], "uint8", data=p4_data) + p4 = T.Buffer([96], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -480,13 +480,13 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(4 # function attr dict T.func_attr({"from_legacy_te_schedule": 
True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([464], "uint8", "global") - p1 = T.buffer_decl([464], "uint8", data=p1_data) + p1 = T.Buffer([464], "uint8", data=p1_data) p2_data = T.allocate([464], "uint8", "global") - p2 = T.buffer_decl([464], "uint8", data=p2_data) + p2 = T.Buffer([464], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle")) @@ -519,17 +519,17 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(3 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([368], "uint8", "global") - p1 = T.buffer_decl([368], "uint8", data=p1_data) + p1 = T.Buffer([368], "uint8", data=p1_data) p2_data = T.allocate([96], "uint8", "global") - p2 = T.buffer_decl([96], "uint8", data=p2_data) + p2 = T.Buffer([96], "uint8", data=p2_data) p3_data = T.allocate([368], "uint8", "global") - p3 = T.buffer_decl([368], "uint8", data=p3_data) + p3 = T.Buffer([368], "uint8", data=p3_data) p4_data = T.allocate([96], "uint8", "global") - p4 = T.buffer_decl([96], "uint8", data=p4_data) + p4 = T.Buffer([96], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -546,13 +546,13 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(4 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype="int8", data=input_placeholder.data) - ethosu_write = T.buffer_decl(2048, dtype="int8", data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype="int8", data=input_placeholder.data) + ethosu_write = T.Buffer(2048, dtype="int8", data=input_ethosu_write.data) # body p1_data = T.allocate([464], "uint8", "global") - p1 = T.buffer_decl([464], "uint8", data=p1_data) + p1 = T.Buffer([464], "uint8", 
data=p1_data) p2_data = T.allocate([464], "uint8", "global") - p2 = T.buffer_decl([464], "uint8", data=p2_data) + p2 = T.Buffer([464], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle")) @@ -585,17 +585,17 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(3 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype='int8', data=input_placeholder.data) - ethosu_write = T.buffer_decl(4096, dtype='int8', data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype='int8', data=input_placeholder.data) + ethosu_write = T.Buffer(4096, dtype='int8', data=input_ethosu_write.data) # body p1_data = T.allocate([368], "uint8", "global") - p1 = T.buffer_decl([368], "uint8", data=p1_data) + p1 = T.Buffer([368], "uint8", data=p1_data) p2_data = T.allocate([368], "uint8", "global") - p2 = T.buffer_decl([368], "uint8", data=p2_data) + p2 = T.Buffer([368], "uint8", data=p2_data) p3_data = T.allocate([96], "uint8", "global") - p3 = T.buffer_decl([96], "uint8", data=p3_data) + p3 = T.Buffer([96], "uint8", data=p3_data) p4_data = T.allocate([96], "uint8", "global") - p4 = T.buffer_decl([96], "uint8", data=p4_data) + p4 = T.Buffer([96], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 96, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p3[0], 48, p3[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -612,13 +612,13 @@ def main(input_placeholder: T.Buffer[(1,16,16,32), "int8"], buffer1: T.Buffer[(4 # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder = T.buffer_decl(8192, dtype='int8', data=input_placeholder.data) - ethosu_write = T.buffer_decl(4096, dtype='int8', data=input_ethosu_write.data) + placeholder = T.Buffer(8192, dtype='int8', data=input_placeholder.data) + ethosu_write = T.Buffer(4096, dtype='int8', data=input_ethosu_write.data) # body p1_data = T.allocate([464], "uint8", "global") - p1 = T.buffer_decl([464], "uint8", data=p1_data) + p1 = T.Buffer([464], "uint8", data=p1_data) p2_data = T.allocate([464], "uint8", "global") - p2 = T.buffer_decl([464], "uint8", data=p2_data) + p2 = T.Buffer([464], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 
48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 464, p2[0], dtype="handle")) @@ -662,25 +662,25 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], v4a = T.var("int32") v4b = T.var("int32") v4c = T.var("int32") - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p1_data = T.allocate([128], "uint8", "global") - p1 = T.buffer_decl([128], "uint8", data=p1_data) + p1 = T.Buffer([128], "uint8", data=p1_data) p2_data = T.allocate([112], "uint8", "global") - p2 = T.buffer_decl([112], "uint8", data=p2_data) + p2 = T.Buffer([112], "uint8", data=p2_data) p3_data = T.allocate([112], "uint8", "global") - p3 = T.buffer_decl([112], "uint8", data=p3_data) + p3 = T.Buffer([112], "uint8", data=p3_data) p4_data = T.allocate([32], "uint8", "global") - p4 = T.buffer_decl([32], "uint8", data=p4_data) + p4 = T.Buffer([32], "uint8", data=p4_data) p5_data = T.allocate([32], "uint8", "global") - p5 = T.buffer_decl([32], "uint8", data=p5_data) + p5 = T.Buffer([32], "uint8", data=p5_data) p6_data = T.allocate([32], "uint8", "global") - p6 = T.buffer_decl([32], "uint8", data=p6_data) + p6 = T.Buffer([32], "uint8", data=p6_data) p7_data = T.allocate([112], "uint8", "global") - p7 = T.buffer_decl([112], "uint8", data=p7_data) + p7 = T.Buffer([112], "uint8", data=p7_data) p8_data = T.allocate([3], "uint8", "global") - p8 = T.buffer_decl([3], "uint8", data=p8_data) + p8 = T.Buffer([3], "uint8", data=p8_data) with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 100): T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) with T.attr(T.iter_var(v1b, None, "DataPar", ""), "pragma_compute_cycles_hint", 101): @@ -721,17 +721,17 @@ def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], v3c = T.var("int32") v4a = T.var("int32") v4c = T.var("int32") - buffer1 = T.buffer_decl([8192], "int8") - buffer10 = T.buffer_decl([2048], "int8") + buffer1 = T.Buffer([8192], "int8") + buffer10 = T.Buffer([2048], "int8") # body p4_data = T.allocate([160], "uint8", "global") - p4 = T.buffer_decl([160], "uint8", data=p4_data) + p4 = T.Buffer([160], "uint8", data=p4_data) p7_data = T.allocate([144], "uint8", "global") - p7 = T.buffer_decl([144], "uint8", data=p7_data) + p7 = T.Buffer([144], "uint8", data=p7_data) p10_data = T.allocate([144], "uint8", "global") - p10 = T.buffer_decl([144], "uint8", data=p10_data) + p10 = T.Buffer([144], "uint8", data=p10_data) p11_data = T.allocate([144], "uint8", "global") - p11 = T.buffer_decl([144], "uint8", data=p11_data) + p11 = T.Buffer([144], "uint8", data=p11_data) with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 201): T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) with T.attr(T.iter_var(v2a, None, "DataPar", ""), "pragma_compute_cycles_hint", 205): diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py index b8ce7f0d60c9..64777aa0fb71 100644 --- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py +++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py @@ -35,21 +35,21 @@ def main(input_placeholder: T.Buffer[(1,8,12,16), "int8"], input_placeholder_1: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": 
"main", "tir.noalias": True}) - placeholder = T.buffer_decl(1536, dtype="int8", data=input_placeholder.data) - placeholder_1 = T.buffer_decl(1280, dtype="int8", data=input_placeholder_1.data) - T_concat = T.buffer_decl(4096, dtype="int8", data=input_T_concat.data) + placeholder = T.Buffer(1536, dtype="int8", data=input_placeholder.data) + placeholder_1 = T.Buffer(1280, dtype="int8", data=input_placeholder_1.data) + T_concat = T.Buffer(4096, dtype="int8", data=input_T_concat.data) - buffer = T.buffer_decl([2992], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - buffer_2 = T.buffer_decl([2992], "uint8") - buffer_3 = T.buffer_decl([160], "uint8") - buffer_4 = T.buffer_decl([2992], "uint8") - buffer_5 = T.buffer_decl([160], "uint8") - buffer_6 = T.buffer_decl([2992], "uint8") - buffer_7 = T.buffer_decl([160], "uint8") + buffer = T.Buffer([2992], "uint8") + buffer_1 = T.Buffer([160], "uint8") + buffer_2 = T.Buffer([2992], "uint8") + buffer_3 = T.Buffer([160], "uint8") + buffer_4 = T.Buffer([2992], "uint8") + buffer_5 = T.Buffer([160], "uint8") + buffer_6 = T.Buffer([2992], "uint8") + buffer_7 = T.Buffer([160], "uint8") # body T_concat_1_data = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True}) - T_concat_1 = T.buffer_decl([2816], "int8", data=T_concat_1_data) + T_concat_1 = T.Buffer([2816], "int8", data=T_concat_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat[352], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_3[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 12, 16, 8, 0, 12, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 192, 16, 1, "int8", 8, 12, 16, 8, 0, 12, T_concat_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer_4[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_5[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py index bdc0447bc718..ffa6d6effd79 100644 --- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -370,15 +370,15 @@ class Conv2dDoubleCascade1: def main(input_placeholder_5: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([304], "uint8") - buffer_1 = T.buffer_decl([80], "uint8") - buffer_2 = T.buffer_decl([320], "uint8") - buffer_3 = T.buffer_decl([160], "uint8") - placeholder_5 = T.buffer_decl([192], 'int8', data=input_placeholder_5.data) - ethosu_write_1 = T.buffer_decl([512], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([304], "uint8") + buffer_1 = 
T.Buffer([80], "uint8") + buffer_2 = T.Buffer([320], "uint8") + buffer_3 = T.Buffer([160], "uint8") + placeholder_5 = T.Buffer([192], 'int8', data=input_placeholder_5.data) + ethosu_write_1 = T.Buffer([512], 'int8', data=input_ethosu_write_1.data) # body ethosu_write_2_data = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True}) - ethosu_write_2 = T.buffer_decl([1024], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([1024], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, buffer[0], 304, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[12], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -392,15 +392,15 @@ class Conv2dDoubleCascade2: def main(input_placeholder_5: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([80], "uint8") - buffer_1 = T.buffer_decl([320], "uint8") - buffer_2 = T.buffer_decl([1312], "uint8") - buffer_3 = T.buffer_decl([2608], "uint8") - placeholder_5 = T.buffer_decl([192], 'int8', data=input_placeholder_5.data) - ethosu_write_1 = T.buffer_decl([512], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([80], "uint8") + buffer_1 = T.Buffer([320], "uint8") + buffer_2 = T.Buffer([1312], "uint8") + buffer_3 = T.Buffer([2608], "uint8") + placeholder_5 = T.Buffer([192], 'int8', data=input_placeholder_5.data) + ethosu_write_1 = T.Buffer([512], 'int8', data=input_ethosu_write_1.data) # body ethosu_write_2_data = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True}) - ethosu_write_2 = T.buffer_decl([1536], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([1536], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 
2608, T.int8(-1), T.int8(-1), 12, buffer[0], 80, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[48], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -414,16 +414,16 @@ class Conv2dDoubleCascade3: def main(input_placeholder_5: T.Buffer[(1, 16, 16, 3), "int8"], input_ethosu_write_1: T.Buffer[(1, 20, 4, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([1744], "uint8") - buffer_1 = T.buffer_decl([80], "uint8") - buffer_2 = T.buffer_decl([320], "uint8") - buffer_3 = T.buffer_decl([880], "uint8") - placeholder_5 = T.buffer_decl([768], 'int8', data=input_placeholder_5.data) - ethosu_write_1 = T.buffer_decl([640], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([1744], "uint8") + buffer_1 = T.Buffer([80], "uint8") + buffer_2 = T.Buffer([320], "uint8") + buffer_3 = T.Buffer([880], "uint8") + placeholder_5 = T.Buffer([768], 'int8', data=input_placeholder_5.data) + ethosu_write_1 = T.Buffer([640], 'int8', data=input_ethosu_write_1.data) # body ethosu_write_2_data = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True}) - ethosu_write_2 = T.buffer_decl([2560], "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer([2560], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 3, 8, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, buffer[0], 1744, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 12, 16, 3, 12, 0, 16, placeholder_5[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 10, 8, 32, 10, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -439,15 +439,15 @@ class Conv2dDoubleCascade4: def main(input_placeholder_5: T.Buffer[(1, 8, 1, 8, 16), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 2, 8, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([1456], "uint8") - buffer_1 = T.buffer_decl([352], "uint8") - buffer_2 = T.buffer_decl([272], "uint8") - buffer_3 = T.buffer_decl([11040], "uint8") - placeholder_5 = T.buffer_decl([1024], 'int8', data=input_placeholder_5.data) - ethosu_write_1 = T.buffer_decl([2048], 'int8', 
data=input_ethosu_write_1.data) + buffer = T.Buffer([1456], "uint8") + buffer_1 = T.Buffer([352], "uint8") + buffer_2 = T.Buffer([272], "uint8") + buffer_3 = T.Buffer([11040], "uint8") + placeholder_5 = T.Buffer([1024], 'int8', data=input_placeholder_5.data) + ethosu_write_1 = T.Buffer([2048], 'int8', data=input_ethosu_write_1.data) # body ethosu_write_2_data = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True}) - ethosu_write_2 = T.buffer_decl((2304,), "int8", data=ethosu_write_2_data) + ethosu_write_2 = T.Buffer((2304,), "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[256], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -461,15 +461,15 @@ class Conv2dDoubleCascade5: def main(input_placeholder: T.Buffer[(1, 8, 8, 3), "int8"], input_ethosu_write: T.Buffer[(1, 32, 32, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([320], "uint8") - buffer_2 = T.buffer_decl([304], "uint8") - buffer_3 = T.buffer_decl([80], "uint8") - placeholder = T.buffer_decl([192], 'int8', data=input_placeholder.data) - ethosu_write = T.buffer_decl([8192], 'int8', data=input_ethosu_write.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([320], "uint8") + buffer_2 = T.Buffer([304], "uint8") + buffer_3 = T.Buffer([80], "uint8") + placeholder = T.Buffer([192], 'int8', data=input_placeholder.data) + ethosu_write = T.Buffer([8192], 'int8', data=input_ethosu_write.data) # body ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) + ethosu_write_1 = T.Buffer([4096], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 32, 8, 16, 
0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 304, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) @@ -483,15 +483,15 @@ class Conv2dDoubleCascade6: def main(input_placeholder: T.Buffer[(1, 8, 1, 8, 16), "int8"], input_ethosu_write: T.Buffer[(1, 32, 2, 32, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([1456], "uint8") - buffer_1 = T.buffer_decl([352], "uint8") - buffer_2 = T.buffer_decl([11040], "uint8") - buffer_3 = T.buffer_decl([272], "uint8") - placeholder = T.buffer_decl([1024], 'int8', data=input_placeholder.data) - ethosu_write = T.buffer_decl([32768], 'int8', data=input_ethosu_write.data) + buffer = T.Buffer([1456], "uint8") + buffer_1 = T.Buffer([352], "uint8") + buffer_2 = T.Buffer([11040], "uint8") + buffer_3 = T.Buffer([272], "uint8") + placeholder = T.Buffer([1024], 'int8', data=input_placeholder.data) + ethosu_write = T.Buffer([32768], 'int8', data=input_ethosu_write.data) # body ethosu_write_1_data = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_1 = T.buffer_decl([12288], "int8", data=ethosu_write_1_data) + ethosu_write_1 = T.Buffer([12288], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 768, 16, 256, "int8", 32, 32, 26, 32, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 1024, 16, 512, 3, 3, 1, 1, 1, 1, buffer_2[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_3[0], 272, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -647,10 +647,10 @@ class Conv2dInlineCopy1: def main(input_placeholder_3: T.Buffer[(1, 10, 12, 8), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 8, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([848], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - placeholder_3 = T.buffer_decl([960], 'int8', data=input_placeholder_3.data) - ethosu_write_1 = T.buffer_decl([1024], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([848], "uint8") + buffer_1 = T.Buffer([160], "uint8") + placeholder_3 = T.Buffer([960], 'int8', data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([1024], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder_3[120], 0, 0, 0, 
T.float32(0.5), 10, "NHWC", 96, 8, 1, "int8", 8, 8, 16, 8, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 848, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -662,10 +662,10 @@ class Conv2dInlineCopy2: def main(input_placeholder_3: T.Buffer[(1, 7, 9, 5), "int8"], input_ethosu_write_1: T.Buffer[(1, 3, 5, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([656], "uint8") - placeholder_3 = T.buffer_decl([315], 'int8', data=input_placeholder_3.data) - ethosu_write_1 = T.buffer_decl([240], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([656], "uint8") + placeholder_3 = T.Buffer([315], 'int8', data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([240], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 3, 5, 3, 3, 0, 5, placeholder_3[146], 0, 0, 0, T.float32(0.5), 10, "NHWC", 45, 5, 1, "int8", 3, 5, 16, 3, 0, 5, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 80, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 656, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -706,10 +706,10 @@ class Conv2dInlineReshape1: def main(input_placeholder_3: T.Buffer[(4, 6, 8, 1), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([848], "uint8") - placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data) - ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([848], "uint8") + placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -722,10 +722,10 @@ class Conv2dInlineReshape2: def main(input_placeholder_3: T.Buffer[(1, 24, 8), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([848], "uint8") - placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data) - ethosu_write_1 = 
T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([848], "uint8") + placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -738,10 +738,10 @@ class Conv2dInlineReshape3: def main(input_placeholder_3: T.Buffer[(192, 1), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([848], "uint8") - placeholder_3 = T.buffer_decl([192], 'int8', data=input_placeholder_3.data) - ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([848], "uint8") + placeholder_3 = T.Buffer([192], 'int8', data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -754,9 +754,9 @@ class Conv2dInlineReshape4: def main(placeholder_3: T.Buffer[(192,), "int8"], input_ethosu_write_1: T.Buffer[(1, 8, 6, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([160], "uint8") - buffer_1 = T.buffer_decl([848], "uint8") - ethosu_write_1 = T.buffer_decl([768], 'int8', data=input_ethosu_write_1.data) + buffer = T.Buffer([160], "uint8") + buffer_1 = T.Buffer([848], "uint8") + ethosu_write_1 = T.Buffer([768], 'int8', data=input_ethosu_write_1.data) # body T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, 
T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, placeholder_3[72], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, ethosu_write_1[384], 0, 0, 0, T.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 848, T.int8(-1), T.int8(-1), 12, buffer[0], 160, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index e23954f4cb67..29e1f9814c81 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -37,12 +37,12 @@ class ReferenceModule: def main(input_placeholder_3: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write_1: T.Buffer[(1, 16, 16, 8), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer_1 = T.buffer_decl([384], "uint8") - placeholder_3 = T.buffer_decl([8192], dtype="int8", data=input_placeholder_3.data) - ethosu_write_1 = T.buffer_decl([2048], dtype="int8", data=input_ethosu_write_1.data) + buffer_1 = T.Buffer([384], "uint8") + placeholder_3 = T.Buffer([8192], dtype="int8", data=input_placeholder_3.data) + ethosu_write_1 = T.Buffer([2048], dtype="int8", data=input_ethosu_write_1.data) # body placeholder_global_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_global = T.buffer_decl([384], "uint8", data=placeholder_global_data) + placeholder_global = T.Buffer([384], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 384, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_global[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -81,15 +81,15 @@ class WeightStream: def main(input_placeholder_5: T.Buffer[(1, 16, 16, 32), "int8"], input_ethosu_write_1: T.Buffer[(1, 16, 16, 16), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([528], "uint8") - buffer_2 = T.buffer_decl([336], "uint8") - placeholder_5 = T.buffer_decl([8192], dtype="int8", data=input_placeholder_5.data) - ethosu_write_1 = T.buffer_decl([4096], dtype="int8", data=input_ethosu_write_1.data) + buffer = T.Buffer([528], "uint8") + buffer_2 = T.Buffer([336], "uint8") + placeholder_5 = T.Buffer([8192], dtype="int8", data=input_placeholder_5.data) + ethosu_write_1 = T.Buffer([4096], dtype="int8", data=input_ethosu_write_1.data) # body placeholder_d_global_data = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global = T.buffer_decl([528], "uint8", data=placeholder_d_global_data) + placeholder_d_global = T.Buffer([528], "uint8", data=placeholder_d_global_data) placeholder_d_global_1_data = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_1 = T.buffer_decl([336], "uint8", 
data=placeholder_d_global_1_data) + placeholder_d_global_1 = T.Buffer([336], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 528, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 336, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_d_global[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global[416], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 1e9b43b47ada..c6f6bc2c6c61 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -182,18 +182,18 @@ class DiamondGraphTir: @T.prim_func def main(input_placeholder: T.Buffer[(1, 56, 56, 96), "int8"], input_ethosu_write: T.Buffer[(1, 56, 56, 24), "int8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder = T.buffer_decl([301056], dtype='int8', data=input_placeholder.data) - ethosu_write = T.buffer_decl([75264], dtype='int8', data=input_ethosu_write.data) - buffer1 = T.buffer_decl([2848], "uint8") - buffer3 = T.buffer_decl([976], "uint8") + placeholder = T.Buffer([301056], dtype='int8', data=input_placeholder.data) + ethosu_write = T.Buffer([75264], dtype='int8', data=input_ethosu_write.data) + buffer1 = T.Buffer([2848], "uint8") + buffer3 = T.Buffer([976], "uint8") p1_data = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1 = T.buffer_decl([2848], "uint8", data=p1_data) + p1 = T.Buffer([2848], "uint8", data=p1_data) p2_data = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.buffer_decl([976], "uint8", data=p2_data) + p2 = T.Buffer([976], "uint8", data=p2_data) p5_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) - p5 = T.buffer_decl([75264], "int8", data=p5_data) + p5 = T.Buffer([75264], "int8", data=p5_data) p6_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) - p6 = T.buffer_decl([75264], "int8", data=p6_data) + p6 = T.Buffer([75264], "int8", data=p6_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2848, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 976, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p1[2608], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index f205bc3b26ca..d68c806f72d9 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -36,8 +36,8 @@ class SingleEthosUConv2D: def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_conv2d_1: T.Buffer[(1024,), 
"int8"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_4 = T.buffer_decl([1], "uint8") - placeholder_5 = T.buffer_decl([1], "uint8") + placeholder_4 = T.Buffer([1], "uint8") + placeholder_5 = T.Buffer([1], "uint8") # body T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 8, 8, 3, 8, 0, 8, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 8, 8, 16, 8, 0, 8, ethosu_conv2d_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_4[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_5[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="uint8")) # fmt: on @@ -51,10 +51,10 @@ class MultiEthosUConv2D: def main(placeholder_6: T.Buffer[(192,), "int8"], ethosu_conv2d_1: T.Buffer[(512,), "int8"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_9 = T.buffer_decl([1], "uint8") - placeholder_7 = T.buffer_decl([1], "uint8") - placeholder_8 = T.buffer_decl([1], "uint8") - placeholder_5 = T.buffer_decl([1], "uint8") + placeholder_9 = T.Buffer([1], "uint8") + placeholder_7 = T.Buffer([1], "uint8") + placeholder_8 = T.Buffer([1], "uint8") + placeholder_5 = T.Buffer([1], "uint8") # body ethosu_conv2d_2 = T.decl_buffer([1024], "uint8") ethosu_conv2d_3 = T.decl_buffer([2048], "uint8") @@ -73,8 +73,8 @@ class MultiEthosUCopy: def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_conv2d_1: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_5 = T.buffer_decl([1], "int32") - placeholder_4 = T.buffer_decl([1], "uint8") + placeholder_5 = T.Buffer([1], "int32") + placeholder_4 = T.Buffer([1], "uint8") # body placeholder_global = T.decl_buffer([256], "uint8") placeholder_d_global = T.decl_buffer([8], "int32") @@ -90,14 +90,14 @@ def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_conv2d_1: T.Buffer[(20 class WeightStreamOnly: @T.prim_func def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: - buffer = T.buffer_decl([1], "uint8") - buffer_1 = T.buffer_decl([1], "uint8") - buffer_2 = T.buffer_decl([1], "uint8") - buffer_3 = T.buffer_decl([1], "uint8") - buffer_4 = T.buffer_decl([1], "uint8") - buffer_5 = T.buffer_decl([1], "uint8") - buffer_6 = T.buffer_decl([1], "uint8") - buffer_7 = T.buffer_decl([1], "uint8") + buffer = T.Buffer([1], "uint8") + buffer_1 = T.Buffer([1], "uint8") + buffer_2 = T.Buffer([1], "uint8") + buffer_3 = T.Buffer([1], "uint8") + buffer_4 = T.Buffer([1], "uint8") + buffer_5 = T.Buffer([1], "uint8") + buffer_6 = T.Buffer([1], "uint8") + buffer_7 = T.Buffer([1], "uint8") # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True, @@ -136,16 +136,16 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), class MixedRead: @T.prim_func def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: - buffer = T.buffer_decl([1], "uint8") - buffer_1 = T.buffer_decl([1], "uint8") - buffer_2 = T.buffer_decl([1], "uint8") - buffer_3 = T.buffer_decl([1], "uint8") - buffer_4 = T.buffer_decl([1], "uint8") - buffer_5 = T.buffer_decl([1], "uint8") - buffer_6 = T.buffer_decl([1], "uint8") - buffer_7 = T.buffer_decl([1], "uint8") - buffer_8 = T.buffer_decl([1], "uint8") - buffer_9 = T.buffer_decl([1], "uint8") + buffer = T.Buffer([1], "uint8") + buffer_1 = T.Buffer([1], 
"uint8") + buffer_2 = T.Buffer([1], "uint8") + buffer_3 = T.Buffer([1], "uint8") + buffer_4 = T.Buffer([1], "uint8") + buffer_5 = T.Buffer([1], "uint8") + buffer_6 = T.Buffer([1], "uint8") + buffer_7 = T.Buffer([1], "uint8") + buffer_8 = T.Buffer([1], "uint8") + buffer_9 = T.Buffer([1], "uint8") # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True, @@ -161,11 +161,11 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_9.name: buffer_9}}) # body ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) + ethosu_write_1 = T.Buffer([4096], "int8", data=ethosu_write_1_data) placeholder_global_data = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global = T.buffer_decl([80], "uint8", data=placeholder_global_data) + placeholder_global = T.Buffer([80], "uint8", data=placeholder_global_data) placeholder_d_global_data = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) + placeholder_d_global = T.Buffer([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle")) @@ -673,9 +673,9 @@ def populate_ethosu_copy_calls(stmt): class MixedConstantDatatypes: @T.prim_func def main(placeholder_4: T.Buffer[(2048,), "int8"], ethosu_write_1: T.Buffer[(16,), "int8"]) -> None: - buffer = T.buffer_decl([1], "uint8") - buffer_1 = T.buffer_decl([1], "uint8") - buffer_2 = T.buffer_decl([1], "int16") + buffer = T.Buffer([1], "uint8") + buffer_1 = T.Buffer([1], "uint8") + buffer_2 = T.Buffer([1], "int16") # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True, diff --git a/tests/python/relay/aot/test_pass_aot_lower_main.py b/tests/python/relay/aot/test_pass_aot_lower_main.py index 093305203a94..b523e019299c 100644 --- a/tests/python/relay/aot/test_pass_aot_lower_main.py +++ b/tests/python/relay/aot/test_pass_aot_lower_main.py @@ -180,12 +180,12 @@ def func(a: T.handle, output: T.handle) -> None: T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output], "devices": []}) tmp_read = T.buffer_var("uint8", "") # buffer definition - tmp_read_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_read) + tmp_read_1 = T.Buffer([T.uint64(140)], dtype="uint8", data=tmp_read) a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) # body tmp_write: T.Ptr[T.uint8] = output_buffer.data - tmp_write_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_write) + tmp_write_1 = T.Buffer([T.uint64(140)], 
dtype="uint8", data=tmp_write) for i in T.serial(140): tmp_write_1[i] = T.let(tmp_read, a_buffer.data, tmp_read_1[i]) # fmt: on diff --git a/tests/python/unittest/test_lower_build.py b/tests/python/unittest/test_lower_build.py index 665697b84be9..4c188d2f834b 100644 --- a/tests/python/unittest/test_lower_build.py +++ b/tests/python/unittest/test_lower_build.py @@ -60,9 +60,9 @@ def main( ) -> None: # function attr dict T.func_attr({"global_symbol": "main", "from_legacy_te_schedule": True, "tir.noalias": True}) - A_flat = T.buffer_decl([16384], data=A.data) - B_flat = T.buffer_decl([16384], data=B.data) - C_flat = T.buffer_decl([16384], data=C.data) + A_flat = T.Buffer([16384], data=A.data) + B_flat = T.Buffer([16384], data=B.data) + C_flat = T.Buffer([16384], data=C.data) # body for x, y in T.grid(128, 128): C_flat[x * 128 + y] = 0.0 @@ -82,9 +82,9 @@ def main( ) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - A_flat = T.buffer_decl([16384], data=A.data) - B_flat = T.buffer_decl([16384], data=B.data) - C_flat = T.buffer_decl([16384], data=C.data) + A_flat = T.Buffer([16384], data=A.data) + B_flat = T.Buffer([16384], data=B.data) + C_flat = T.Buffer([16384], data=C.data) # body for x, y in T.grid(128, 128): C_flat[x * 128 + y] = 0.0 diff --git a/tests/python/unittest/test_tir_renew_defs.py b/tests/python/unittest/test_tir_renew_defs.py index 28b440a608dc..65f81499bdfd 100644 --- a/tests/python/unittest/test_tir_renew_defs.py +++ b/tests/python/unittest/test_tir_renew_defs.py @@ -136,7 +136,7 @@ def test_undefined_buffer(): def access_alloc(): # Buffer A should be remapped A_data = T.allocate([128], "float16", "global") - A = T.buffer_decl(shape=[128], dtype="float16", data=A_data) + A = T.Buffer(shape=[128], dtype="float16", data=A_data) # check if buffer var also get remapped T.evaluate(A.data) for i in range(128): diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py index 6a75057e72ff..bcb214594cb8 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -1011,9 +1011,9 @@ def cache_write_allocate_const( ): B = T.alloc_buffer([128, 128], dtype="float32") const = T.allocate_const([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8]) - const_1 = T.buffer_decl([8], dtype="float32", data=const) + const_1 = T.Buffer([8], dtype="float32", data=const) const2 = T.allocate_const([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8]) - const_2 = T.buffer_decl([8], dtype="float32", data=const) + const_2 = T.Buffer([8], dtype="float32", data=const) for i, j in T.grid(128, 128): for x in range(8): with T.block("B"): @@ -1037,8 +1037,8 @@ def cache_write_allocate_const_output( A_global = T.alloc_buffer([128, 128], dtype="float32") C_global = T.alloc_buffer([128, 128], dtype="float16") const_2 = T.allocate_const([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8]) - const_1 = T.buffer_decl([8], dtype="float32", data=const_2) - const_2_1 = T.buffer_decl([8], dtype="float32", data=const_2) + const_1 = T.Buffer([8], dtype="float32", data=const_2) + const_2_1 = T.Buffer([8], dtype="float32", data=const_2) const2 = T.allocate_const([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], "float32", [8]) for ax0, ax1 in T.grid(128, 128): with T.block("A_global"): diff --git a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py index 
be229a580f01..113d9f047478 100644 --- a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py +++ b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py @@ -349,7 +349,7 @@ def test_no_normalization_without_commoning(): # ------------------------------------------------- @T.prim_func def func_distributivity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32) -> None: - B = T.buffer_decl((50,), "int32") + B = T.Buffer((50,), "int32") B[i1] = x * (y + z) B[i2] = x * y + x * z @@ -358,7 +358,7 @@ def func_distributivity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.i def func_distributivity_expected( i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32 ) -> None: - B = T.buffer_decl((50,), "int32") + B = T.Buffer((50,), "int32") cse_var_1 = T.var("int32") with T.let(cse_var_1, x * y + x * z): B[i1] = cse_var_1 @@ -367,7 +367,7 @@ def func_distributivity_expected( @T.prim_func def func_associativity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32) -> None: - B = T.buffer_decl((50,), "int32") + B = T.Buffer((50,), "int32") B[i1] = (x + y) + z B[i2] = x + (y + z) @@ -376,7 +376,7 @@ def func_associativity(i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.in def func_associativity_expected( i1: T.int32, i2: T.int32, x: T.int32, y: T.int32, z: T.int32 ) -> None: - B = T.buffer_decl((50,), "int32") + B = T.Buffer((50,), "int32") cse_var_1 = T.var("int32") with T.let(cse_var_1, (x + y) + z): B[i1] = cse_var_1 diff --git a/tests/python/unittest/test_tir_transform_extract_constants.py b/tests/python/unittest/test_tir_transform_extract_constants.py index 5de06e38a557..b3e0aa74f96d 100644 --- a/tests/python/unittest/test_tir_transform_extract_constants.py +++ b/tests/python/unittest/test_tir_transform_extract_constants.py @@ -28,7 +28,7 @@ def constant1(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K = T.buffer_decl(shape=(10), dtype="int32", data=K_data) + K = T.Buffer(shape=(10), dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -37,7 +37,7 @@ def constant2(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K = T.buffer_decl(shape=(10), dtype="int32", data=K_data) + K = T.Buffer(shape=(10), dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -46,7 +46,7 @@ def constant3(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") K_data = T.allocate_const([1, 2, 3, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K = T.buffer_decl(shape=(10), dtype="int32", data=K_data) + K = T.Buffer(shape=(10), dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index 513e04dc2090..12523fbdb2ae 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -41,11 +41,11 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): C[i, j] = B_new[0, j] * 2.0 def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]): - A = T.buffer_decl(256, dtype="float32", data=input_A.data) - C = T.buffer_decl(256, dtype="float32", data=input_C.data) + A = T.Buffer(256, 
dtype="float32", data=input_A.data) + C = T.Buffer(256, dtype="float32", data=input_C.data) for i in T.serial(0, 16): B_new_data = T.allocate([16], "float32", scope="global") - B_new = T.buffer_decl([16], "float32", scope="global", data=B_new_data) + B_new = T.Buffer([16], "float32", scope="global", data=B_new_data) for j in T.serial(0, 16): B_new[j] = A[((i * 16) + j)] + 1.0 for j in T.serial(0, 16): @@ -56,7 +56,7 @@ class TestElementwiseWithoutDeclBuffer(BaseCompare): """2-d buffers are flattened to 1-d Like TestElementwise, but the TIR doesn't have the DeclBuffer - node. The T.buffer_decl declaration applies only during the + node. The T.Buffer declaration applies only during the parsing the TVMScript, and doesn't occur in the TIR itself. In this case, the allocation should be assumed to be targeting flat memory, and should be flattened to a 1-d allocation. @@ -65,18 +65,18 @@ class TestElementwiseWithoutDeclBuffer(BaseCompare): def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): for i in T.serial(0, 16): B_new_data = T.allocate([1, 16], "float32", "global") - B_new = T.buffer_decl([1, 16], "float32", data=B_new_data) + B_new = T.Buffer([1, 16], "float32", data=B_new_data) for j in T.serial(0, 16): B_new[0, j] = A[i, j] + 1.0 for j in T.serial(0, 16): C[i, j] = B_new[0, j] * 2.0 def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]): - A = T.buffer_decl(256, dtype="float32", data=input_A.data) - C = T.buffer_decl(256, dtype="float32", data=input_C.data) + A = T.Buffer(256, dtype="float32", data=input_A.data) + C = T.Buffer(256, dtype="float32", data=input_C.data) for i in T.serial(0, 16): B_new_data = T.allocate([16], "float32", "global") - B_new = T.buffer_decl(16, "float32", data=B_new_data) + B_new = T.Buffer(16, "float32", data=B_new_data) for j in T.serial(0, 16): B_new[j] = A[((i * 16) + j)] + 1.0 for j in T.serial(0, 16): @@ -101,8 +101,8 @@ def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0 def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]): - A = T.buffer_decl(256, dtype="float32", data=input_A.data) - C = T.buffer_decl(256, dtype="float32", data=input_C.data) + A = T.Buffer(256, dtype="float32", data=input_A.data) + C = T.Buffer(256, dtype="float32", data=input_C.data) i0 = T.env_thread("blockIdx.x") i1 = T.env_thread("threadIdx.x") @@ -112,7 +112,7 @@ def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), T.launch_thread(i1, 2) T.launch_thread(i2, 2) B_data = T.allocate([16], "float32", scope="local") - B = T.buffer_decl([16], "float32", scope="local", data=B_data) + B = T.Buffer([16], "float32", scope="local", data=B_data) for j in range(0, 16): B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0 for j in range(0, 16): @@ -136,12 +136,12 @@ def before(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: def expected(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: input_A = T.match_buffer(a, (n, m), "float32") input_C = T.match_buffer(c, (n, m), "float32") - A = T.buffer_decl(n * m, "float32", data=input_A.data) - C = T.buffer_decl(n * m, "float32", data=input_C.data) + A = T.Buffer(n * m, "float32", data=input_A.data) + C = T.Buffer(n * m, "float32", data=input_C.data) for i in range(0, n): B_data = T.allocate([m], "float32", scope="global") - B = T.buffer_decl([m], "float32", scope="global", data=B_data) + B = T.Buffer([m], "float32", scope="global", 
data=B_data) for j in range(0, m): B[j] = A[i * m + j] + 1.0 for j in range(0, m): @@ -160,14 +160,14 @@ def before(A: T.Buffer[(4, 32), "float32"], D: T.Buffer[(4, 32), "float32"]): D[i, j] = C[i, j] * 2.0 def expected(input_A: T.Buffer[(4, 32), "float32"], input_D: T.Buffer[(4, 32), "float32"]): - A = T.buffer_decl(128, "float32", data=input_A.data) - D = T.buffer_decl(128, "float32", data=input_D.data) + A = T.Buffer(128, "float32", data=input_A.data) + D = T.Buffer(128, "float32", data=input_D.data) for i, j in T.grid(4, 32): B_data = T.allocate([128], "float32", scope="global") - B = T.buffer_decl([128], "float32", scope="global", data=B_data) + B = T.Buffer([128], "float32", scope="global", data=B_data) C_data = T.allocate([128], "float32", scope="global") - C = T.buffer_decl([128], "float32", scope="global", data=C_data) + C = T.Buffer([128], "float32", scope="global", data=C_data) B[i * 32 + j] = A[i * 32 + j] + 1.0 C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j] D[i * 32 + j] = C[i * 32 + j] * 2.0 @@ -179,18 +179,18 @@ class TestStrided(BaseCompare): def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): for i0 in T.serial(4): B = T.decl_buffer([4, 17], "float32") - B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) + B_1 = T.Buffer([4, 16], dtype="float32", data=B.data, strides=[17, 1]) for i1, j in T.grid(4, 16): B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0 for i1, j in T.grid(4, 16): C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0 def expected(input_A: T.Buffer[(16, 16), "float32"], input_C: T.Buffer[(16, 16), "float32"]): - A = T.buffer_decl(256, dtype="float32", data=input_A.data) - C = T.buffer_decl(256, dtype="float32", data=input_C.data) + A = T.Buffer(256, dtype="float32", data=input_A.data) + C = T.Buffer(256, dtype="float32", data=input_C.data) for i0 in T.serial(0, 4): B_new_data = T.allocate([68], "float32", scope="global") - B_new = T.buffer_decl([68], "float32", scope="global", data=B_new_data) + B_new = T.Buffer([68], "float32", scope="global", data=B_new_data) for i1 in T.serial(0, 4): for j in T.serial(0, 16): B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0 @@ -207,8 +207,8 @@ def before(A: T.Buffer[10, "bool"], B: T.Buffer[10, "bool"]) -> None: B[i0] = A[i0] def expected(input_A: T.Buffer[10, "bool"], input_B: T.Buffer[10, "bool"]) -> None: - A = T.buffer_decl(10, dtype="int8", data=input_A.data) - B = T.buffer_decl(10, dtype="int8", data=input_B.data) + A = T.Buffer(10, dtype="int8", data=input_A.data) + B = T.Buffer(10, dtype="int8", data=input_B.data) # body for i0 in T.serial(10): B[i0] = T.cast(T.cast(A[i0], "bool"), "int8") @@ -285,9 +285,7 @@ def before(): def expected(): A_data = T.allocate([30, 1001], dtype="float32", scope="global") - A = T.buffer_decl( - [30, 1001], dtype="float32", scope="global", axis_separators=[1], data=A_data - ) + A = T.Buffer([30, 1001], dtype="float32", scope="global", axis_separators=[1], data=A_data) for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13): T.evaluate(A[i0 * 15 + i1 * 5 + i2, i3 * 143 + i4 * 13 + i5]) diff --git a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py index d75fb2b03e39..b7bd6cb46fd6 100644 --- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py +++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py @@ -207,7 +207,7 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": 
"main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_2 = T.Buffer([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body @@ -239,7 +239,7 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_2 = T.Buffer([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body diff --git a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py index eb5ed08bb5af..d32714938424 100644 --- a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py +++ b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py @@ -146,13 +146,13 @@ def before_func(): vthread = T.env_thread("vthread") T.launch_thread(vthread, 4) B_data = T.allocate([4], "int32", scope="shared") - B = T.buffer_decl([4], "int32", data=B_data, scope="shared") + B = T.Buffer([4], "int32", data=B_data, scope="shared") B[0:4] = T.broadcast(vthread, 4) @T.prim_func def expected_func(): B_data = T.allocate([16], "int32", scope="shared") - B = T.buffer_decl([16], "int32", data=B_data, scope="shared") + B = T.Buffer([16], "int32", data=B_data, scope="shared") # The indices for B should each be a single Ramp node, and # should not be the sum of a Ramp and Broadcast node. 
B[T.Mul(0, 4) : T.Mul(0, 4) + 4] = T.broadcast(0, 4) @@ -175,13 +175,13 @@ def before_func(): vthread = T.env_thread("vthread") T.launch_thread(vthread, 4) B_data = T.allocate([4], "int32", "shared") - B = T.buffer_decl([4], "int32", data=B_data, scope="shared") + B = T.Buffer([4], "int32", data=B_data, scope="shared") B[0:4] = T.broadcast(vthread, 4) @T.prim_func def expected_func(): B_data = T.allocate([4], "int32x4", "shared") - B = T.buffer_decl([4], "int32x4", data=B_data, scope="shared") + B = T.Buffer([4], "int32x4", data=B_data, scope="shared") B[T.Mul(0, 4) / 4] = T.broadcast(0, 4) B[T.Mul(1, 4) / 4] = T.broadcast(1, 4) B[T.Mul(2, 4) / 4] = T.broadcast(2, 4) diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py index 7dd8e794103e..1a40f52140ee 100644 --- a/tests/python/unittest/test_tir_transform_loop_partition.py +++ b/tests/python/unittest/test_tir_transform_loop_partition.py @@ -583,10 +583,10 @@ def partitioned_concat_3( placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"], T_concat: T.Buffer[(1, 128, 28, 28), "int8"], ) -> None: - placeholder_flat = T.buffer_decl([50176], "int8", data=placeholder.data) - placeholder_1_flat = T.buffer_decl([25088], "int8", data=placeholder_1.data) - placeholder_2_flat = T.buffer_decl([25088], "int8", data=placeholder_2.data) - T_concat_flat = T.buffer_decl([100352], "int8", data=T_concat.data) + placeholder_flat = T.Buffer([50176], "int8", data=placeholder.data) + placeholder_1_flat = T.Buffer([25088], "int8", data=placeholder_1.data) + placeholder_2_flat = T.Buffer([25088], "int8", data=placeholder_2.data) + T_concat_flat = T.Buffer([100352], "int8", data=T_concat.data) for i1, i2, i3 in T.grid(64, 28, 28): T_concat_flat[i1 * 784 + i2 * 28 + i3] = placeholder_flat[i1 * 784 + i2 * 28 + i3] for i1, i2, i3 in T.grid(32, 28, 28): @@ -602,10 +602,10 @@ def concat_func_3( placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"], T_concat: T.Buffer[(1, 128, 28, 28), "int8"], ) -> None: - placeholder_flat = T.buffer_decl([50176], "int8", data=placeholder.data) - placeholder_1_flat = T.buffer_decl([25088], "int8", data=placeholder_1.data) - placeholder_2_flat = T.buffer_decl([25088], "int8", data=placeholder_2.data) - T_concat_flat = T.buffer_decl([100352], "int8", data=T_concat.data) + placeholder_flat = T.Buffer([50176], "int8", data=placeholder.data) + placeholder_1_flat = T.Buffer([25088], "int8", data=placeholder_1.data) + placeholder_2_flat = T.Buffer([25088], "int8", data=placeholder_2.data) + T_concat_flat = T.Buffer([100352], "int8", data=T_concat.data) for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}): for i2, i3 in T.grid(28, 28): if 96 <= i1: @@ -632,8 +632,8 @@ def test_loop_partition_unroll_hint(): def main( A_arg: T.Buffer[(1, 3, 224, 224), "int8"], B_arg: T.Buffer[(1, 224, 7, 16), "int8"] ) -> None: - A = T.buffer_decl(150528, "int8", data=A_arg.data) - B = T.buffer_decl(25088, "int8", data=B_arg.data) + A = T.Buffer(150528, "int8", data=A_arg.data) + B = T.Buffer(25088, "int8", data=B_arg.data) for ax0 in T.serial( 112, annotations={"pragma_loop_partition_hint": True}, @@ -646,8 +646,8 @@ def main( def partitioned_main( A_arg: T.Buffer[(1, 3, 224, 224), "int8"], B_arg: T.Buffer[(1, 224, 7, 16), "int8"] ) -> None: - A = T.buffer_decl(150528, dtype="int8", data=A_arg.data) - B = T.buffer_decl(25088, dtype="int8", data=B_arg.data) + A = T.Buffer(150528, dtype="int8", data=A_arg.data) + B = T.Buffer(25088, dtype="int8", data=B_arg.data) # body for ax1, 
ax2, ax3 in T.grid(224, 7, 16): if 3 <= ax2 and ax3 < 3: @@ -706,11 +706,11 @@ def main(): @T.prim_func def partitioned_main(): placeholder_0_dm = T.allocate([16384], "int8", "global") - placeholder_0_dm_1 = T.buffer_decl([16384], dtype="int8", data=placeholder_0_dm) + placeholder_0_dm_1 = T.Buffer([16384], dtype="int8", data=placeholder_0_dm) for i3_0 in T.unroll(2): for i2_0 in T.unroll(2): pad_temp = T.allocate([4096], "int8", "global") - pad_temp_1 = T.buffer_decl([4096], dtype="int8", data=pad_temp) + pad_temp_1 = T.Buffer([4096], dtype="int8", data=pad_temp) for ax0, ax1, ax2 in T.grid(16, 16, 16): if 6 <= i2_0 * 4 + ax0 and 6 <= i3_0 * 4 + ax1: pad_temp_1[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[ @@ -718,7 +718,7 @@ def partitioned_main(): ] for i2_0 in T.unroll(2): pad_temp_2 = T.allocate([4096], "int8", "global") - pad_temp_3 = T.buffer_decl([4096], dtype="int8", data=pad_temp_2) + pad_temp_3 = T.Buffer([4096], dtype="int8", data=pad_temp_2) for ax0, ax1, ax2 in T.grid(16, 16, 16): if 6 <= i2_0 * 4 + ax0: pad_temp_3[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[ @@ -727,7 +727,7 @@ def partitioned_main(): for i3_0 in T.unroll(2): for i2_0 in T.unroll(2): pad_temp_4 = T.allocate([4096], "int8", "global") - pad_temp_5 = T.buffer_decl([4096], dtype="int8", data=pad_temp_4) + pad_temp_5 = T.Buffer([4096], dtype="int8", data=pad_temp_4) for ax0, ax1, ax2 in T.grid(16, 16, 16): if 6 <= i2_0 * 4 + ax0 and i3_0 * 4 + ax1 < 14: pad_temp_5[ax0 * 256 + ax1 * 16 + ax2] = placeholder_0_dm_1[ diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py index 635badb847bd..5cdc272440e7 100644 --- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py +++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py @@ -28,9 +28,9 @@ class Before: def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 512, 256), "float32"], conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data) - weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data) - conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data) + inputs_flat = T.Buffer([8192], dtype="float32", data=inputs.data) + weight_flat = T.Buffer([2097152], dtype="float32", data=weight.data) + conv2d_transpose_nhwc_flat = T.Buffer([16384], dtype="float32", data=conv2d_transpose_nhwc.data) # var definition threadIdx_x = T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") @@ -59,9 +59,9 @@ class After: def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 512, 256), "float32"], conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data) - weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data) - conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data) + inputs_flat = T.Buffer([8192], dtype="float32", data=inputs.data) + weight_flat = T.Buffer([2097152], dtype="float32", data=weight.data) + conv2d_transpose_nhwc_flat = T.Buffer([16384], dtype="float32", data=conv2d_transpose_nhwc.data) # var definition threadIdx_x = 
T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") @@ -93,9 +93,9 @@ def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"], weight: T.Buffer[(4, 4, 51 # var definition threadIdx_x = T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") - inputs_flat = T.buffer_decl([8192], dtype="float32", data=inputs.data) - weight_flat = T.buffer_decl([2097152], dtype="float32", data=weight.data) - conv2d_transpose_nhwc_flat = T.buffer_decl([16384], dtype="float32", data=conv2d_transpose_nhwc.data) + inputs_flat = T.Buffer([8192], dtype="float32", data=inputs.data) + weight_flat = T.Buffer([2097152], dtype="float32", data=weight.data) + conv2d_transpose_nhwc_flat = T.Buffer([16384], dtype="float32", data=conv2d_transpose_nhwc.data) # body T.launch_thread(blockIdx_x, 64) conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local") diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py index 533a835e0f9c..2ed2e6ec6d71 100644 --- a/tests/python/unittest/test_tir_transform_storage_rewrite.py +++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py @@ -655,7 +655,7 @@ def test_access_in_let_value(): def func(A: T.Buffer[(8,), "float32"]): for i in range(8): B_data = T.allocate((1,), "float32", "global") - B = T.buffer_decl(shape=[1], dtype="float32", data=B_data) + B = T.Buffer(shape=[1], dtype="float32", data=B_data) B[0] = 3.14 x: T.float32 = T.exp(B[0], dtype="float32") A[i] = (x + 1.0) / (x - 1.0) @@ -663,7 +663,7 @@ def func(A: T.Buffer[(8,), "float32"]): @T.prim_func def func_rewritten(A: T.Buffer[(8,), "float32"]) -> None: B_data = T.allocate((1,), "float32", "global") - B = T.buffer_decl(shape=[1], dtype="float32", data=B_data) + B = T.Buffer(shape=[1], dtype="float32", data=B_data) for i in range(8): B[0] = 3.14 x: T.float32 = T.exp(B[0], dtype="float32") @@ -690,12 +690,12 @@ class TestLetBufferRewrite(BaseCompare): def before() -> None: A_data: T.Ptr[T.int32] = T.call_extern("dummy_func", dtype="handle") - A = T.buffer_decl([8], "int32", data=A_data) + A = T.Buffer([8], "int32", data=A_data) A[0:8] = T.broadcast(42, 8) def expected() -> None: A_data: T.Ptr[T.int32x8] = T.call_extern("dummy_func", dtype="handle") - A = T.buffer_decl([1], "int32x8", data=A_data) + A = T.Buffer([1], "int32x8", data=A_data) A[0] = T.broadcast(42, 8) @@ -708,7 +708,7 @@ def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): dtype="float32", scope="global", ) - B = T.buffer_decl( + B = T.Buffer( [16, 16], dtype="float32", axis_separators=[1], @@ -719,7 +719,7 @@ def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): dtype="float32", scope="global", ) - C = T.buffer_decl( + C = T.Buffer( [16, 16], dtype="float32", axis_separators=[1], @@ -741,8 +741,8 @@ def expected(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]) dtype="float32", scope="global", ) - B = T.buffer_decl([16, 16], dtype="float32", axis_separators=[1], data=B_data) - C = T.buffer_decl( + B = T.Buffer([16, 16], dtype="float32", axis_separators=[1], data=B_data) + C = T.Buffer( [16, 16], dtype="float32", axis_separators=[1], @@ -777,7 +777,7 @@ def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): dtype="float32", scope="global", ) - B = T.buffer_decl( + B = T.Buffer( [16, 16], dtype="float32", axis_separators=[1], @@ -788,7 +788,7 @@ def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): dtype="float32", 
scope="global", ) - C = T.buffer_decl( + C = T.Buffer( [20, 20], dtype="float32", axis_separators=[1], diff --git a/tests/python/unittest/test_tir_transform_thread_sync.py b/tests/python/unittest/test_tir_transform_thread_sync.py index b2a0581d6980..b7caf04d659c 100644 --- a/tests/python/unittest/test_tir_transform_thread_sync.py +++ b/tests/python/unittest/test_tir_transform_thread_sync.py @@ -101,7 +101,7 @@ def test_sync_read_thread_id_independent_location(): def func(p0_arg: T.Buffer[(1, 2, 1, 1), "float32"], p1: T.Buffer[2, "float32"]) -> None: threadIdx_x = T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") - p0 = T.buffer_decl([2], dtype="float32", data=p0_arg.data) + p0 = T.Buffer([2], dtype="float32", data=p0_arg.data) result_local = T.alloc_buffer([1], dtype="float32", scope="local") temp_shared = T.alloc_buffer([1], dtype="float32", scope="shared") T.launch_thread(blockIdx_x, 8) diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index 25e895573551..6145c39b876d 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -92,13 +92,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7_data = T.allocate([157323], "int16", "global") - PaddedInput_7 = T.buffer_decl(shape=[157323], dtype="int16", data=PaddedInput_7_data) + PaddedInput_7 = T.Buffer(shape=[157323], dtype="int16", data=PaddedInput_7_data) for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): Conv2dOutput_7_data = T.allocate([64], "int32", "global") - Conv2dOutput_7 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_7_data) + Conv2dOutput_7 = T.Buffer(shape=[64], dtype="int32", data=Conv2dOutput_7_data) for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ -114,7 +114,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2_data = T.allocate([200704], "uint8", "global") - tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", data=tensor_2_data) + tensor_2 = T.Buffer(shape=[200704], dtype="uint8", data=tensor_2_data) for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -163,7 +163,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: fast_memory_6_buffer_var = T.match_buffer(fast_memory_6_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) slow_memory_7_buffer_var = T.match_buffer(slow_memory_7_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - tensor_2_let = T.buffer_decl([200704], dtype="uint8") + tensor_2_let = T.Buffer([200704], dtype="uint8") with T.let(tensor_2_let.data, T.address_of(fast_memory_6_buffer_var[0], dtype="handle")): for 
ax0_ax1_fused_4, ax2_4 in T.grid(56, 56): for ax3_init in T.serial(0, 64): @@ -193,12 +193,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde fast_memory_4_buffer_var = T.match_buffer(fast_memory_4_var, [200704], dtype="uint8", strides=[1], elem_offset=0, align=16) slow_memory_5_buffer_var = T.match_buffer(slow_memory_5_var, [1418528], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - PaddedInput_7_let = T.buffer_decl([157323], "int16") + PaddedInput_7_let = T.Buffer([157323], "int16") with T.let(PaddedInput_7_let.data, T.address_of(slow_memory_5_buffer_var[802816], dtype="handle")): for i0_i1_fused_7, i2_7, i3_7 in T.grid(229, 229, 3): PaddedInput_7_let[i0_i1_fused_7 * 687 + i2_7 * 3 + i3_7] = T.if_then_else(2 <= i0_i1_fused_7 and i0_i1_fused_7 < 226 and 2 <= i2_7 and i2_7 < 226, placeholder_65[i0_i1_fused_7 * 672 + i2_7 * 3 + i3_7 - 1350], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - Conv2dOutput_7_let = T.buffer_decl([64], "int32") + Conv2dOutput_7_let = T.Buffer([64], "int32") with T.let(Conv2dOutput_7_let.data, T.address_of(fast_memory_4_buffer_var[0], dtype="handle")): for ff_3 in T.serial(0, 64): Conv2dOutput_7_let[ff_3] = 0 @@ -272,12 +272,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16") # body PaddedInput_1_data = T.allocate([379456], "int16", "global") - PaddedInput_1 = T.buffer_decl(shape=[379456], dtype="int16", data=PaddedInput_1_data) + PaddedInput_1 = T.Buffer(shape=[379456], dtype="int16", data=PaddedInput_1_data) for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64): PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625): Conv2dOutput_1_data = T.allocate([64], "int32", "global") - Conv2dOutput_1 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_1_data) + Conv2dOutput_1 = T.Buffer(shape=[64], dtype="int32", data=Conv2dOutput_1_data) for ff_1 in T.serial(0, 64): Conv2dOutput_1[ff_1] = 0 for ry, rx, rc_1 in T.grid(3, 3, 64): @@ -295,12 +295,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_add_1 = T.match_buffer(T_add, [407], dtype="int32") # body PaddedInput_2_data = T.allocate([360000], "int16", "global") - PaddedInput_2 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_2_data) + PaddedInput_2 = T.Buffer(shape=[360000], dtype="int16", data=PaddedInput_2_data) for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64): PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625): Conv2dOutput_2_data = T.allocate([64], "int32", "global") - Conv2dOutput_2 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_2_data) + Conv2dOutput_2 = T.Buffer(shape=[64], dtype="int32", data=Conv2dOutput_2_data) for ax3_outer_1 in T.serial(0, 4): for ff_2 in T.serial(0, 64): Conv2dOutput_2[ff_2] = 0 @@ -320,12 +320,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8") # body PaddedInput_3_data = T.allocate([360000], "int16", "global") - PaddedInput_3 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_3_data) + PaddedInput_3 = 
T.Buffer(shape=[360000], dtype="int16", data=PaddedInput_3_data) for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64): PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625): Conv2dOutput_3_data = T.allocate([64], "int32", "global") - Conv2dOutput_3 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_3_data) + Conv2dOutput_3 = T.Buffer(shape=[64], dtype="int32", data=Conv2dOutput_3_data) for ax3_outer_2 in T.serial(0, 4): for ff_3 in T.serial(0, 64): Conv2dOutput_3[ff_3] = 0 @@ -361,12 +361,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place T_cast_3 = T.match_buffer(T_cast_2, [215], dtype="int16") # body PaddedInput_data = T.allocate([360000], "int16", "global") - PaddedInput = T.buffer_decl([360000], "int16", data=PaddedInput_data) + PaddedInput = T.Buffer([360000], "int16", data=PaddedInput_data) for i0_i1_fused, i2, i3 in T.grid(75, 75, 64): PaddedInput[i0_i1_fused * 4800 + i2 * 64 + i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3] for ax0_ax1_fused_ax2_fused in T.serial(0, 5625): Conv2dOutput_data = T.allocate([64], "int32", "global") - Conv2dOutput = T.buffer_decl([64], "int32", data=Conv2dOutput_data) + Conv2dOutput = T.Buffer([64], "int32", data=Conv2dOutput_data) for ff in T.serial(0, 64): Conv2dOutput[ff] = 0 for rc in T.serial(0, 64): @@ -398,12 +398,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8") global_workspace_5_buffer_var = T.match_buffer(global_workspace_5_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - PaddedInput_3_let = T.buffer_decl([360000], 'int16') + PaddedInput_3_let = T.Buffer([360000], 'int16') with T.let(PaddedInput_3_let.data, T.address_of(global_workspace_5_buffer_var[6480000], dtype="handle")): for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64): PaddedInput_3_let[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625): - Conv2dOutput_3_let = T.buffer_decl([64], 'int32') + Conv2dOutput_3_let = T.Buffer([64], 'int32') with T.let(Conv2dOutput_3_let.data, T.address_of(global_workspace_5_buffer_var[7200000], dtype="handle")): for ax3_outer_2 in T.serial(0, 4): for ff_3 in T.serial(0, 64): @@ -421,12 +421,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_add_1 = T.match_buffer(T_add, [407], dtype="int32") global_workspace_4_buffer_var = T.match_buffer(global_workspace_4_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - PaddedInput_2_let = T.buffer_decl([360000], "int16") + PaddedInput_2_let = T.Buffer([360000], "int16") with T.let(PaddedInput_2_let.data, T.address_of(global_workspace_4_buffer_var[7200000], dtype="handle")): for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64): PaddedInput_2_let[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625): - Conv2dOutput_2_let = T.buffer_decl([64], 'int32') + Conv2dOutput_2_let = T.Buffer([64], 'int32') with T.let(Conv2dOutput_2_let.data, T.address_of(global_workspace_4_buffer_var[7920000], dtype="handle")): for ax3_outer_1 in T.serial(0, 4): for ff_2 in T.serial(0, 64): @@ -444,12 +444,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place T_cast_3 = 
T.match_buffer(T_cast_2, [215], dtype="int16") global_workspace_2_buffer_var = T.match_buffer(global_workspace_2_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - PaddedInput_let = T.buffer_decl([360000], "int16") + PaddedInput_let = T.Buffer([360000], "int16") with T.let(PaddedInput_let.data, T.address_of(global_workspace_2_buffer_var[7200000], dtype="handle")): for i0_i1_fused, i2, i3 in T.grid(75, 75, 64): PaddedInput_let[i0_i1_fused * 4800 + i2 * 64 + i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3] for ax0_ax1_fused_ax2_fused in T.serial(0, 5625): - Conv2dOutput_let = T.buffer_decl([64], "int32") + Conv2dOutput_let = T.Buffer([64], "int32") with T.let(Conv2dOutput_let.data, T.address_of(global_workspace_2_buffer_var[7920000], dtype="handle")): for ff in T.serial(0, 64): Conv2dOutput_let[ff] = 0 @@ -466,12 +466,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16") global_workspace_3_buffer_var = T.match_buffer(global_workspace_3_var, [7920256], dtype="uint8", strides=[1], elem_offset=0, align=16) # body - PaddedInput_1_let = T.buffer_decl([379456], "int16") + PaddedInput_1_let = T.Buffer([379456], "int16") with T.let(PaddedInput_1_let.data, T.address_of(global_workspace_3_buffer_var[0], dtype="handle")): for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64): PaddedInput_1_let[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625): - Conv2dOutput_1_let = T.buffer_decl([64], "int32") + Conv2dOutput_1_let = T.Buffer([64], "int32") with T.let(Conv2dOutput_1_let.data, T.address_of(global_workspace_3_buffer_var[7200000], dtype="handle")): for ff_1 in T.serial(0, 64): Conv2dOutput_1_let[ff_1] = 0 @@ -546,7 +546,7 @@ def tensor_intrin_primfunc() -> None: ) ) - dense = T.buffer_decl([10], "int32", data=dense_data) + dense = T.Buffer([10], "int32", data=dense_data) dense[0] = T.q_multiply_shift(dense[0], 1608879842, 31, -7, dtype="int32") @T.prim_func @@ -561,7 +561,7 @@ def tensor_intrin_primfunc(global_workspace_1_var: T.Ptr[T.uint8]) -> None: global_workspace_1_buffer_var = T.match_buffer( global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16 ) - dense_let = T.buffer_decl([10], "int32") + dense_let = T.Buffer([10], "int32") with T.let(dense_let.data, T.address_of(global_workspace_1_buffer_var[0], dtype="handle")): T.evaluate( T.call_extern( diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 7d542c7bc7bd..85d2e808b3d8 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -53,9 +53,9 @@ def test_ir_builder_tir_primfunc_complete(): with T.prim_func(): T.arg("a", T.handle()) T.arg("b", T.var("int64")) - T.arg("c", T.buffer_decl((128, 128), "float32")) + T.arg("c", T.Buffer((128, 128), "float32")) d = T.arg("d", T.handle()) - e = T.arg("e", T.buffer_decl((1024,), "int8")) + e = T.arg("e", T.Buffer((1024,), "int8")) T.func_attr({"key": "value"}) T.func_ret(tvm.ir.PrimType("int64")) buffer_d = T.match_buffer(d, (64, 64), "int64") @@ -120,10 +120,10 @@ def test_ir_builder_tir_block_base(): def test_ir_builder_tir_block_complete(): with IRBuilder() as ib: a = T.var("int64", "a") - b = 
T.buffer_decl((128, 128), "float32") - c = T.buffer_decl((128, 128), "float32") + b = T.Buffer((128, 128), "float32") + c = T.Buffer((128, 128), "float32") d = T.var("int32", "d") - e = T.buffer_decl((128, 128), "float32") + e = T.Buffer((128, 128), "float32") f = T.var("int32", "f") with T.block("block"): T.where(a > 1) @@ -298,7 +298,7 @@ def test_ir_builder_tir_let(): def test_ir_builder_tir_realize(): - buffer_a = T.buffer_decl((128, 128), "float32") + buffer_a = T.Buffer((128, 128), "float32") with IRBuilder() as ib: with T.realize(buffer_a[0:128, 0:128], "test_storage_scope", True): T.evaluate(0) @@ -417,7 +417,7 @@ def test_ir_builder_tir_if_then_else(): def test_ir_builder_tir_buffer_store(): - buffer_a = T.buffer_decl((10, 10), "float32") + buffer_a = T.Buffer((10, 10), "float32") i = T.var("int32", "x") with IRBuilder() as ib: T.buffer_store(buffer_a, 0.1, [0, i]) @@ -434,7 +434,7 @@ def test_ir_builder_tir_buffer_store(): def test_ir_builder_tir_prefetch(): with IRBuilder() as ib: - buffer_a = T.buffer_decl((128, 128), "float32") + buffer_a = T.Buffer((128, 128), "float32") T.prefetch(buffer_a, []) # the prefetch generated by IRBuilder @@ -469,7 +469,7 @@ def test_ir_builder_tir_decl_buffer(): ir_actual = ib.get() # the expected decl_buffer - buffer = T.buffer_decl((128, 128), "float32") + buffer = T.Buffer((128, 128), "float32") ir_expected = tir.Allocate( buffer.data, "float32", diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index 71da86bff763..ec69c54396c3 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -166,7 +166,7 @@ def test_match_buffer_region(): _assert_print( obj, """ -src = T.buffer_decl((128, 128)) +src = T.Buffer((128, 128)) tgt = T.match_buffer(src[64:128, 64:128], (64, 64)) """, ) @@ -176,7 +176,7 @@ def test_buffer(): a = tir.decl_buffer((128, 128), "float16", name="A") _assert_print( a, - """A = T.buffer_decl((128, 128), "float16") + """A = T.Buffer((128, 128), "float16") A""", ) @@ -193,7 +193,7 @@ def test_buffer_region(): _assert_print( obj, """ -src = T.buffer_decl((128, 128)) +src = T.Buffer((128, 128)) src[64:128, 64:128] """, ) @@ -205,7 +205,7 @@ def test_buffer_load(): _assert_print( obj, """ -A = T.buffer_decl((128, 128), "float16") +A = T.Buffer((128, 128), "float16") A[128, 128] """, ) @@ -219,7 +219,7 @@ def test_buffer_store(): _assert_print( obj, """ -A = T.buffer_decl((128, 128), "float16") +A = T.Buffer((128, 128), "float16") A[128, 128] = A[128, 128] + T.float16(1) """, ) @@ -380,7 +380,7 @@ def test_prefetch(): _assert_print( obj, """ -A = T.buffer_decl((128, 128), "float16") +A = T.Buffer((128, 128), "float16") T.prefetch(A, [T.Range(0, 64), T.Range(0, 64)]) """, ) @@ -439,7 +439,7 @@ def test_buffer_realize(): _assert_print( obj, """ -A = T.buffer_decl((128, 128)) +A = T.Buffer((128, 128)) with T.realize(A[0:128, 0:128], "test_storage_scope"): T.evaluate(0) """, diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 0a6a2a26380c..4300c4bbade9 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -34,8 +34,8 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "mmult", "tir.noalias": True}) # buffer definition - C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=64, offset_factor=1) - packedB = 
T.buffer_decl([32, 1024, 32], elem_offset=0, align=64, offset_factor=1) + C_global = T.Buffer([1024, 1024], elem_offset=0, align=64, offset_factor=1) + packedB = T.Buffer([32, 1024, 32], elem_offset=0, align=64, offset_factor=1) A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=64, offset_factor=1) B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=64, offset_factor=1) @@ -95,15 +95,13 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: C_1 = T.match_buffer(C, [16384], elem_offset=0, align=64, offset_factor=1) # body packedB_data = T.allocate([32768], "float32", "global") - packedB = T.buffer_decl( - shape=[32768], dtype="float32", scope="global", data=packedB_data - ) + packedB = T.Buffer(shape=[32768], dtype="float32", scope="global", data=packedB_data) for x in T.parallel(0, 32): for y in T.serial(0, 1024): packedB[T.ramp(((x * 32768) + (y * 32)), 1, 32)] = B_1[y, T.ramp(x * 32, 1, 32)] for x_outer in T.parallel(0, 32): C_global_data = T.allocate([1024], "float32", "global") - C_global = T.buffer_decl( + C_global = T.Buffer( shape=[1024], dtype="float32", scope="global", data=C_global_data ) for y_outer in T.serial(0, 32): @@ -196,8 +194,8 @@ def mmult( # buffer definition buf_type_ids = T.match_buffer(arg_type_ids, [3], dtype="int32") - packedB = T.buffer_decl([32768], dtype="float32") - C_global = T.buffer_decl([1024], dtype="float32") + packedB = T.Buffer([32768], dtype="float32") + C_global = T.Buffer([1024], dtype="float32") # var definition # C_global = T.buffer_var("float32", "global") # packedB = T.buffer_var("float32", "global") @@ -212,29 +210,29 @@ def mmult( A_data: T.Ptr[T.int32] = T.tvm_struct_get(arg0, 0, 1, dtype="handle") T.attr(A_data, "storage_alignment", 128) - A = T.buffer_decl([1024 * 1024], dtype="int32", data=A_data) + A = T.Buffer([1024 * 1024], dtype="int32", data=A_data) buf0_shape_data: T.Ptr[T.int32] = T.tvm_struct_get(arg0, 0, 2, dtype="handle") - buf0_shape = T.buffer_decl([2], dtype="int32", data=buf0_shape_data) + buf0_shape = T.Buffer([2], dtype="int32", data=buf0_shape_data) buf0_strides_data: T.Ptr[T.int32] = T.tvm_struct_get(arg0, 0, 3, dtype="handle") - buf0_strides = T.buffer_decl([2], dtype="int32", data=buf0_strides_data) + buf0_strides = T.Buffer([2], dtype="int32", data=buf0_strides_data) dev_id: T.int32 = T.tvm_struct_get(arg0, 0, 9, dtype="int32") B_data: T.Ptr[T.int32] = T.tvm_struct_get(arg1, 0, 1, dtype="handle") T.attr(B_data, "storage_alignment", 128) - B = T.buffer_decl([1024 * 1024], dtype="int32", data=B_data) + B = T.Buffer([1024 * 1024], dtype="int32", data=B_data) buf1_shape_data: T.Ptr[T.int32] = T.tvm_struct_get(arg1, 0, 2, dtype="handle") - buf1_shape = T.buffer_decl([2], dtype="int32", data=buf1_shape_data) + buf1_shape = T.Buffer([2], dtype="int32", data=buf1_shape_data) buf1_strides_data: T.Ptr[T.int32] = T.tvm_struct_get(arg1, 0, 3, dtype="handle") - buf1_strides = T.buffer_decl([2], dtype="int32", data=buf1_strides_data) + buf1_strides = T.Buffer([2], dtype="int32", data=buf1_strides_data) C_data: T.Ptr[T.int32] = T.tvm_struct_get(arg2, 0, 1, dtype="handle") T.attr(C_data, "storage_alignment", 128) - C = T.buffer_decl([1024 * 1024], dtype="int32", data=C_data) + C = T.Buffer([1024 * 1024], dtype="int32", data=C_data) buf2_shape_data: T.Ptr[T.int32] = T.tvm_struct_get(arg2, 0, 2, dtype="handle") - buf2_shape = T.buffer_decl([2], dtype="int32", data=buf2_shape_data) + buf2_shape = T.Buffer([2], dtype="int32", 
data=buf2_shape_data) buf2_strides_data: T.Ptr[T.int32] = T.tvm_struct_get(arg2, 0, 3, dtype="handle") - buf2_strides = T.buffer_decl([2], dtype="int32", data=buf2_strides_data) + buf2_strides = T.Buffer([2], dtype="int32", data=buf2_strides_data) assert (((arg0_code == 3) or (arg0_code == 13)) or (arg0_code == 7)) or ( arg0_code == 4 @@ -489,42 +487,34 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: ty = T.env_thread("threadIdx.y") tz = T.env_thread("threadIdx.z") # buffer definition - Apad_shared = T.buffer_decl( + Apad_shared = T.Buffer( [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) - Apad_shared_wmma_matrix_a = T.buffer_decl( + Apad_shared_wmma_matrix_a = T.Buffer( [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) - BA = T.buffer_decl( - [16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256 - ) - BB = T.buffer_decl( - [16, 16], dtype="float16", scope="wmma.matrix_b", align=32, offset_factor=256 - ) - BC = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) - Conv_wmma_accumulator = T.buffer_decl( + BA = T.Buffer([16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256) + BB = T.Buffer([16, 16], dtype="float16", scope="wmma.matrix_b", align=32, offset_factor=256) + BC = T.Buffer([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) + Conv_wmma_accumulator = T.Buffer( [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1 ) - W_shared = T.buffer_decl( + W_shared = T.Buffer( [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) - W_shared_wmma_matrix_b = T.buffer_decl( + W_shared_wmma_matrix_b = T.Buffer( [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) - buffer = T.buffer_decl( - [16, 16], dtype="float16", scope="shared", align=32, offset_factor=256 - ) - buffer_1 = T.buffer_decl( + buffer = T.Buffer([16, 16], dtype="float16", scope="shared", align=32, offset_factor=256) + buffer_1 = T.Buffer( [16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256 ) - buffer_2 = T.buffer_decl( - [16, 16], dtype="float16", scope="shared", align=32, offset_factor=256 - ) - buffer_3 = T.buffer_decl( + buffer_2 = T.Buffer([16, 16], dtype="float16", scope="shared", align=32, offset_factor=256) + buffer_3 = T.Buffer( [16, 16], dtype="float16", scope="wmma.matrix_b", align=32, offset_factor=256 ) - buffer_4 = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) - buffer_5 = T.buffer_decl([16, 16], align=32, offset_factor=256) + buffer_4 = T.Buffer([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) + buffer_5 = T.Buffer([16, 16], align=32, offset_factor=256) A_1 = T.match_buffer( A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) @@ -949,9 +939,9 @@ def func( # function attr dict T.func_attr({"global_symbol": "default_function", "tir.noalias": True}) # body - A_1 = T.buffer_decl([12845056], dtype="float16", data=A.data) - W_1 = T.buffer_decl([1179648], dtype="float16", data=W.data) - Conv_1 = T.buffer_decl([25690112], data=Conv.data) + A_1 = T.Buffer([12845056], dtype="float16", data=A.data) + W_1 = T.Buffer([1179648], dtype="float16", data=W.data) + Conv_1 = T.Buffer([25690112], data=Conv.data) bx = T.env_thread("blockIdx.x") by = T.env_thread("blockIdx.y") bz = T.env_thread("blockIdx.z") @@ -960,21 +950,21 @@ def func( tz = T.env_thread("threadIdx.z") 
T.launch_thread(bz, 196) Conv_wmma_accumulator_data = T.allocate([2048], "float32", "wmma.accumulator") - Conv_wmma_accumulator = T.buffer_decl( + Conv_wmma_accumulator = T.Buffer( shape=[2048], dtype="float32", scope="wmma.accumulator", data=Conv_wmma_accumulator_data ) Apad_shared_data = T.allocate([12288], "float16", "shared") - Apad_shared = T.buffer_decl( + Apad_shared = T.Buffer( shape=[12288], dtype="float16", scope="shared", data=Apad_shared_data ) W_shared_data = T.allocate([12288], "float16", "shared") - W_shared = T.buffer_decl(shape=[12288], dtype="float16", scope="shared", data=W_shared_data) + W_shared = T.Buffer(shape=[12288], dtype="float16", scope="shared", data=W_shared_data) Apad_shared_wmma_matrix_a_data = T.allocate([512], "float16", "wmma.matrix_a") - Apad_shared_wmma_matrix_a = T.buffer_decl( + Apad_shared_wmma_matrix_a = T.Buffer( shape=[512], dtype="float16", scope="wmma.matrix_a", data=Apad_shared_wmma_matrix_a_data ) W_shared_wmma_matrix_b_data = T.allocate([1024], "float16", "wmma.matrix_b") - W_shared_wmma_matrix_b = T.buffer_decl( + W_shared_wmma_matrix_b = T.Buffer( shape=[1024], dtype="float16", scope="wmma.matrix_b", data=W_shared_wmma_matrix_b_data ) T.launch_thread(bx, 2) @@ -2253,7 +2243,7 @@ def opt_conv_tensorcore_mod_host( ) # body stack_tcode_data: T.Ptr[T.int32] = T.tvm_stack_alloca("arg_tcode", 10, dtype="handle") - stack_tcode = T.buffer_decl([9], "int32", data=stack_tcode_data) + stack_tcode = T.Buffer([9], "int32", data=stack_tcode_data) stack_value: T.handle = T.tvm_stack_alloca("arg_value", 10, dtype="handle") assert num_args == 3, "default_function: num_args should be 3" arg0: T.handle = T.tvm_struct_get(args, 0, 12, dtype="handle") @@ -2266,25 +2256,25 @@ def opt_conv_tensorcore_mod_host( A: T.handle = T.tvm_struct_get(arg0, 0, 1, dtype="handle") T.attr(A, "storage_alignment", 128) arg0_shape_data: T.Ptr[T.int64] = T.tvm_struct_get(arg0, 0, 2, dtype="handle") - arg0_shape = T.buffer_decl([6], "int64", data=arg0_shape_data) + arg0_shape = T.Buffer([6], "int64", data=arg0_shape_data) arg0_strides_data: T.Ptr[T.int64] = T.tvm_struct_get(arg0, 0, 3, dtype="handle") - arg0_strides = T.buffer_decl([6], "int64", data=arg0_strides_data) + arg0_strides = T.Buffer([6], "int64", data=arg0_strides_data) dev_id: T.int32 = T.tvm_struct_get(arg0, 0, 9, dtype="int32") W: T.handle = T.tvm_struct_get(arg1, 0, 1, dtype="handle") T.attr(W, "storage_alignment", 128) arg1_shape_data: T.Ptr[T.int64] = T.tvm_struct_get(arg1, 0, 2, dtype="handle") - arg1_shape = T.buffer_decl([6], "int64", data=arg1_shape_data) + arg1_shape = T.Buffer([6], "int64", data=arg1_shape_data) arg1_strides_data: T.Ptr[T.int64] = T.tvm_struct_get(arg1, 0, 3, dtype="handle") - arg1_strides = T.buffer_decl([6], "int64", data=arg1_strides_data) + arg1_strides = T.Buffer([6], "int64", data=arg1_strides_data) Conv: T.handle = T.tvm_struct_get(arg2, 0, 1, dtype="handle") T.attr(Conv, "storage_alignment", 128) arg2_shape_data: T.Ptr[T.int64] = T.tvm_struct_get(arg2, 0, 2, dtype="handle") - arg2_shape = T.buffer_decl([6], "int64", data=arg2_shape_data) + arg2_shape = T.Buffer([6], "int64", data=arg2_shape_data) arg2_strides_data: T.Ptr[T.int64] = T.tvm_struct_get(arg2, 0, 3, dtype="handle") - arg2_strides = T.buffer_decl([6], "int64", data=arg2_strides_data) + arg2_strides = T.Buffer([6], "int64", data=arg2_strides_data) assert (((arg0_code == 3) or (arg0_code == 13)) or (arg0_code == 7)) or ( arg0_code == 4 @@ -2499,7 +2489,7 @@ def vthread_func(a: T.handle, c: T.handle) -> None: 
T.launch_thread(i1, 2) T.launch_thread(i2, 2) B_data = T.allocate([16], "float32", "local") - B = T.buffer_decl(shape=[16], dtype="float32", scope="local", data=B_data) + B = T.Buffer(shape=[16], dtype="float32", scope="local", data=B_data) for j in range(16): B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + T.float32(1) for j in range(16): @@ -2813,12 +2803,12 @@ def B(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((10), "int32") K1_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K1 = T.buffer_decl(shape=[10], dtype="int32", data=K1_data) + K1 = T.Buffer(shape=[10], dtype="int32", data=K1_data) for x in T.serial(0, 10): B[x] = A[x] + K1[x] K2_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K2 = T.buffer_decl(shape=[10], dtype="int32", data=K2_data) + K2 = T.Buffer(shape=[10], dtype="int32", data=K2_data) for x in T.serial(0, 10): B[x] = B[x] + K2[x] @@ -2835,7 +2825,7 @@ def constant(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (10), "int32") B = T.alloc_buffer((10), "int32") K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) - K = T.buffer_decl(shape=[10], dtype="int32", data=K_data) + K = T.Buffer(shape=[10], dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -2980,7 +2970,7 @@ def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.han T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2_data = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"}) - tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", scope="global", data=tensor_2_data) + tensor_2 = T.Buffer(shape=[200704], dtype="uint8", scope="global", data=tensor_2_data) for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -3007,7 +2997,7 @@ def comm_reducer_single_reduce_group(a: T.handle, b: T.handle) -> None: for i in T.serial(0, 128): T.launch_thread(threadIdx_x, 128) reduce_temp0_data = T.allocate([1], "float32", "local") - reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) + reduce_temp0 = T.Buffer(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) with T.attr(T.comm_reducer(lambda x, y: x + y, [T.float32(0)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")): T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle")) @@ -3023,7 +3013,7 @@ def comm_reducer_multiple_reduce_groups(a: T.handle, b: T.handle) -> None: for i in T.serial(0, 128): T.launch_thread(threadIdx_x, 128) reduce_temp0_data = T.allocate([1], "float32", "local") - reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) + reduce_temp0 = T.Buffer(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) with T.attr(T.comm_reducer(lambda x0, x1, y0, y1: (T.Select((x1 >= y1), x0, y0), T.Select((x1 >= y1), x1, y1)), [T.int32(-1), T.min_value("float32")]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")): T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle")) @@ -3033,10 +3023,10 @@ def comm_reducer_multiple_reduce_groups(a: T.handle, b: T.handle) -> None: def multiple_commreducer(): @T.prim_func def multiple_commreducer() -> None: - normal_reduce_temp0 = T.buffer_decl([1], dtype="float32", strides=[1], scope="local") - 
normal_reduce_temp1 = T.buffer_decl([1], dtype="float32", strides=[1], scope="local") - reduce_temp0 = T.buffer_decl([1], dtype="float32", strides=[1], scope="local") - reduce_temp1 = T.buffer_decl([1], dtype="float32", strides=[1], scope="local") + normal_reduce_temp0 = T.Buffer([1], dtype="float32", strides=[1], scope="local") + normal_reduce_temp1 = T.Buffer([1], dtype="float32", strides=[1], scope="local") + reduce_temp0 = T.Buffer([1], dtype="float32", strides=[1], scope="local") + reduce_temp1 = T.Buffer([1], dtype="float32", strides=[1], scope="local") for ax0_1 in T.thread_binding(0, 32, thread="threadIdx.x"): with T.block("T_softmax_maxelem_cross_thread_reduction"): T.attr(T.comm_reducer(lambda x, y: T.max(x, y), [T.min_value("float32")]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")) @@ -3163,7 +3153,7 @@ def func_T_ptr_let_statement( ) -> None: # The T.Ptr declaration in the parameter list should parse # correctly, and should be usable as the data pointer in a buffer. - arg_type_ids = T.buffer_decl([2], dtype="int32", data=arg_type_ids_handle) + arg_type_ids = T.Buffer([2], dtype="int32", data=arg_type_ids_handle) arg0: T.handle = T.tvm_struct_get(args, 0, 12, dtype="handle") arg1: T.handle = T.tvm_struct_get(args, 1, 12, dtype="handle") @@ -3177,9 +3167,9 @@ def func_T_ptr_let_statement( # this function. It should only be defined after the data pointer # has been defined, and should not be hoisted into the header of # the function as other buffer_decl statements can be. - A = T.buffer_decl([1024], dtype="float32", data=A_data) + A = T.Buffer([1024], dtype="float32", data=A_data) B_data: T.Ptr[T.float32] = T.tvm_struct_get(arg1, 0, 1, dtype="handle") - B = T.buffer_decl([1024], dtype="float32", data=B_data) + B = T.Buffer([1024], dtype="float32", data=B_data) B[0] = A[0] @@ -3190,7 +3180,7 @@ def func_T_ptr_allocate(): @T.prim_func def func_T_ptr_allocate() -> None: A_data = T.allocate([1024], "float32", "global") - A = T.buffer_decl(shape=[1024], dtype="float32", scope="global", data=A_data) + A = T.Buffer(shape=[1024], dtype="float32", scope="global", data=A_data) A[0] = 0.0 return func_T_ptr_allocate @@ -3282,9 +3272,9 @@ def pointer_type(): @T.prim_func def func_with_ptr_type_annotations(x: T.Ptr[T.int32], y: T.Ptr[T.int32, "shared"]): xx_data = T.allocate([16], "int32", "global") - xx = T.buffer_decl(shape=[16], dtype="int32", scope="global", data=xx_data) + xx = T.Buffer(shape=[16], dtype="int32", scope="global", data=xx_data) yy_data = T.allocate([16], "int32", "shared") - yy = T.buffer_decl(shape=[16], dtype="int32", scope="shared", data=yy_data) + yy = T.Buffer(shape=[16], dtype="int32", scope="shared", data=yy_data) a: T.Ptr[T.int32] = T.address_of(xx[0], dtype="handle") b: T.Ptr[T.int32, "shared"] = T.address_of(yy[0], dtype="handle") T.evaluate(T.call_extern("copy", a, b, dtype="")) diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py index 02b18e7e7c44..35f9e6c2e635 100644 --- a/tests/python/unittest/test_tvmscript_syntax_sugar.py +++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py @@ -152,18 +152,6 @@ def func_with_sugar(A: T.Buffer[16, "float32"]): assert_structural_equal(func_no_sugar, func_with_sugar) -# match buffer failed case -def test_match_buffer_no_kwargs_failed(): - with pytest.raises(ValueError) as e: - - @T.prim_func - def elementwise_buffer_no_kwargs_failed( - a: T.Buffer[(128, 128, 128, 128)], - b: T.Buffer[(128, 128, 128, 128)], - ) -> None: - pass - - # dynamic 
shape gemm @T.prim_func def gemm_dyn_shape(a: T.handle, b: T.handle, c: T.handle): From 4fc4ad007a6236f9fc7f86226085a8187a7d6c43 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 25 Jan 2023 03:06:11 -0800 Subject: [PATCH 223/286] [TVMScript] Consolidate folder structure (#13841) This PR consolidates the parser folder into Relay, as it is used specifically for the Relay IR. This is the last step for the TVMScript refactoring, where it established the default text format is the roundtrippable TVMScript e-DSL. --- CMakeLists.txt | 2 +- include/tvm/ir/diagnostic.h | 2 - include/tvm/ir/expr.h | 2 +- include/tvm/ir/module.h | 6 +- include/tvm/ir/{span.h => source_map.h} | 96 +++++++++++++- include/tvm/ir/type.h | 2 +- include/tvm/parser/source_map.h | 119 ------------------ include/tvm/relay/base.h | 2 +- include/tvm/relay/error.h | 3 +- include/tvm/{parser => relay}/parser.h | 16 +-- include/tvm/runtime/metadata_base.h | 5 +- python/tvm/ir/base.py | 42 ++++--- python/tvm/parser.py | 47 +++++++ python/tvm/relay/__init__.py | 3 + .../_ffi_api.py => relay/_ffi_api_parser.py} | 5 +- .../{parser/__init__.py => relay/parser.py} | 22 ++-- rust/tvm/src/ir/module.rs | 4 +- src/ir/diagnostic.cc | 4 +- src/ir/module.cc | 15 +-- src/ir/{span.cc => source_map.cc} | 75 ++++++++++- src/ir/transform.cc | 9 +- src/parser/source_map.cc | 97 -------------- src/relay/backend/utils.cc | 2 +- src/relay/backend/vm/compiler.cc | 2 +- src/relay/ir/base.cc | 17 --- src/relay/ir/function.cc | 12 ++ src/{ => relay}/parser/meta_ref.cc | 4 +- src/{ => relay}/parser/meta_ref.h | 14 +-- src/{ => relay}/parser/op_table.h | 15 ++- src/{ => relay}/parser/parser.cc | 51 ++++---- src/{ => relay}/parser/span_check.cc | 6 +- src/{ => relay}/parser/span_check.h | 11 +- src/{ => relay}/parser/token.h | 29 +++-- src/{ => relay}/parser/tokenizer.h | 33 ++--- src/relay/printer/relay_text_printer.cc | 2 +- src/runtime/profiling.cc | 1 - .../relay/backend/aot/aot_lower_main_test.cc | 4 +- .../relay/collage/candidate_partition_test.cc | 4 +- .../cpp/relay/collage/partition_rule_test.cc | 4 +- tests/cpp/relay/df_pattern_rewrite_test.cc | 4 +- tests/cpp/relay/ir/indexed_graph_test.cc | 6 +- .../relay/transforms/device_domains_test.cc | 4 +- tests/cpp/relay/with_fields_test.cc | 6 +- 43 files changed, 395 insertions(+), 414 deletions(-) rename include/tvm/ir/{span.h => source_map.h} (59%) delete mode 100644 include/tvm/parser/source_map.h rename include/tvm/{parser => relay}/parser.h (86%) create mode 100644 python/tvm/parser.py rename python/tvm/{parser/_ffi_api.py => relay/_ffi_api_parser.py} (91%) rename python/tvm/{parser/__init__.py => relay/parser.py} (71%) rename src/ir/{span.cc => source_map.cc} (61%) delete mode 100644 src/parser/source_map.cc rename src/{ => relay}/parser/meta_ref.cc (98%) rename src/{ => relay}/parser/meta_ref.h (92%) rename src/{ => relay}/parser/op_table.h (93%) rename src/{ => relay}/parser/parser.cc (99%) rename src/{ => relay}/parser/span_check.cc (96%) rename src/{ => relay}/parser/span_check.h (93%) rename src/{ => relay}/parser/token.h (93%) rename src/{ => relay}/parser/tokenizer.h (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 46be2d52fd90..278afbe23563 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,7 +287,6 @@ tvm_file_glob(GLOB_RECURSE COMPILER_SRCS src/tir/*.cc src/topi/*.cc src/driver/*.cc - src/parser/*.cc src/support/*.cc src/script/*.cc ) @@ -317,6 +316,7 @@ tvm_file_glob(GLOB RELAY_BACKEND_SRCS tvm_file_glob(GLOB_RECURSE RELAY_IR_SRCS src/relay/ir/*.cc src/relay/printer/*.cc 
+ src/relay/parser/*.cc ) tvm_file_glob(GLOB_RECURSE RELAY_QNN_SRCS src/relay/qnn/*.cc diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 41130a5be0aa..3b2407491f26 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -27,14 +27,12 @@ #define TVM_IR_DIAGNOSTIC_H_ #include -#include #include #include namespace tvm { -using tvm::parser::SourceMap; using tvm::runtime::TypedPackedFunc; /*! \brief The diagnostic level, controls the printing of the message. */ diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index 78c09e81b16f..c8531c88465a 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -24,7 +24,7 @@ #ifndef TVM_IR_EXPR_H_ #define TVM_IR_EXPR_H_ -#include +#include #include #include #include diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index 0a5bac182fd9..fdb44b11887c 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -27,8 +27,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -60,7 +60,7 @@ class IRModuleNode : public Object { /*! \brief A map from global type vars to ADT type data. */ Map type_definitions; /*! \brief The source map for the module. */ - parser::SourceMap source_map; + SourceMap source_map; /* \brief Additional attributes storing meta-data about the module. */ DictAttrs attrs; /*! @@ -357,7 +357,7 @@ class IRModule : public ObjectRef { */ TVM_DLL explicit IRModule(Map functions, Map type_definitions = {}, - std::unordered_set import_set = {}, parser::SourceMap map = {}, + std::unordered_set import_set = {}, SourceMap map = {}, DictAttrs attrs = {}); /*! \brief default constructor */ diff --git a/include/tvm/ir/span.h b/include/tvm/ir/source_map.h similarity index 59% rename from include/tvm/ir/span.h rename to include/tvm/ir/source_map.h index b53ca2921fe7..536099f3114b 100644 --- a/include/tvm/ir/span.h +++ b/include/tvm/ir/source_map.h @@ -16,20 +16,25 @@ * specific language governing permissions and limitations * under the License. */ - /*! - * \file tvm/ir/span.h - * \brief Span information for debugging purposes. + * \file source_map.h + * \brief A map from source names to source code. */ -#ifndef TVM_IR_SPAN_H_ -#define TVM_IR_SPAN_H_ +#ifndef TVM_IR_SOURCE_MAP_H_ +#define TVM_IR_SOURCE_MAP_H_ #include #include +#include +#include +#include #include +#include +#include namespace tvm { + /*! * \brief The source name in the Span * \sa SourceNameNode, Span @@ -122,5 +127,84 @@ class Span : public ObjectRef { TVM_DEFINE_OBJECT_REF_METHODS(Span, ObjectRef, SpanNode); }; +/*! \brief A program source in any language. + * + * Could represent the source from an ML framework or a source + * representing a tvm::IRModule. + */ +class Source; + +class SourceNode : public Object { + public: + /*! \brief The source name. */ + SourceName source_name; + + /*! \brief The raw source. */ + String source; + + /*! \brief A mapping of line breaks into the raw source. */ + std::vector> line_map; + + // override attr visitor + void VisitAttrs(AttrVisitor* v) { + v->Visit("source_name", &source_name); + v->Visit("source", &source); + } + + static constexpr const char* _type_key = "Source"; + TVM_DECLARE_FINAL_OBJECT_INFO(SourceNode, Object); +}; + +class Source : public ObjectRef { + public: + TVM_DLL Source(SourceName src_name, std::string source); + TVM_DLL tvm::String GetLine(int line); + + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Source, ObjectRef, SourceNode); +}; + +/*! + * \brief A mapping from a unique source name to source fragment. 
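+ * Consumed by the diagnostic subsystem when rendering the source fragment that produced an error.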
+ */
+class SourceMap;
+/*!
+ * \brief Stores locations in frontend source that generated a node.
+ */
+class SourceMapNode : public Object {
+ public:
+  /*! \brief The source mapping. */
+  Map<SourceName, Source> source_map;
+
+  // override attr visitor
+  void VisitAttrs(AttrVisitor* v) { v->Visit("source_map", &source_map); }
+
+  bool SEqualReduce(const SourceMapNode* other, SEqualReducer equal) const {
+    return equal(source_map, other->source_map);
+  }
+
+  static constexpr const char* _type_key = "SourceMap";
+  TVM_DECLARE_FINAL_OBJECT_INFO(SourceMapNode, Object);
+};
+
+class SourceMap : public ObjectRef {
+ public:
+  explicit SourceMap(Map<SourceName, Source> source_map);
+
+  explicit SourceMap(std::initializer_list<std::pair<SourceName, Source>> source_map)
+      : SourceMap(Map<SourceName, Source>(source_map)) {}
+
+  SourceMap() : SourceMap(Map<SourceName, Source>()) {}
+
+  void Add(const Source& source);
+
+  SourceMapNode* operator->() {
+    ICHECK(get() != nullptr);
+    return static_cast<SourceMapNode*>(get_mutable());
+  }
+
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(SourceMap, ObjectRef, SourceMapNode);
+};
+
 }  // namespace tvm
-#endif  // TVM_IR_SPAN_H_
+
+#endif  // TVM_IR_SOURCE_MAP_H_
diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h
index 62328f6a074a..c6baf5e08be3 100644
--- a/include/tvm/ir/type.h
+++ b/include/tvm/ir/type.h
@@ -49,7 +49,7 @@
 #ifndef TVM_IR_TYPE_H_
 #define TVM_IR_TYPE_H_

-#include <tvm/ir/span.h>
+#include <tvm/ir/source_map.h>
 #include <tvm/node/node.h>
 #include
 #include
diff --git a/include/tvm/parser/source_map.h b/include/tvm/parser/source_map.h
deleted file mode 100644
index a160c22a2a2f..000000000000
--- a/include/tvm/parser/source_map.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*!
- * \file source_map.h
- * \brief A map from source names to source code.
- */
-#ifndef TVM_PARSER_SOURCE_MAP_H_
-#define TVM_PARSER_SOURCE_MAP_H_
-
-#include <tvm/ir/span.h>
-#include
-#include
-
-#include <fstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace tvm {
-namespace parser {
-
-/*! \brief A program source in any language.
- *
- * Could represent the source from an ML framework or a source
- * representing a tvm::IRModule.
- */
-class Source;
-
-class SourceNode : public Object {
- public:
-  /*! \brief The source name. */
-  SourceName source_name;
-
-  /*! \brief The raw source. */
-  String source;
-
-  /*! \brief A mapping of line breaks into the raw source. */
-  std::vector<std::pair<int, int>> line_map;
-
-  // override attr visitor
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("source_name", &source_name);
-    v->Visit("source", &source);
-  }
-
-  static constexpr const char* _type_key = "Source";
-  TVM_DECLARE_FINAL_OBJECT_INFO(SourceNode, Object);
-};
-
-class Source : public ObjectRef {
- public:
-  TVM_DLL Source(SourceName src_name, std::string source);
-  TVM_DLL tvm::String GetLine(int line);
-
-  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Source, ObjectRef, SourceNode);
-};
-
-/*!
- * \brief A mapping from a unique source name to source fragment. - */ -class SourceMap; -/*! - * \brief Stores locations in frontend source that generated a node. - */ -class SourceMapNode : public Object { - public: - /*! \brief The source mapping. */ - Map source_map; - - // override attr visitor - void VisitAttrs(AttrVisitor* v) { v->Visit("source_map", &source_map); } - - bool SEqualReduce(const SourceMapNode* other, SEqualReducer equal) const { - return equal(source_map, other->source_map); - } - - static constexpr const char* _type_key = "SourceMap"; - TVM_DECLARE_FINAL_OBJECT_INFO(SourceMapNode, Object); -}; - -class SourceMap : public ObjectRef { - public: - TVM_DLL SourceMap(Map source_map); - - TVM_DLL SourceMap(std::initializer_list> source_map) - : SourceMap(Map(source_map)) {} - - TVM_DLL SourceMap() : SourceMap(Map()) {} - - void Add(const Source& source); - - SourceMapNode* operator->() { - ICHECK(get() != nullptr); - return static_cast(get_mutable()); - } - - TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(SourceMap, ObjectRef, SourceMapNode); -}; - -} // namespace parser -} // namespace tvm - -#endif // TVM_PARSER_SOURCE_MAP_H_ diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index 2825bcfc659a..a66b8044998b 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -24,7 +24,7 @@ #ifndef TVM_RELAY_BASE_H_ #define TVM_RELAY_BASE_H_ -#include +#include #include #include diff --git a/include/tvm/relay/error.h b/include/tvm/relay/error.h index be34e2b8ae1a..abe8278f2f5d 100644 --- a/include/tvm/relay/error.h +++ b/include/tvm/relay/error.h @@ -20,7 +20,6 @@ #define TVM_RELAY_ERROR_H_ #include -#include #include #include @@ -31,7 +30,7 @@ namespace tvm { namespace relay { /*! * \brief A wrapper around std::stringstream to build error. - * + *include/tvm/ir/type.h * Can be consumed by CompileError to construct an error. * * \code diff --git a/include/tvm/parser/parser.h b/include/tvm/relay/parser.h similarity index 86% rename from include/tvm/parser/parser.h rename to include/tvm/relay/parser.h index 0a73e1a2a532..6e33e7873f60 100644 --- a/include/tvm/parser/parser.h +++ b/include/tvm/relay/parser.h @@ -16,13 +16,9 @@ * specific language governing permissions and limitations * under the License. */ +#ifndef TVM_RELAY_PARSER_H_ +#define TVM_RELAY_PARSER_H_ -#ifndef TVM_PARSER_PARSER_H_ -#define TVM_PARSER_PARSER_H_ -/*! - * \file include/tvm/parser/parser.h - * \brief A parser for TVM IR. - */ #include #include #include @@ -32,7 +28,7 @@ #include namespace tvm { -namespace parser { +namespace relay { using MetaTable = Map>; @@ -45,9 +41,9 @@ IRModule ParseModule(const std::string& file_name, const std::string& file_conte * for all Relay sub-expressions. This improves error and debugging diagnostics downstream for * modules constructed programaticaly rather than textually. 
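 * Note that the reconstructed spans refer to the pretty-printed text rather than the original input.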
*/ -transform::Pass AnnotateSpans(); +tvm::transform::Pass AnnotateSpans(); -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_PARSER_H_ +#endif // TVM_RELAY_PARSER_H_ diff --git a/include/tvm/runtime/metadata_base.h b/include/tvm/runtime/metadata_base.h index 698f56d46d28..ca412a3b615c 100644 --- a/include/tvm/runtime/metadata_base.h +++ b/include/tvm/runtime/metadata_base.h @@ -24,7 +24,10 @@ #ifndef TVM_RUNTIME_METADATA_BASE_H_ #define TVM_RUNTIME_METADATA_BASE_H_ -#include +#include +#include +#include +#include #include #include diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py index b84a83d55843..5df529b0532f 100644 --- a/python/tvm/ir/base.py +++ b/python/tvm/ir/base.py @@ -17,17 +17,23 @@ """Common base structures.""" import tvm._ffi import tvm.error -import tvm.runtime._ffi_node_api -from tvm.runtime import Object +from tvm._ffi import get_global_func, register_object +from tvm.runtime import Object, _ffi_node_api from . import _ffi_api, json_compact class Node(Object): - """Base class of all IR Nodes, implements astext function.""" + """Base class of all IR Nodes.""" -@tvm._ffi.register_object("SourceName") +@register_object("SourceMap") +class SourceMap(Object): + def add(self, name, content): + return get_global_func("SourceMapAdd")(self, name, content) + + +@register_object("SourceName") class SourceName(Object): """A identifier for a source location. @@ -38,10 +44,10 @@ class SourceName(Object): """ def __init__(self, name): - self.__init_handle_by_constructor__(_ffi_api.SourceName, name) + self.__init_handle_by_constructor__(_ffi_api.SourceName, name) # type: ignore # pylint: disable=no-member -@tvm._ffi.register_object("Span") +@register_object("Span") class Span(Object): """Specifies a location in a source program. @@ -59,11 +65,11 @@ class Span(Object): def __init__(self, source_name, line, end_line, column, end_column): self.__init_handle_by_constructor__( - _ffi_api.Span, source_name, line, end_line, column, end_column + _ffi_api.Span, source_name, line, end_line, column, end_column # type: ignore # pylint: disable=no-member ) -@tvm._ffi.register_object +@register_object class EnvFunc(Object): """Environment function. @@ -71,11 +77,11 @@ class EnvFunc(Object): """ def __call__(self, *args): - return _ffi_api.EnvFuncCall(self, *args) + return _ffi_api.EnvFuncCall(self, *args) # type: ignore # pylint: disable=no-member @property def func(self): - return _ffi_api.EnvFuncGetPackedFunc(self) + return _ffi_api.EnvFuncGetPackedFunc(self) # type: ignore # pylint: disable=no-member @staticmethod def get(name): @@ -86,7 +92,7 @@ def get(name): name : str The name of the function. """ - return _ffi_api.EnvFuncGet(name) + return _ffi_api.EnvFuncGet(name) # type: ignore # pylint: disable=no-member def load_json(json_str) -> Object: @@ -104,10 +110,10 @@ def load_json(json_str) -> Object: """ try: - return tvm.runtime._ffi_node_api.LoadJSON(json_str) + return _ffi_node_api.LoadJSON(json_str) except tvm.error.TVMError: json_str = json_compact.upgrade_json(json_str) - return tvm.runtime._ffi_node_api.LoadJSON(json_str) + return _ffi_node_api.LoadJSON(json_str) def save_json(node) -> str: @@ -123,7 +129,7 @@ def save_json(node) -> str: json_str : str Saved json string. 
""" - return tvm.runtime._ffi_node_api.SaveJSON(node) + return _ffi_node_api.SaveJSON(node) def structural_equal(lhs, rhs, map_free_vars=False): @@ -175,7 +181,7 @@ def structural_equal(lhs, rhs, map_free_vars=False): """ lhs = tvm.runtime.convert(lhs) rhs = tvm.runtime.convert(rhs) - return bool(tvm.runtime._ffi_node_api.StructuralEqual(lhs, rhs, False, map_free_vars)) + return bool(_ffi_node_api.StructuralEqual(lhs, rhs, False, map_free_vars)) # type: ignore # pylint: disable=no-member def get_first_structural_mismatch(lhs, rhs, map_free_vars=False): @@ -201,7 +207,7 @@ def get_first_structural_mismatch(lhs, rhs, map_free_vars=False): """ lhs = tvm.runtime.convert(lhs) rhs = tvm.runtime.convert(rhs) - mismatch = tvm.runtime._ffi_node_api.GetFirstStructuralMismatch(lhs, rhs, map_free_vars) + mismatch = _ffi_node_api.GetFirstStructuralMismatch(lhs, rhs, map_free_vars) # type: ignore # pylint: disable=no-member if mismatch is None: return None else: @@ -233,7 +239,7 @@ def assert_structural_equal(lhs, rhs, map_free_vars=False): """ lhs = tvm.runtime.convert(lhs) rhs = tvm.runtime.convert(rhs) - tvm.runtime._ffi_node_api.StructuralEqual(lhs, rhs, True, map_free_vars) + _ffi_node_api.StructuralEqual(lhs, rhs, True, map_free_vars) # type: ignore # pylint: disable=no-member def structural_hash(node, map_free_vars=False): @@ -275,4 +281,4 @@ def structural_hash(node, map_free_vars=False): -------- structrual_equal """ - return tvm.runtime._ffi_node_api.StructuralHash(node, map_free_vars) + return _ffi_node_api.StructuralHash(node, map_free_vars) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/parser.py b/python/tvm/parser.py new file mode 100644 index 000000000000..63c40deb2069 --- /dev/null +++ b/python/tvm/parser.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name +"""The legacy TVM parser """ +# pylint: disable=import-outside-toplevel + + +def parse(*args, **kwargs): + """Deprecated, use `tvm.relay.parse` instead""" + from tvm.relay import parse as _impl + + return _impl(*args, **kwargs) + + +def parse_expr(*args, **kwargs): + """Deprecated, use `tvm.relay.parse_expr` instead""" + from tvm.relay import parse_expr as _impl + + return _impl(*args, **kwargs) + + +def fromtext(*args, **kwargs): + """Deprecated, use `tvm.relay.fromtext` instead""" + from tvm.relay import fromtext as _impl + + return _impl(*args, **kwargs) + + +def SpanCheck(*args, **kwargs): + """Deprecated, use `tvm.relay.SpanCheck` instead""" + from tvm.relay import SpanCheck as _impl + + return _impl(*args, **kwargs) diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 5e5d1d5f18d8..02eec18d3013 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -65,6 +65,9 @@ # Load Memory Passes from .transform import memory_plan +# Parser +from .parser import parse, parse_expr, fromtext, SpanCheck + # Required to traverse large programs setrecursionlimit(10000) diff --git a/python/tvm/parser/_ffi_api.py b/python/tvm/relay/_ffi_api_parser.py similarity index 91% rename from python/tvm/parser/_ffi_api.py rename to python/tvm/relay/_ffi_api_parser.py index 7fa3b78b72bb..731b926b5655 100644 --- a/python/tvm/parser/_ffi_api.py +++ b/python/tvm/relay/_ffi_api_parser.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""FFI APIs for tvm.ir""" +"""FFI APIs for Relay parser.""" import tvm._ffi - -tvm._ffi._init_api("parser", __name__) +tvm._ffi._init_api("relay.parser", __name__) diff --git a/python/tvm/parser/__init__.py b/python/tvm/relay/parser.py similarity index 71% rename from python/tvm/parser/__init__.py rename to python/tvm/relay/parser.py index d75ad16ebab2..5e5f00a90eea 100644 --- a/python/tvm/parser/__init__.py +++ b/python/tvm/relay/parser.py @@ -15,25 +15,23 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name -"""The under development unified IR parsing infrastructure.""" -from .. import _ffi, Object -from . import _ffi_api - - -@_ffi.register_object("SourceMap") -class SourceMap(Object): - def add(self, name, content): - return _ffi.get_global_func("SourceMapAdd")(self, name, content) +"""The relay parser.""" +from . 
import _ffi_api_parser def parse(source, source_name="from_string", init_module=None, init_meta_table=None): if init_meta_table is None: init_meta_table = {} - return _ffi_api.ParseModuleInContext(source_name, source, init_module, init_meta_table) + return _ffi_api_parser.ParseModuleInContext( # type: ignore # pylint: disable=no-member + source_name, + source, + init_module, + init_meta_table, + ) def parse_expr(source): - return _ffi_api.ParseExpr("string", source) + return _ffi_api_parser.ParseExpr("string", source) # type: ignore # pylint: disable=no-member def fromtext(source, source_name="from_string"): @@ -42,4 +40,4 @@ def fromtext(source, source_name="from_string"): def SpanCheck(): """A debugging utility for reporting missing span information.""" - return _ffi_api.SpanCheck() + return _ffi_api_parser.SpanCheck() # type: ignore # pylint: disable=no-member diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs index ea257af1ebc0..8f71a8be2c7c 100644 --- a/rust/tvm/src/ir/module.rs +++ b/rust/tvm/src/ir/module.rs @@ -57,9 +57,9 @@ pub struct IRModuleNode { external! { // Parser functions - #[name("parser.ParseModule")] + #[name("relay.parser.ParseModule")] fn parse_module(file_name: TVMString, source: TVMString) -> IRModule; - #[name("parser.ParseExpr")] + #[name("relay.parser.ParseExpr")] fn parse_expression(file_name: TVMString, source: TVMString) -> IRModule; #[name("ir.IRModule")] fn module_new(funcs: Map, types: Map) -> IRModule; diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc index 336575a93e97..6687a28d8c84 100644 --- a/src/ir/diagnostic.cc +++ b/src/ir/diagnostic.cc @@ -22,14 +22,12 @@ * \brief Implementation of DiagnosticContext and friends. */ #include -#include +#include #include namespace tvm { -using tvm::parser::Source; - // failed to check to argument arg0.dims[0] != 0 /* Diagnostic */ diff --git a/src/ir/module.cc b/src/ir/module.cc index b6923cd1e60d..22c6faf3d69d 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -16,16 +16,14 @@ * specific language governing permissions and limitations * under the License. */ - /*! * \file module.cc - * \brief The global module in Relay. + * \brief The global module in TVM. 
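+ *        Implements IRModule construction, function lookup and import resolution.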
 */
#include <tvm/ir/module.h>
#include
#include
#include
-#include <tvm/parser/parser.h>
#include
#include

@@ -36,8 +34,7 @@ namespace tvm {

IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,
                   tvm::Map<GlobalTypeVar, TypeData> type_definitions,
-                  std::unordered_set<String> import_set, parser::SourceMap source_map,
-                  DictAttrs attrs) {
+                  std::unordered_set<String> import_set, SourceMap source_map, DictAttrs attrs) {
  auto n = make_object<IRModuleNode>();
  n->functions = std::move(functions);
  n->type_definitions = std::move(type_definitions);
@@ -322,12 +319,14 @@ IRModule IRModule::FromExpr(const RelayExpr& expr, const Map<GlobalVar, BaseFunc>& global_funcs,

void IRModuleNode::Import(const String& path) {
+  static const auto* f = runtime::Registry::Get("relay.parser.ParseModule");
+  ICHECK(f != nullptr) << "ValueError: Relay parser is not available";
  if (this->import_set_.count(path) == 0) {
    this->import_set_.insert(path);
    std::fstream src_file(path, std::fstream::in);
    std::string file_contents{std::istreambuf_iterator<char>(src_file),
                              std::istreambuf_iterator<char>()};
-    auto mod_to_import = parser::ParseModule(path, file_contents, GetRef<IRModule>(this));
+    auto mod_to_import = (*f)(path, file_contents, GetRef<IRModule>(this));
    Update(mod_to_import);
  }
}
@@ -342,7 +341,9 @@ void IRModuleNode::ImportFromStd(const String& path) {

std::unordered_set<String> IRModuleNode::Imports() const { return this->import_set_; }

IRModule IRModule::FromText(const String& text, const String& source_path) {
-  return tvm::parser::ParseModule(source_path, text);
+  static const auto* f = runtime::Registry::Get("relay.parser.ParseModule");
+  ICHECK(f != nullptr) << "ValueError: Relay parser is not available";
+  return (*f)(source_path, text, Optional<IRModule>());
}

TVM_REGISTER_NODE_TYPE(IRModuleNode);
diff --git a/src/ir/span.cc b/src/ir/source_map.cc
similarity index 61%
rename from src/ir/span.cc
rename to src/ir/source_map.cc
index 39f0044d16d3..8b913906ea42 100644
--- a/src/ir/span.cc
+++ b/src/ir/source_map.cc
@@ -17,11 +17,10 @@
 * under the License.
 */
/*!
- * \file span.cc
- * \brief The span data structure.
+ * \file source_map.cc
+ * \brief The implementation of the source map data structure.
 */
-#include <tvm/ir/span.h>
-#include
+#include <tvm/ir/source_map.h>
#include
#include

@@ -100,4 +99,72 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
      p->stream << "Span(" << node->source_name << ", " << node->line << ", " << node->end_line
                << ", " << node->column << ", " << node->end_column << ")";
    });
+
+TVM_REGISTER_NODE_TYPE(SourceNode);
+
+/*! \brief Construct a source from a string. */
+Source::Source(SourceName src_name, std::string source) {
+  auto n = make_object<SourceNode>();
+  n->source_name = std::move(src_name);
+  n->source = std::move(source);
+
+  int index = 0;
+  int length = 0;
+  n->line_map.push_back({index, length});
+  // NB(@jroesch):
+  std::string source_str = n->source;
+  for (auto c : source_str) {
+    if (c == '\n') {
+      // Record the length of the line.
+      n->line_map.back().second = length;
+      // Bump past the newline.
+      index += 1;
+      // Record the start of the next line, and put placeholder for length.
+      n->line_map.push_back({index, 0});
+      // Reset length to zero.
+      length = 0;
+    } else {
+      length += 1;
+      index += 1;
+    }
+  }
+  n->line_map.back().second = length;
+
+  data_ = n;
+}
+
+tvm::String Source::GetLine(int line) {
+  VLOG(1) << "Source::GetLine: line=" << line;
+  ICHECK(line - 1 < static_cast<int64_t>((*this)->line_map.size()))
+      << "requested line: " << line << "at index: " << (line - 1)
+      << "line_map size: " << (*this)->line_map.size() << "source: " << (*this)->source;
+
+  // Adjust for zero indexing, now have (line_start, line_length);
+  auto range = (*this)->line_map.at(line - 1);
+  int line_start = range.first;
+  int line_length = range.second;
+  VLOG(1) << "Source::GetLine: line_start=" << line_start << " line_length=" << line_length;
+  // TODO(@jroesch): expose substring on tvm::String.
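+  // line_map stores a (start index, length) pair per line, so the substr() below
+  // extracts exactly one source line without its trailing newline.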
+ auto line_text = std::string((*this)->source).substr(line_start, line_length); + VLOG(1) << "Source::GetLine: line_text=" << line_text; + return line_text; +} + +TVM_REGISTER_NODE_TYPE(SourceMapNode); + +SourceMap::SourceMap(Map source_map) { + auto n = make_object(); + n->source_map = std::move(source_map); + data_ = std::move(n); +} + +void SourceMap::Add(const Source& source) { (*this)->source_map.Set(source->source_name, source); } + +TVM_REGISTER_GLOBAL("SourceMapAdd").set_body_typed([](SourceMap map, String name, String content) { + auto src_name = SourceName::Get(name); + Source source(src_name, content); + map.Add(source); + return src_name; +}); + } // namespace tvm diff --git a/src/ir/transform.cc b/src/ir/transform.cc index 9a669493ccb7..66b06e6b505d 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -587,11 +587,12 @@ TVM_REGISTER_GLOBAL("transform.OverrideInstruments") Pass PrintIR(String header, bool show_meta_data) { auto pass_func = [header, show_meta_data](IRModule mod, const PassContext& ctx) { - if (const auto* f = runtime::Registry::Get("relay.PrintIR")) { - (*f)(mod, header, show_meta_data); - } else { - LOG(INFO) << "PrintIR(" << header << "):\n" << mod; + if (const auto* f = runtime::Registry::Get("relay.ir.PrintIR")) { + if ((*f)(mod, header, show_meta_data)) { + return mod; + } } + LOG(INFO) << "PrintIR(" << header << "):\n" << mod; return mod; }; return CreateModulePass(pass_func, 0, "PrintIR", {}); diff --git a/src/parser/source_map.cc b/src/parser/source_map.cc deleted file mode 100644 index 3c1329670c40..000000000000 --- a/src/parser/source_map.cc +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/*! - * \file source_map.cc - * \brief The implementation of the source map data structure. - */ -#include -#include - -namespace tvm { -namespace parser { - -TVM_REGISTER_NODE_TYPE(SourceNode); - -/*! \brief Construct a source from a string. */ -Source::Source(SourceName src_name, std::string source) { - auto n = make_object(); - n->source_name = std::move(src_name); - n->source = std::move(source); - - int index = 0; - int length = 0; - n->line_map.push_back({index, length}); - // NB(@jroesch): - std::string source_str = n->source; - for (auto c : source_str) { - if (c == '\n') { - // Record the length of the line. - n->line_map.back().second = length; - // Bump past the newline. - index += 1; - // Record the start of the next line, and put placeholder for length. - n->line_map.push_back({index, 0}); - // Reset length to zero. 
- length = 0; - } else { - length += 1; - index += 1; - } - } - n->line_map.back().second = length; - - data_ = n; -} - -tvm::String Source::GetLine(int line) { - VLOG(1) << "Source::GetLine: line=" << line; - ICHECK(line - 1 < static_cast((*this)->line_map.size())) - << "requested line: " << line << "at index: " << (line - 1) - << "line_map size: " << (*this)->line_map.size() << "source: " << (*this)->source; - - // Adjust for zero indexing, now have (line_start, line_length); - auto range = (*this)->line_map.at(line - 1); - int line_start = range.first; - int line_length = range.second; - VLOG(1) << "Source::GetLine: line_start=" << line_start << " line_length=" << line_length; - // TODO(@jroesch): expose substring on tvm::String. - auto line_text = std::string((*this)->source).substr(line_start, line_length); - VLOG(1) << "Source::GetLine: line_text=" << line_text; - return line_text; -} - -TVM_REGISTER_NODE_TYPE(SourceMapNode); - -SourceMap::SourceMap(Map source_map) { - auto n = make_object(); - n->source_map = std::move(source_map); - data_ = std::move(n); -} - -void SourceMap::Add(const Source& source) { (*this)->source_map.Set(source->source_name, source); } - -TVM_REGISTER_GLOBAL("SourceMapAdd").set_body_typed([](SourceMap map, String name, String content) { - auto src_name = SourceName::Get(name); - Source source(src_name, content); - map.Add(source); - return src_name; -}); - -} // namespace parser -} // namespace tvm diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 183a3094e473..4ff8a59b349e 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -25,7 +25,7 @@ #include "utils.h" -#include +#include #include #include #include diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index fb23c4cc082a..c29b3195a3fd 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -25,13 +25,13 @@ #include "compiler.h" #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc index 5f913026080d..deedd283c2ff 100644 --- a/src/relay/ir/base.cc +++ b/src/relay/ir/base.cc @@ -39,22 +39,5 @@ Id::Id(String name_hint) { data_ = std::move(n); } -TVM_REGISTER_GLOBAL("ir.NodeSetSpan").set_body_typed([](ObjectRef node_ref, Span sp) { - if (auto* rn = node_ref.as()) { - rn->span = sp; - } else if (auto* rn = node_ref.as()) { - rn->span = sp; - } else if (auto* rn = node_ref.as()) { - rn->span = sp; - } else { - LOG(FATAL) << "Expect Type or RelayNode "; - } -}); - -TVM_REGISTER_GLOBAL("relay.PrintIR") - .set_body_typed([](ObjectRef mod, String header, bool show_metadata) { - LOG(INFO) << "PrintIR(" << header << "):\n" << AsText(mod, show_metadata); - }); - } // namespace relay } // namespace tvm diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc index 3ff5eaa059c1..5d743d521777 100644 --- a/src/relay/ir/function.cc +++ b/src/relay/ir/function.cc @@ -123,6 +123,7 @@ const FunctionNode* AsOptimizableFunctionNode(const BaseFunc& base_func) { } return nullptr; } + TVM_REGISTER_GLOBAL("relay.ir.PrintRelayModule") .set_body_typed([](IRModule mod) -> Optional { for (const auto& it : mod->functions) { @@ -133,6 +134,17 @@ TVM_REGISTER_GLOBAL("relay.ir.PrintRelayModule") return NullOpt; }); +TVM_REGISTER_GLOBAL("relay.ir.PrintIR") + .set_body_typed([](IRModule mod, String header, bool show_metadata) -> bool { + for (const auto& it : mod->functions) { + if (it.second->IsInstance()) { + 
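+        // Print with the Relay text format only when the module contains at least one
+        // Relay function; the caller falls back to the default printer otherwise.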
LOG(INFO) << "PrintIR(" << header << "):\n" << AsText(mod, show_metadata); + return true; + } + } + return false; + }); + TVM_REGISTER_GLOBAL("relay.ir.WarnIfMalformed") .set_body_typed([](const IRModule& mod, const BaseFunc& base_func) -> void { if (const auto* relay_func = base_func.as()) { diff --git a/src/parser/meta_ref.cc b/src/relay/parser/meta_ref.cc similarity index 98% rename from src/parser/meta_ref.cc rename to src/relay/parser/meta_ref.cc index 6b0e8d0c5966..cdc6929622dd 100644 --- a/src/parser/meta_ref.cc +++ b/src/relay/parser/meta_ref.cc @@ -30,7 +30,7 @@ #include namespace tvm { -namespace parser { +namespace relay { using tvm::relay::transform::CreateFunctionPass; using tvm::transform::PassContext; @@ -95,5 +95,5 @@ IRModule ExpandMetaRefs(const MetaTable& meta_table, const IRModule& mod) { return pass(mod, PassContext::Create()); } -} // namespace parser +} // namespace relay } // namespace tvm diff --git a/src/parser/meta_ref.h b/src/relay/parser/meta_ref.h similarity index 92% rename from src/parser/meta_ref.h rename to src/relay/parser/meta_ref.h index 483b7f726e07..bed67bea05a4 100644 --- a/src/parser/meta_ref.h +++ b/src/relay/parser/meta_ref.h @@ -22,20 +22,18 @@ * \brief A reference into the metadata section of the Relay text format. */ -#ifndef TVM_PARSER_META_REF_H_ -#define TVM_PARSER_META_REF_H_ +#ifndef TVM_RELAY_PARSER_META_REF_H_ +#define TVM_RELAY_PARSER_META_REF_H_ #include -#include #include #include +#include #include namespace tvm { -namespace parser { - -using namespace relay; +namespace relay { /*! * \brief Options for allocating storage. @@ -78,7 +76,7 @@ Expr MetaRef(std::string type_key, uint64_t node_index); relay::Function ExpandMetaRefs(const MetaTable& meta_table, const relay::Function& func); IRModule ExpandMetaRefs(const MetaTable& meta_table, const IRModule& mod); -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_META_REF_H_ +#endif // TVM_RELAY_PARSER_META_REF_H_ diff --git a/src/parser/op_table.h b/src/relay/parser/op_table.h similarity index 93% rename from src/parser/op_table.h rename to src/relay/parser/op_table.h index 28c9cd7fc05f..6ff2c05476f4 100644 --- a/src/parser/op_table.h +++ b/src/relay/parser/op_table.h @@ -18,14 +18,13 @@ */ /*! - * \file token.h + * \file op_table.h * \brief A operator table for parsing. - * * Provides symbolic token sequences to map to TVM operators, with a given associativity and arity. 
*/ -#ifndef TVM_PARSER_OP_TABLE_H_ -#define TVM_PARSER_OP_TABLE_H_ +#ifndef TVM_RELAY_PARSER_OP_TABLE_H_ +#define TVM_RELAY_PARSER_OP_TABLE_H_ #include #include @@ -38,7 +37,7 @@ #include "./tokenizer.h" namespace tvm { -namespace parser { +namespace relay { struct Rule { std::vector tokens; @@ -77,7 +76,7 @@ struct OperatorTable { } }; -OperatorTable DefaultOpTable() { +inline OperatorTable DefaultOpTable() { return OperatorTable( {Rule({TokenType::kStar}, Op::Get("multiply"), 12, 2, true), Rule({TokenType::kDivision}, Op::Get("divide"), 12, 2, true), @@ -91,6 +90,6 @@ OperatorTable DefaultOpTable() { Rule({TokenType::kBang, TokenType::kEqual}, Op::Get("not_equal"), 7, 2, true)}); } -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_OP_TABLE_H_ +#endif // TVM_RELAY_PARSER_OP_TABLE_H_ diff --git a/src/parser/parser.cc b/src/relay/parser/parser.cc similarity index 99% rename from src/parser/parser.cc rename to src/relay/parser/parser.cc index fe89857f2709..ae7fc52cbead 100644 --- a/src/parser/parser.cc +++ b/src/relay/parser/parser.cc @@ -23,11 +23,12 @@ */ #include #include -#include #include #include #include +#include #include +#include #include #include #include @@ -35,18 +36,14 @@ #include -#include "../support/scalars.h" +#include "../../support/scalars.h" #include "./meta_ref.h" #include "./op_table.h" #include "./span_check.h" #include "./tokenizer.h" -#include "tvm/runtime/builtin_fp16.h" namespace tvm { -namespace parser { - -using namespace relay; -using Expr = relay::Expr; +namespace relay { /*! \brief The meta table maps from type key to a sequence of objects. */ using MetaTable = Map>; @@ -1948,22 +1945,6 @@ Expr ParseExpr(const std::string& file_name, const std::string& file_content) { return expr; } -TVM_REGISTER_GLOBAL("parser.ParseModuleInContext") - .set_body_typed([](const std::string& file_name, const std::string& file_content, - const Optional& init_module, const MetaTable& init_meta_table) { - return ParseModule(file_name, file_content, init_module, init_meta_table); - }); - -TVM_REGISTER_GLOBAL("parser.ParseModule") - .set_body_typed([](const std::string& file_name, const std::string& file_content) { - return ParseModule(file_name, file_content); - }); - -TVM_REGISTER_GLOBAL("parser.ParseExpr") - .set_body_typed([](tvm::String file_name, tvm::String file_content) { - return ParseExpr(file_name, file_content); - }); - /*! * \brief This pass pretty-prints mod then parses it back so as to establish spans and sources * for all Relay sub-expressions. 
This improves error and debugging diagnostics downstream for @@ -1978,7 +1959,29 @@ Pass AnnotateSpans() { return CreateModulePass(pass_func, 0, "AnnotateSpans", {}); } +TVM_REGISTER_GLOBAL("relay.parser.ParseModuleInContext") + .set_body_typed([](const std::string& file_name, const std::string& file_content, + const Optional& init_module, const MetaTable& init_meta_table) { + return ParseModule(file_name, file_content, init_module, init_meta_table); + }); + +TVM_REGISTER_GLOBAL("relay.parser.ParseModule").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK(args.size() >= 2 && args.size() <= 4) << "Expected 2-4 arguments, but got " << args.size(); + if (args.size() == 2) { + *ret = ParseModule(args[0], args[1]); + } else if (args.size() == 3) { + *ret = ParseModule(args[0], args[1], args[2]); + } else { + *ret = ParseModule(args[0], args[1], args[2], args[3]); + } +}); + +TVM_REGISTER_GLOBAL("relay.parser.ParseExpr") + .set_body_typed([](tvm::String file_name, tvm::String file_content) { + return ParseExpr(file_name, file_content); + }); + TVM_REGISTER_GLOBAL("relay._transform.AnnotateSpans").set_body_typed(AnnotateSpans); -} // namespace parser +} // namespace relay } // namespace tvm diff --git a/src/parser/span_check.cc b/src/relay/parser/span_check.cc similarity index 96% rename from src/parser/span_check.cc rename to src/relay/parser/span_check.cc index 7fed3730d926..6bbf6317ad9f 100644 --- a/src/parser/span_check.cc +++ b/src/relay/parser/span_check.cc @@ -25,7 +25,7 @@ #include namespace tvm { -namespace parser { +namespace relay { using tvm::relay::transform::CreateFunctionPass; using tvm::transform::PassContext; @@ -101,7 +101,7 @@ Pass SpanCheck() { 0, "SpanCheck", {}); } -TVM_REGISTER_GLOBAL("parser.SpanCheck").set_body_typed([]() { return SpanCheck(); }); +TVM_REGISTER_GLOBAL("relay.parser.SpanCheck").set_body_typed([]() { return SpanCheck(); }); -} // namespace parser +} // namespace relay } // namespace tvm diff --git a/src/parser/span_check.h b/src/relay/parser/span_check.h similarity index 93% rename from src/parser/span_check.h rename to src/relay/parser/span_check.h index 0074c66d61f4..b85b4a497965 100644 --- a/src/parser/span_check.h +++ b/src/relay/parser/span_check.h @@ -21,9 +21,8 @@ * \file span_check.h * \brief Check that the Relay IR has correctly attached span information. */ - -#ifndef TVM_PARSER_SPAN_CHECK_H_ -#define TVM_PARSER_SPAN_CHECK_H_ +#ifndef TVM_RELAY_PARSER_SPAN_CHECK_H_ +#define TVM_RELAY_PARSER_SPAN_CHECK_H_ #include #include @@ -38,7 +37,7 @@ #include namespace tvm { -namespace parser { +namespace relay { using namespace tvm::relay; using tvm::transform::Pass; @@ -74,6 +73,6 @@ struct SpanChecker : ExprVisitor { Pass SpanCheck(); -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_SPAN_CHECK_H_ +#endif // TVM_RELAY_PARSER_SPAN_CHECK_H_ diff --git a/src/parser/token.h b/src/relay/parser/token.h similarity index 93% rename from src/parser/token.h rename to src/relay/parser/token.h index 48a1bf70a250..7b11e701cf6e 100644 --- a/src/parser/token.h +++ b/src/relay/parser/token.h @@ -22,10 +22,11 @@ * \brief The definition of tokens for the TVM parser. 
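 * Each Token carries a Span, a TokenType and an optional payload ObjectRef.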
*/ -#ifndef TVM_PARSER_TOKEN_H_ -#define TVM_PARSER_TOKEN_H_ +#ifndef TVM_RELAY_PARSER_TOKEN_H_ +#define TVM_RELAY_PARSER_TOKEN_H_ -#include +#include +#include #include #include @@ -33,7 +34,7 @@ #include namespace tvm { -namespace parser { +namespace relay { using namespace runtime; @@ -97,7 +98,7 @@ enum class TokenType { kNull, }; -std::string ToString(const TokenType& token_type) { +inline std::string ToString(const TokenType& token_type) { switch (token_type) { case TokenType::kCommentStart: return "CommentStart"; @@ -219,7 +220,7 @@ std::string ToString(const TokenType& token_type) { } } -std::string Pretty(const TokenType& token_type) { +inline std::string Pretty(const TokenType& token_type) { switch (token_type) { case TokenType::kCommentStart: return "`/*`"; @@ -375,7 +376,7 @@ class Token : public ObjectRef { TVM_DEFINE_OBJECT_REF_METHODS(Token, ObjectRef, TokenNode); }; -Token::Token(Span span, TokenType token_type, ObjectRef data) { +inline Token::Token(Span span, TokenType token_type, ObjectRef data) { ObjectPtr n = make_object(); n->span = span; n->token_type = token_type; @@ -383,15 +384,17 @@ Token::Token(Span span, TokenType token_type, ObjectRef data) { data_ = std::move(n); } -Token Token::Null() { return Token(Span(SourceName(), 0, 0, 0, 0), TokenType::kNull); } +inline Token Token::Null() { return Token(Span(SourceName(), 0, 0, 0, 0), TokenType::kNull); } -int64_t Token::ToNumber() const { +inline int64_t Token::ToNumber() const { return Downcast(this->operator->()->data).IntValue(); } -std::string Token::ToString() const { return Downcast(this->operator->()->data); } +inline std::string Token::ToString() const { + return Downcast(this->operator->()->data); +} -Map> Token::ToMetadata() const { +inline Map> Token::ToMetadata() const { ObjectRef data = this->operator->()->data; if (data.defined()) { return Downcast>>(data); @@ -400,6 +403,6 @@ Map> Token::ToMetadata() const { } } -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_TOKEN_H_ +#endif // TVM_RELAY_PARSER_TOKEN_H_ diff --git a/src/parser/tokenizer.h b/src/relay/parser/tokenizer.h similarity index 96% rename from src/parser/tokenizer.h rename to src/relay/parser/tokenizer.h index 505784e4bf70..04dcd3263e99 100644 --- a/src/parser/tokenizer.h +++ b/src/relay/parser/tokenizer.h @@ -18,11 +18,11 @@ */ /*! - * \file parser.h + * \file tokenizer.h * \brief A parser for TVM IR. */ -#ifndef TVM_PARSER_TOKENIZER_H_ -#define TVM_PARSER_TOKENIZER_H_ +#ifndef TVM_RELAY_PARSER_TOKENIZER_H_ +#define TVM_RELAY_PARSER_TOKENIZER_H_ #include #include @@ -34,12 +34,12 @@ #include #include -#include "../support/scalars.h" +#include "../../support/scalars.h" #include "./meta_ref.h" #include "./token.h" namespace tvm { -namespace parser { +namespace relay { using namespace runtime; @@ -54,20 +54,20 @@ static inline void rtrim(std::string& s) { // NOLINT(*) s.end()); } -bool IsDigit(char c) { return '0' <= c && c <= '9'; } +inline bool IsDigit(char c) { return '0' <= c && c <= '9'; } -bool IsWhitespace(char c) { return ' ' == c || c == '\t' || c == '\n'; } +inline bool IsWhitespace(char c) { return ' ' == c || c == '\t' || c == '\n'; } -bool IsNumeric(char c) { +inline bool IsNumeric(char c) { return (IsDigit(c) || c == '.' 
|| c == 'e' || c == '-' || c == '+' || c == 'E') && !IsWhitespace(c); } -bool IsIdentLetter(char c) { +inline bool IsIdentLetter(char c) { return '_' == c || c == '/' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); } -bool IsIdent(char c) { return IsIdentLetter(c) || IsDigit(c); } +inline bool IsIdent(char c) { return IsIdentLetter(c) || IsDigit(c); } static std::unordered_map KEYWORD_TABLE = { {"let", TokenType::kLet}, {"fn", TokenType::kFn}, @@ -371,7 +371,7 @@ struct Tokenizer { int line = this->line; int col = this->col; auto next = Peek(); - VLOG(9) << "tvm::parser::TokenizeOnce: next=" << next; + VLOG(9) << "tvm::relay::TokenizeOnce: next=" << next; if (next == '\n') { auto token = NewToken(TokenType::kNewline); Next(); @@ -582,7 +582,7 @@ struct Tokenizer { } void Tokenize() { - VLOG(9) << "tvm::parser::Tokenize"; + VLOG(9) << "tvm::relay::Tokenize"; while (this->More()) { auto token = TokenizeOnce(); ICHECK(token.defined()); @@ -601,7 +601,7 @@ struct Tokenizer { tokens() {} }; -std::vector Condense(const std::vector& tokens, Token* table) { +inline std::vector Condense(const std::vector& tokens, Token* table) { std::vector out; bool found_metadata = false; @@ -680,7 +680,8 @@ std::vector Condense(const std::vector& tokens, Token* table) { return out; } -std::pair, Token> Tokenize(const DiagnosticContext& ctx, const Source& source) { +inline std::pair, Token> Tokenize(const DiagnosticContext& ctx, + const Source& source) { auto tokenizer = Tokenizer(ctx, source); tokenizer.Tokenize(); Token meta_table(Span(), TokenType::kUnknown, ObjectRef()); @@ -691,7 +692,7 @@ std::pair, Token> Tokenize(const DiagnosticContext& ctx, cons return {tokens, meta_table}; } -} // namespace parser +} // namespace relay } // namespace tvm -#endif // TVM_PARSER_TOKENIZER_H_ +#endif // TVM_RELAY_PARSER_TOKENIZER_H_ diff --git a/src/relay/printer/relay_text_printer.cc b/src/relay/printer/relay_text_printer.cc index cc86f9b56435..5b47c262fd48 100644 --- a/src/relay/printer/relay_text_printer.cc +++ b/src/relay/printer/relay_text_printer.cc @@ -41,9 +41,9 @@ #include #include "../../ir/attr_functor.h" -#include "../../parser/meta_ref.h" #include "../../support/scalars.h" #include "../analysis/dependency_graph.h" +#include "../parser/meta_ref.h" #include "doc.h" #include "meta_data.h" #include "text_printer.h" diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 168441d1708d..8b6600fbdfa9 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -23,7 +23,6 @@ */ #include -#include #include #include #include diff --git a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc index 31166f1e6bb8..0157f031c214 100644 --- a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc +++ b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc @@ -20,7 +20,7 @@ #include "../../../../../src/relay/backend/aot/aot_lower_main.h" #include -#include +#include namespace tvm { namespace relay { @@ -37,7 +37,7 @@ TEST(AOTLowerMain, ExprAllocatorSkipNestedFunc) { %0(%x) } )"; - IRModule mod = parser::ParseModule("string", mod_text, {}, {}); + IRModule mod = ParseModule("string", mod_text, {}, {}); auto host_target = tvm::Target("llvm"); auto prim_target = tvm::Target(host_target, host_target); auto ctxt = tvm::transform::PassContext::Current(); diff --git a/tests/cpp/relay/collage/candidate_partition_test.cc b/tests/cpp/relay/collage/candidate_partition_test.cc index bc5d2d880a3b..d298a493c11f 100644 --- 
a/tests/cpp/relay/collage/candidate_partition_test.cc +++ b/tests/cpp/relay/collage/candidate_partition_test.cc @@ -20,9 +20,9 @@ #include "../../../src/relay/collage/candidate_partition.h" #include -#include #include #include +#include #include #include "../../../../src/relay/collage/mock_cost_estimator.h" @@ -37,7 +37,7 @@ namespace { // so not re-tested here. The only other non-trivial code is CandidatePartition::EstimateCost Function MakeTestFunction(const std::string& mod_text) { - IRModule mod = parser::ParseModule("string", mod_text, {}, {}); + IRModule mod = ParseModule("string", mod_text, {}, {}); mod = transform::CapturePostDfsIndexInSpans()(mod); auto func = Downcast(mod->Lookup("main")); LOG(INFO) << "------- input function -------"; diff --git a/tests/cpp/relay/collage/partition_rule_test.cc b/tests/cpp/relay/collage/partition_rule_test.cc index 51a4970c7ec0..effe0b1fa030 100644 --- a/tests/cpp/relay/collage/partition_rule_test.cc +++ b/tests/cpp/relay/collage/partition_rule_test.cc @@ -20,9 +20,9 @@ #include "../../../src/relay/collage/partition_rule.h" #include -#include #include #include +#include #include #include "../../../src/relay/collage/partition_spec.h" @@ -46,7 +46,7 @@ Function MakeTestFunction( } Map> metatable; metatable.Set("relay.Constant", constants); - IRModule mod = parser::ParseModule("string", mod_text, {}, metatable); + IRModule mod = ParseModule("string", mod_text, {}, metatable); mod = transform::CapturePostDfsIndexInSpans()(mod); auto func = Downcast(mod->Lookup("main")); LOG(INFO) << "------- input function -------"; diff --git a/tests/cpp/relay/df_pattern_rewrite_test.cc b/tests/cpp/relay/df_pattern_rewrite_test.cc index af09ae48aafd..374887c12a22 100644 --- a/tests/cpp/relay/df_pattern_rewrite_test.cc +++ b/tests/cpp/relay/df_pattern_rewrite_test.cc @@ -18,11 +18,11 @@ */ #include -#include #include #include #include #include +#include #include "../../../src/relay/transforms/simplify_expr.h" @@ -82,7 +82,7 @@ TEST(DFPatternRewrite, DeeplyNestedWithCallAttributes) { } )"; - IRModule module = parser::ParseModule("string", kModel); + IRModule module = ParseModule("string", kModel); DFPatternRewriteComposer composer; composer.AddRewrite(); Function in_function = Downcast(module->Lookup("main")); diff --git a/tests/cpp/relay/ir/indexed_graph_test.cc b/tests/cpp/relay/ir/indexed_graph_test.cc index 17ec68261684..486d027fbc21 100644 --- a/tests/cpp/relay/ir/indexed_graph_test.cc +++ b/tests/cpp/relay/ir/indexed_graph_test.cc @@ -20,9 +20,9 @@ #include "../../../src/relay/ir/indexed_graph.h" #include -#include #include #include +#include namespace tvm { namespace relay { @@ -81,7 +81,7 @@ IRModule TestRecursiveIRModule() { (%19, %20) // 51 } // 52 )"; - return parser::ParseModule("string", kModel, /*init_module=*/{}, metadata); + return ParseModule("string", kModel, /*init_module=*/{}, metadata); } TEST(IndexedGraph, RecursiveExprRegression) { @@ -179,7 +179,7 @@ IRModule TestUnusedLetBoundIRModule() { } } )"; - return parser::ParseModule("string", kModel); + return ParseModule("string", kModel); } TEST(IndexedGraph, UnusedLetVars) { diff --git a/tests/cpp/relay/transforms/device_domains_test.cc b/tests/cpp/relay/transforms/device_domains_test.cc index c5b2f26315b2..47e303996b3b 100644 --- a/tests/cpp/relay/transforms/device_domains_test.cc +++ b/tests/cpp/relay/transforms/device_domains_test.cc @@ -27,7 +27,7 @@ #include "../../../../src/relay/transforms/device_domains.h" #include -#include +#include #include namespace tvm { @@ -36,7 +36,7 @@ namespace 
transform { namespace { IRModule TestModule() { - return InferType()(tvm::parser::ParseModule("test", R"( + return InferType()(ParseModule("test", R"( #[version = "0.0.5"] def @f(%x : Tensor[(3, 7), float32], %y : Tensor[(3, 7), float32]) { add(%x, %y) diff --git a/tests/cpp/relay/with_fields_test.cc b/tests/cpp/relay/with_fields_test.cc index 48e04c259bb5..6114fa97a9fd 100644 --- a/tests/cpp/relay/with_fields_test.cc +++ b/tests/cpp/relay/with_fields_test.cc @@ -23,18 +23,18 @@ */ #include -#include #include #include #include +#include namespace tvm { namespace relay { namespace { IRModule TestIRModule() { - return parser::ParseModule("string", - R"( + return ParseModule("string", + R"( #[version = "0.0.5"] def @main(%data : Tensor[(1, 304, 128, 128), float32], %weight1 : Tensor[(304, 1, 3, 3), float32], From f3b8ae2392618bc175afc2fbcd4ec866f985d273 Mon Sep 17 00:00:00 2001 From: MNGanesan Date: Wed, 25 Jan 2023 19:19:55 +0530 Subject: [PATCH 224/286] Enhance the --help message of composite target (#13842) Presently --help for vitis displays the target and option string, it has no description. Eg: target vitis-ai dpu This can be made more meaningful by fetching the description from the config node of the target. Eg: Vitis AI DPU identifier Signed-off-by: MNGanesan --- python/tvm/driver/tvmc/target.py | 2 +- .../python/driver/tvmc/test_target_options.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/python/tvm/driver/tvmc/target.py b/python/tvm/driver/tvmc/target.py index a3602b4eb8e1..ec8215184ee3 100644 --- a/python/tvm/driver/tvmc/target.py +++ b/python/tvm/driver/tvmc/target.py @@ -72,7 +72,7 @@ def _generate_codegen_args(parser, codegen_name): target_group.add_argument( f"--target-{codegen_name}-{target_option}", type=python_type, - help=f"target {codegen_name} {target_option}{python_type}", + help=field.description, ) diff --git a/tests/python/driver/tvmc/test_target_options.py b/tests/python/driver/tvmc/test_target_options.py index 891df86f0c1f..8d11e448009d 100644 --- a/tests/python/driver/tvmc/test_target_options.py +++ b/tests/python/driver/tvmc/test_target_options.py @@ -60,6 +60,29 @@ def test_mapping_target_args(): assert reconstruct_target_args(parsed) == {"llvm": {"mcpu": "cortex-m3"}} +@tvm.testing.requires_vitis_ai +def test_composite_target_cmd_line_help(): + parser = argparse.ArgumentParser() + generate_target_args(parser) + assert parser._option_string_actions["--target-vitis-ai-dpu"].help == "Vitis AI DPU identifier" + assert ( + parser._option_string_actions["--target-vitis-ai-build_dir"].help + == "Build directory to be used (optional, debug)" + ) + assert ( + parser._option_string_actions["--target-vitis-ai-work_dir"].help + == "Work directory to be used (optional, debug)" + ) + assert ( + parser._option_string_actions["--target-vitis-ai-export_runtime_module"].help + == "Export the Vitis AI runtime module to this file" + ) + assert ( + parser._option_string_actions["--target-vitis-ai-load_runtime_module"].help + == "Load the Vitis AI runtime module to this file" + ) + + @tvm.testing.requires_cmsisnn def test_include_known_codegen(): parser = argparse.ArgumentParser() From 1881cfd46d7286f6d5bf9f881f355b419800e013 Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Wed, 25 Jan 2023 16:00:56 -0800 Subject: [PATCH 225/286] [microTVM] Allow multiple runners in tuning micro models with meta-schedule (#13811) This PR adds support for running candidate projects on multiple physical boards when using meta-schedule to tune micro 
models, and helps reduce the tuning time.
---
 .../micro/meta_schedule/rpc_runner_micro.py   | 108 ++++++++++++------
 python/tvm/micro/build.py                     |   7 ++
 python/tvm/micro/testing/pytest_plugin.py     |   7 +-
 tests/micro/zephyr/test_ms_tuning.py          |  15 ++-
 4 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py b/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py
index e4c08351841d..307855438e71 100644
--- a/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py
+++ b/python/tvm/contrib/micro/meta_schedule/rpc_runner_micro.py
@@ -17,9 +17,10 @@
 """RPC Runner Micro"""
 from contextlib import contextmanager
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union
 from collections import namedtuple
 import signal
+import random

 from tvm import micro
 from tvm import nd
@@ -44,10 +45,11 @@ def __init__(
         self,
         platform: str = "crt",
         project_options: Optional[dict] = None,
-        rpc_config: Optional[RPCConfig] = None,
+        rpc_configs: Optional[List[RPCConfig]] = None,
         evaluator_config: Optional[EvaluatorConfig] = None,
         max_workers: Optional[int] = None,
         initializer: Optional[Callable[[], None]] = None,
+        session_timeout_sec: int = 300,
     ) -> None:
         """Constructor

@@ -65,21 +67,25 @@
             The maximum number of connections. Defaults to number of logical CPU cores.
         initializer: Optional[Callable[[], None]]
             The initializer function.
+        session_timeout_sec: int
+            The session timeout, including the pending time. If the number of candidates
+            sent to the runner is larger than the number of runner workers, increase the timeout.
         """
         super().__init__()
         self.platform = platform
         if project_options is None:
             project_options = {}
         self.project_options = project_options
-        self.rpc_config = RPCConfig._normalized(rpc_config)
+        self.rpc_configs = rpc_configs
         self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+        self.session_timeout_sec = session_timeout_sec

         if max_workers is None:
             max_workers = cpu_count(logical=True)
         logger.info("RPCRunner: max_workers = %d", max_workers)
         self.pool = PopenPoolExecutor(
             max_workers=max_workers,
-            timeout=rpc_config.session_timeout_sec,
+            timeout=session_timeout_sec,
             initializer=initializer,
         )

@@ -92,13 +98,13 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
                     _worker_func,
                     self.platform,
                     self.project_options or {},
-                    self.rpc_config,
+                    self.rpc_configs,
                     self.evaluator_config,
                     str(runner_input.artifact_path),
                     str(runner_input.device_type),
                     tuple(arg_info.as_json() for arg_info in runner_input.args_info),
                 ),
-                timeout_sec=self.rpc_config.session_timeout_sec,
+                timeout_sec=self.session_timeout_sec,
             )
             results.append(future)  # type: ignore
         return results
@@ -107,7 +113,7 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
 def _worker_func(
     platform: str,
     project_options: dict,
-    rpc_config: RPCConfig,
+    rpc_configs: List[RPCConfig],
     evaluator_config: EvaluatorConfig,
     artifact_path: str,
     device_type: str,
@@ -119,6 +125,7 @@ def _worker_func(
         project_options=project_options,
     )

+    rpc_config = random.choice(rpc_configs)
     remote_kw = {
         "device_key": rpc_config.tracker_key,
         "host": rpc_config.tracker_host,
@@ -126,6 +133,7 @@
         "priority": 0,
         "timeout": 100,
     }
+
     build_result = namedtuple("BuildResult", ["filename"])(artifact_path)

     with module_loader(remote_kw, build_result) as (remote, mod):
@@ -156,36 +164,35 @@
 def get_rpc_runner_micro(
     platform,
     options,
-    rpc_config: RPCConfig = None,
     evaluator_config: EvaluatorConfig = None,
EvaluatorConfig = None, - session_timeout_sec=300, + tracker_host: Optional[str] = None, + tracker_port: Union[None, int, str] = None, + session_timeout_sec: int = 300, + rpc_timeout_sec: int = 10, + serial_numbers: List[str] = None, ): """Parameters ---------- platform: str The platform used for project generation. - project_options: dict + options: dict The options for the generated micro project. - rpc_config: RPCConfig - The rpc configuration. evaluator_config: EvaluatorConfig The evaluator configuration. + tracker_host: Optional[str] + The host URL of the RPC server. + tracker_port: Union[None, int, str] + The TCP port to bind to. session_timeout_sec: int The session timeout. If the number of candidates sent to runner is larger than the runner workers, increase the timeout. + rpc_timeout_sec: + The RPC session timeout. + serial_numbers: + List of board serial numbers to be used during tuning. + For "CRT" and "QEMU" platforms the serial numbers are not used, + but the length of the list determines the number of runner instances. """ - if rpc_config is None: - tracker_host = "127.0.0.1" - tracker_port = 9000 - tracker_key = "$local$device$%d" % tracker_port - rpc_config = RPCConfig( - tracker_host=tracker_host, - tracker_port=tracker_port, - tracker_key=tracker_key, - session_priority=0, - session_timeout_sec=session_timeout_sec, - ) - tracker_port_end = rpc_config.tracker_port + 1000 if evaluator_config is None: evaluator_config = EvaluatorConfig( @@ -195,26 +202,54 @@ def get_rpc_runner_micro( enable_cpu_cache_flush=False, ) + if tracker_host is None: + tracker_host = "127.0.0.1" + + if tracker_port is None: + tracker_port = 9000 + else: + tracker_port = int(tracker_port) + tracker_port_end = tracker_port + 1000 + + if not (serial_numbers): + serial_numbers = ["$local$device"] + tracker = Tracker( - port=rpc_config.tracker_port, + port=tracker_port, port_end=tracker_port_end, silent=True, reuse_addr=True, timeout=60, ) - server = Server( - port=rpc_config.tracker_port, - port_end=tracker_port_end, - key=rpc_config.tracker_key, - silent=True, - tracker_addr=(rpc_config.tracker_host, rpc_config.tracker_port), - reuse_addr=True, - timeout=60, - ) + + servers = [] + rpc_configs = [] + for serial_number in serial_numbers: + key = serial_number + rpc_config = RPCConfig( + tracker_host=tracker_host, + tracker_port=tracker_port, + tracker_key=key, + session_priority=0, + session_timeout_sec=rpc_timeout_sec, + ) + rpc_configs.append(rpc_config) + + server = Server( + port=tracker_port, + port_end=tracker_port_end, + key=key, + silent=True, + tracker_addr=(tracker_host, tracker_port), + reuse_addr=True, + timeout=60, + ) + servers.append(server) def terminate(): tracker.terminate() - server.terminate() + for server in servers: + server.terminate() def handle_SIGINT(signal, frame): terminate() @@ -226,8 +261,9 @@ def handle_SIGINT(signal, frame): yield RPCRunnerMicro( platform=platform, project_options=options, - rpc_config=rpc_config, + rpc_configs=rpc_configs, evaluator_config=evaluator_config, + session_timeout_sec=session_timeout_sec, ) finally: terminate() diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index ac35142a8937..aee114672233 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -151,6 +151,13 @@ def __call__(self, remote_kw, build_result): with open(build_result.filename, "rb") as build_file: build_result_bin = build_file.read() + # In case we are tuning on multiple physical boards (with Meta-schedule), the tracker + device_key is the
serial_number of the board that will be used in generating the micro session. + # For CRT projects, and in cases where the serial number is not provided + # (including tuning with AutoTVM), the serial number field doesn't change. + if "board" in self._project_options and "$local$device" not in remote_kw["device_key"]: + self._project_options["serial_number"] = remote_kw["device_key"] + tracker = _rpc.connect_tracker(remote_kw["host"], remote_kw["port"]) remote = tracker.request( remote_kw["device_key"], diff --git a/python/tvm/micro/testing/pytest_plugin.py b/python/tvm/micro/testing/pytest_plugin.py index c32377fb7e7d..3a828ea3a01e 100644 --- a/python/tvm/micro/testing/pytest_plugin.py +++ b/python/tvm/micro/testing/pytest_plugin.py @@ -142,4 +142,9 @@ def pytest_configure(config): @pytest.fixture def serial_number(request): - return request.config.getoption("--serial-number") + serial_number = request.config.getoption("--serial-number") + if serial_number: + serial_number_splitted = serial_number.split(",") + if len(serial_number_splitted) > 1: + return serial_number_splitted + return serial_number diff --git a/tests/micro/zephyr/test_ms_tuning.py b/tests/micro/zephyr/test_ms_tuning.py index 3ce6ff68bc32..16d48ca4cdd6 100644 --- a/tests/micro/zephyr/test_ms_tuning.py +++ b/tests/micro/zephyr/test_ms_tuning.py @@ -61,7 +61,7 @@ def create_relay_module(): @tvm.testing.requires_micro -@pytest.mark.xfail_on_fvp() +@pytest.mark.skip_boards(["mps2_an521", "mps3_an547"]) def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test meta-schedule tuning for microTVM Zephyr""" @@ -80,6 +80,14 @@ def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_ "serial_number": serial_number, "config_main_stack_size": 4096, } + if isinstance(serial_number, list): + project_options["serial_number"] = serial_number[0] # project_api expects a string. + serial_numbers = serial_number + else: + if serial_number is not None: # use a single device in tuning + serial_numbers = [serial_number] + else: # use two dummy serial numbers (for testing with QEMU) + serial_numbers = [str(i) for i in range(2)] boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" with open(boards_file) as f: @@ -95,7 +103,10 @@ def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_ builder = get_local_builder_micro() with ms.Profiler() as profiler: with get_rpc_runner_micro( - platform=platform, options=project_options, session_timeout_sec=120 + platform=platform, + options=project_options, + session_timeout_sec=120, + serial_numbers=serial_numbers, ) as runner: db: ms.Database = ms.relay_integration.tune_relay( From 3d42755a449b41279be741f822d3e048dcdb7236 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Wed, 25 Jan 2023 21:42:31 -0800 Subject: [PATCH 226/286] [TVMScript] Comments and docstrings printing (#13839) This PR introduces the `CommentDoc` for printing comments and `DocStringDoc` for printing docstrings. It makes it possible to add free comments and docstrings as `stmt`s when printing, e.g. ```python # comment 1 # comment 2 """ docstring 1 docstring 2 """ ``` "Free" here means not bound to any `stmt`; each one acts as a single `stmt`, similar to `ExprStmtDoc` for `ExprDoc`. This PR also introduces an example for the `CommentDoc`, as a follow-up to #13819. In the old printer, we always printed a `# with T.block("root"):` comment when an implicit root block was skipped during printing.
For example, ``` @T.prim_func def main(): # with T.block("root"): a = T.alloc_buffer((128, 128)) for i, j in T.grid(128, 128): with T.block(""): ... ``` We bring this syntax reminder back in this PR. In addition, we introduce a field of `ir_usage` and `print_headers` into the printer configuration, to support the printing of headers for `IRModule` and `PrimFunc`. For example, ```python # from tvm.script import ir as I # from tvm.script import tir as T @I.ir_module class Module(): @T.prim_func def func(): ... ``` --- include/tvm/script/printer/doc.h | 44 +++++++++++++++ include/tvm/script/printer/ir_docsifier.h | 4 ++ python/tvm/script/printer/doc.py | 20 +++++++ src/script/printer/doc.cc | 22 ++++++++ .../printer/doc_printer/base_doc_printer.cc | 4 ++ .../printer/doc_printer/base_doc_printer.h | 10 ++++ .../printer/doc_printer/python_doc_printer.cc | 34 +++++++++-- src/script/printer/ir/ir.cc | 3 +- src/script/printer/ir/utils.h | 1 + src/script/printer/tir/function.cc | 4 +- src/script/printer/tir/utils.h | 1 + src/script/printer/utils.h | 20 +++++++ .../unittest/test_tvmscript_printer_doc.py | 28 ++++++++++ .../unittest/test_tvmscript_printer_ir.py | 3 + ...st_tvmscript_printer_python_doc_printer.py | 56 ++++++++++++++++++- .../unittest/test_tvmscript_printer_tir.py | 18 +++++- 16 files changed, 261 insertions(+), 11 deletions(-) diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h index 6504e2c2843d..6321caa4e057 100644 --- a/include/tvm/script/printer/doc.h +++ b/include/tvm/script/printer/doc.h @@ -1194,6 +1194,50 @@ class ClassDoc : public StmtDoc { TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(ClassDoc, StmtDoc, ClassDocNode); }; +/*! + * \brief Doc that represents comment. + * + * \sa CommentDoc + */ +class CommentDocNode : public StmtDocNode { + public: + static constexpr const char* _type_key = "script.printer.CommentDoc"; + TVM_DECLARE_FINAL_OBJECT_INFO(CommentDocNode, StmtDocNode); +}; + +/*! + * \brief Reference type of CommentDocNode. + * + * \sa CommentDocNode + */ +class CommentDoc : public StmtDoc { + public: + explicit CommentDoc(String comment); + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(CommentDoc, StmtDoc, CommentDocNode); +}; + +/*! + * \brief Doc that represents docstring. + * + * \sa DocStringDoc + */ +class DocStringDocNode : public StmtDocNode { + public: + static constexpr const char* _type_key = "script.printer.DocStringDoc"; + TVM_DECLARE_FINAL_OBJECT_INFO(DocStringDocNode, StmtDocNode); +}; + +/*! + * \brief Reference type of DocStringDocNode. + * + * \sa DocStringDocNode + */ +class DocStringDoc : public StmtDoc { + public: + explicit DocStringDoc(String docs); + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(DocStringDoc, StmtDoc, DocStringDocNode); +}; + } // namespace printer } // namespace script } // namespace tvm diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h index 67fa96ef8082..c41827fe9530 100644 --- a/include/tvm/script/printer/ir_docsifier.h +++ b/include/tvm/script/printer/ir_docsifier.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -148,6 +149,8 @@ class IRDocsifierNode : public Object { std::unordered_set defined_names; /*! \brief Common prefixes of variable usages */ std::unordered_map> common_prefix; + /*! 
\brief The IR usages for headers printing */ + std::unordered_set ir_usage; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("frames", &frames); @@ -156,6 +159,7 @@ class IRDocsifierNode : public Object { // `obj2info` is not visited // `defined_names` is not visited // `common_prefix` is not visited + // `ir_usage` is not visited } static constexpr const char* _type_key = "script.printer.IRDocsifier"; diff --git a/python/tvm/script/printer/doc.py b/python/tvm/script/printer/doc.py index 5a4a4cd67a72..9a6e7f1b8c8f 100644 --- a/python/tvm/script/printer/doc.py +++ b/python/tvm/script/printer/doc.py @@ -521,3 +521,23 @@ def __init__(self, name: IdDoc, decorators: List[ExprDoc], body: List[StmtDoc]): decorators, body, ) + + +@register_object("script.printer.CommentDoc") +class CommentDoc(StmtDoc): + """Doc that represents comment.""" + + def __init__(self, comment: str): + self.__init_handle_by_constructor__( + _ffi_api.CommentDoc, comment # type: ignore # pylint: disable=no-member + ) + + +@register_object("script.printer.DocStringDoc") +class DocStringDoc(StmtDoc): + """Doc that represents docstring.""" + + def __init__(self, docs: str): + self.__init_handle_by_constructor__( + _ffi_api.DocStringDoc, docs # type: ignore # pylint: disable=no-member + ) diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc index 89f6b7c8b1cf..1db4e090dcff 100644 --- a/src/script/printer/doc.cc +++ b/src/script/printer/doc.cc @@ -221,6 +221,18 @@ ClassDoc::ClassDoc(IdDoc name, Array decorators, Array body) { this->data_ = std::move(n); } +CommentDoc::CommentDoc(String comment) { + ObjectPtr n = make_object(); + n->comment = comment; + this->data_ = std::move(n); +} + +DocStringDoc::DocStringDoc(String docs) { + ObjectPtr n = make_object(); + n->comment = docs; + this->data_ = std::move(n); +} + TVM_REGISTER_NODE_TYPE(DocNode); TVM_REGISTER_GLOBAL("script.printer.DocSetSourcePaths") .set_body_typed([](Doc doc, Array source_paths) { @@ -365,6 +377,16 @@ TVM_REGISTER_GLOBAL("script.printer.ClassDoc") return ClassDoc(name, decorators, body); }); +TVM_REGISTER_NODE_TYPE(CommentDocNode); +TVM_REGISTER_GLOBAL("script.printer.CommentDoc").set_body_typed([](String comment) { + return CommentDoc(comment); +}); + +TVM_REGISTER_NODE_TYPE(DocStringDocNode); +TVM_REGISTER_GLOBAL("script.printer.DocStringDoc").set_body_typed([](String docs) { + return DocStringDoc(docs); +}); + } // namespace printer } // namespace script } // namespace tvm diff --git a/src/script/printer/doc_printer/base_doc_printer.cc b/src/script/printer/doc_printer/base_doc_printer.cc index a3a5c06ede0d..8df599347f07 100644 --- a/src/script/printer/doc_printer/base_doc_printer.cc +++ b/src/script/printer/doc_printer/base_doc_printer.cc @@ -316,6 +316,10 @@ void DocPrinter::PrintDoc(const Doc& doc) { PrintTypedDoc(GetRef(doc_node)); } else if (const auto* doc_node = doc.as()) { PrintTypedDoc(GetRef(doc_node)); + } else if (const auto* doc_node = doc.as()) { + PrintTypedDoc(GetRef(doc_node)); + } else if (const auto* doc_node = doc.as()) { + PrintTypedDoc(GetRef(doc_node)); } else { LOG(FATAL) << "Do not know how to print " << doc->GetTypeKey(); throw; diff --git a/src/script/printer/doc_printer/base_doc_printer.h b/src/script/printer/doc_printer/base_doc_printer.h index 7851ce061b0d..f5cf40a23357 100644 --- a/src/script/printer/doc_printer/base_doc_printer.h +++ b/src/script/printer/doc_printer/base_doc_printer.h @@ -204,6 +204,16 @@ class DocPrinter { */ virtual void PrintTypedDoc(const ClassDoc& doc) = 0; + /*! 
+ * \brief Virtual method to print a CommentDoc + */ + virtual void PrintTypedDoc(const CommentDoc& doc) = 0; + + /*! + * \brief Virtual method to print a DocStringDoc + */ + virtual void PrintTypedDoc(const DocStringDoc& doc) = 0; + /*! * \brief Increase the indent level of any content to be * printed after this call diff --git a/src/script/printer/doc_printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc index ce6b8e7f423c..334f76f72280 100644 --- a/src/script/printer/doc_printer/python_doc_printer.cc +++ b/src/script/printer/doc_printer/python_doc_printer.cc @@ -169,6 +169,8 @@ class PythonDocPrinter : public DocPrinter { void PrintTypedDoc(const ScopeDoc& doc) final; void PrintTypedDoc(const FunctionDoc& doc) final; void PrintTypedDoc(const ClassDoc& doc) final; + void PrintTypedDoc(const CommentDoc& doc) final; + void PrintTypedDoc(const DocStringDoc& doc) final; private: void NewLineWithoutIndent() { output_ << "\n"; } @@ -253,11 +255,19 @@ class PythonDocPrinter : public DocPrinter { } } - void MaybePrintCommentWithNewLine(const StmtDoc& stmt) { + void MaybePrintCommenMultiLines(const StmtDoc& stmt, bool new_line = false) { if (stmt->comment.defined()) { std::vector comment_lines = support::Split(stmt->comment.value(), '\n'); + bool first_line = true; for (const std::string& line : comment_lines) { - output_ << "# " << line; + if (first_line) { + output_ << "# " << line; + first_line = false; + } else { + NewLine() << "# " << line; + } + } + if (new_line) { NewLine(); } } @@ -523,7 +533,7 @@ void PythonDocPrinter::PrintTypedDoc(const AssignDoc& doc) { } void PythonDocPrinter::PrintTypedDoc(const IfDoc& doc) { - MaybePrintCommentWithNewLine(doc); + MaybePrintCommenMultiLines(doc, true); output_ << "if "; PrintDoc(doc->predicate); output_ << ":"; @@ -538,7 +548,7 @@ void PythonDocPrinter::PrintTypedDoc(const IfDoc& doc) { } void PythonDocPrinter::PrintTypedDoc(const WhileDoc& doc) { - MaybePrintCommentWithNewLine(doc); + MaybePrintCommenMultiLines(doc, true); output_ << "while "; PrintDoc(doc->predicate); output_ << ":"; @@ -547,7 +557,7 @@ void PythonDocPrinter::PrintTypedDoc(const WhileDoc& doc) { } void PythonDocPrinter::PrintTypedDoc(const ForDoc& doc) { - MaybePrintCommentWithNewLine(doc); + MaybePrintCommenMultiLines(doc, true); output_ << "for "; if (const auto* tuple = doc->lhs.as()) { if (tuple->elements.size() == 1) { @@ -567,7 +577,7 @@ void PythonDocPrinter::PrintTypedDoc(const ForDoc& doc) { } void PythonDocPrinter::PrintTypedDoc(const ScopeDoc& doc) { - MaybePrintCommentWithNewLine(doc); + MaybePrintCommenMultiLines(doc, true); output_ << "with "; PrintDoc(doc->rhs); if (doc->lhs != nullptr) { @@ -642,6 +652,18 @@ void PythonDocPrinter::PrintTypedDoc(const ClassDoc& doc) { NewLineWithoutIndent(); } +void PythonDocPrinter::PrintTypedDoc(const CommentDoc& doc) { + if (doc->comment.defined()) { + MaybePrintCommenMultiLines(doc, false); + } +} + +void PythonDocPrinter::PrintTypedDoc(const DocStringDoc& doc) { + if (doc->comment.defined() && !doc->comment.value().empty()) { + output_ << "\"\"\"" << doc->comment.value() << "\"\"\""; + } +} + String DocToPythonScript(Doc doc, const PrinterConfig& cfg) { if (cfg->num_context_lines < 0) { cfg->num_context_lines = std::numeric_limits::max(); diff --git a/src/script/printer/ir/ir.cc b/src/script/printer/ir/ir.cc index 4a246e169276..7f7857dba671 100644 --- a/src/script/printer/ir/ir.cc +++ b/src/script/printer/ir/ir.cc @@ -119,7 +119,8 @@ std::string ReprPrintIRModule(const ObjectRef& mod, const 
PrinterConfig& cfg) { return s.value(); } } - Doc doc = IRDocsifier(cfg)->AsDoc(mod, ObjectPath::Root()); + IRDocsifier d(cfg); + Doc doc = HeaderWrapper(d, d->AsDoc(mod, ObjectPath::Root())); return DocToPythonScript(doc, cfg); } diff --git a/src/script/printer/ir/utils.h b/src/script/printer/ir/utils.h index d20756e6081a..a05030516f3f 100644 --- a/src/script/printer/ir/utils.h +++ b/src/script/printer/ir/utils.h @@ -36,6 +36,7 @@ namespace printer { /*! \brief Creates the IR common prefix, which is by default `I` */ inline ExprDoc IR(const IRDocsifier& d, const String& attr) { + d->ir_usage.insert("ir"); return IdDoc(d->cfg->ir_prefix)->Attr(attr); } diff --git a/src/script/printer/tir/function.cc b/src/script/printer/tir/function.cc index fbcc2fca3b4b..65f3db5b4fec 100644 --- a/src/script/printer/tir/function.cc +++ b/src/script/printer/tir/function.cc @@ -153,6 +153,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) if (implicit_root_block) { tir::Block root_block = implicit_root_block.value(); ObjectPath root_block_p = p->Attr("body")->Attr("body"); + (*frame)->stmts.push_back(CommentDoc("with T.block(\"root\"):")); // Handle root block `alloc_buffer` for (int i = 0, n = root_block->alloc_buffers.size(); i < n; ++i) { tir::Buffer buffer = root_block->alloc_buffers[i]; @@ -181,7 +182,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) }); std::string ReprPrintPrimFunc(const ObjectRef& obj, const PrinterConfig& cfg) { - Doc doc = IRDocsifier(cfg)->AsDoc(obj, ObjectPath::Root()); + IRDocsifier d(cfg); + Doc doc = HeaderWrapper(d, d->AsDoc(obj, ObjectPath::Root())); return DocToPythonScript(doc, cfg); } diff --git a/src/script/printer/tir/utils.h b/src/script/printer/tir/utils.h index 88094ee816ca..0eead9a57713 100644 --- a/src/script/printer/tir/utils.h +++ b/src/script/printer/tir/utils.h @@ -74,6 +74,7 @@ class TIRFrame : public Frame { /*! \brief Creates the TIR common prefix, which is by default `T` */ inline ExprDoc TIR(const IRDocsifier& d, const String& attr) { + d->ir_usage.insert("tir"); return IdDoc(d->cfg->tir_prefix)->Attr(attr); } diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h index cb20eb363ddd..e90fbc0fb39d 100644 --- a/src/script/printer/utils.h +++ b/src/script/printer/utils.h @@ -69,6 +69,26 @@ inline std::string DType2Str(const runtime::DataType& dtype) { return dtype.is_void() ? "void" : runtime::DLDataType2String(dtype); } +/*! 
\brief Add headers as comments to doc if needed */ +inline Doc HeaderWrapper(const IRDocsifier& d, const Doc& doc) { + if (d->ir_usage.size()) { + Array stmts; + if (d->ir_usage.count("ir")) { + stmts.push_back(CommentDoc("from tvm.script import ir as " + d->cfg->ir_prefix)); + } + if (d->ir_usage.count("tir")) { + stmts.push_back(CommentDoc("from tvm.script import tir as " + d->cfg->tir_prefix)); + } + if (d->ir_usage.count("relax")) { + stmts.push_back(CommentDoc("from tvm.script import relax as " + d->cfg->relax_prefix)); + } + stmts.push_back(CommentDoc("")); + stmts.push_back(Downcast(doc)); + return StmtBlockDoc(stmts); + } + return doc; +} + } // namespace printer } // namespace script } // namespace tvm diff --git a/tests/python/unittest/test_tvmscript_printer_doc.py b/tests/python/unittest/test_tvmscript_printer_doc.py index 16a0c31ac364..6353627c5814 100644 --- a/tests/python/unittest/test_tvmscript_printer_doc.py +++ b/tests/python/unittest/test_tvmscript_printer_doc.py @@ -29,7 +29,9 @@ AttrAccessDoc, CallDoc, ClassDoc, + CommentDoc, DictDoc, + DocStringDoc, ExprStmtDoc, ForDoc, FunctionDoc, @@ -505,6 +507,32 @@ def test_class_doc(decorators, body): assert list(doc.body) == body +@pytest.mark.parametrize( + "comment", + [ + "", + "test comment 1", + "test comment 1\ntest comment 1", + ], +) +def test_comment_doc(comment): + doc = CommentDoc(comment) + assert doc.comment == comment + + +@pytest.mark.parametrize( + "comment", + [ + "", + "test comment 1", + "test comment 1\ntest comment 1", + ], +) +def test_doc_string_doc(comment): + doc = DocStringDoc(comment) + assert doc.comment == comment + + def test_stmt_doc_comment(): doc = ExprStmtDoc(IdDoc("x")) assert doc.comment is None diff --git a/tests/python/unittest/test_tvmscript_printer_ir.py b/tests/python/unittest/test_tvmscript_printer_ir.py index c3da3d8c702b..6b3ac19a5ef8 100644 --- a/tests/python/unittest/test_tvmscript_printer_ir.py +++ b/tests/python/unittest/test_tvmscript_printer_ir.py @@ -37,6 +37,9 @@ def test_ir_module(): _assert_print( mod, """ +# from tvm.script import ir as I +# from tvm.script import tir as T + @I.ir_module class Module: @T.prim_func diff --git a/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py index d87f9ec69e05..75beb59d02cf 100644 --- a/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py +++ b/tests/python/unittest/test_tvmscript_printer_python_doc_printer.py @@ -23,7 +23,9 @@ AssignDoc, CallDoc, ClassDoc, + CommentDoc, DictDoc, + DocStringDoc, ExprStmtDoc, ForDoc, FunctionDoc, @@ -53,7 +55,7 @@ def format_script(s: str) -> str: non_empty_lines = [line for line in s.splitlines() if line and not line.isspace()] if not non_empty_lines: # no actual content - return "\n" + return "" line_indents = [len(line) - len(line.lstrip(" ")) for line in non_empty_lines] spaces_to_remove = min(line_indents) @@ -887,6 +889,58 @@ def test_print_class_doc(decorators, body, expected): assert to_python_script(doc) == format_script(expected) +@pytest.mark.parametrize( + "comment, expected", + [ + ( + "", + "", + ), + ( + "test comment 1", + "# test comment 1", + ), + ( + "test comment 1\ntest comment 2", + """ + # test comment 1 + # test comment 2 + """, + ), + ], + ids=itertools.count(), +) +def test_print_comment_doc(comment, expected): + doc = CommentDoc(comment) + assert to_python_script(doc) == format_script(expected) + + +@pytest.mark.parametrize( + "comment, expected", + [ + ( + "", + "", + ), + ( + "test 
comment 1", + '"""test comment 1"""', + ), + ( + "test comment 1\ntest comment 2", + ''' + """test comment 1 + test comment 2""" + ''', + ), + ], + ids=itertools.count(), +) +def test_print_doc_string_doc(comment, expected): + doc = DocStringDoc(comment) + assert to_python_script(doc) == format_script(expected) + + @pytest.mark.parametrize( "doc, comment, expected", [ diff --git a/tests/python/unittest/test_tvmscript_printer_tir.py b/tests/python/unittest/test_tvmscript_printer_tir.py index ec69c54396c3..49a33cd0f0e8 100644 --- a/tests/python/unittest/test_tvmscript_printer_tir.py +++ b/tests/python/unittest/test_tvmscript_printer_tir.py @@ -41,6 +41,8 @@ def test_prim_func(): _assert_print( func, expected=""" +# from tvm.script import tir as T + @T.prim_func def main(A: T.Buffer((128, 128), "float32"), B: T.Buffer((256, 256), "float32")): T.evaluate(0)""", @@ -62,6 +64,8 @@ def test_prim_func_no_sugar_inlined_buffer(): _assert_print( func, expected=""" +# from tvm.script import tir as T + @T.prim_func def main(a: T.handle, B: T.Buffer((256, 256), "float32")): A = T.match_buffer(a, (128, 128)) @@ -86,6 +90,8 @@ def test_prim_func_no_sugar_shared_buffer_data(): _assert_print( func, expected=""" +# from tvm.script import tir as T + @T.prim_func def main(a: T.handle, b: T.handle): A = T.match_buffer(a, (128, 128)) @@ -698,8 +704,12 @@ def block_with_remap_explicitly(): v3 = T.axis.spatial(128, i3 - 1) v4, v5 = T.axis.remap("RS", [i4, i5]) - expected_output = """@T.prim_func + expected_output = """ +# from tvm.script import tir as T + +@T.prim_func def main(): + # with T.block("root"): for i0, i1, i2, i3, i4, i5 in T.grid(128, 128, 128, 128, 128, 128): with T.block("update"): v0 = T.axis.spatial(128, i0 + 1) @@ -731,8 +741,12 @@ def root_block_explicitly(): with T.block(): T.evaluate(0) - expected_output = """@T.prim_func + expected_output = """ +# from tvm.script import tir as T + +@T.prim_func def main(): + # with T.block("root"): a = T.alloc_buffer((128, 128)) for i, j in T.grid(128, 128): with T.block(""): From 427b5486b6ce637f39c933b7da02f30c82b6decd Mon Sep 17 00:00:00 2001 From: chengven027-intellif Date: Fri, 27 Jan 2023 02:25:46 +0800 Subject: [PATCH 227/286] [ONNX] QGemm support (#13747) Co-authored-by: cheng.wen --- python/tvm/relay/frontend/onnx.py | 85 +++++++++++ tests/python/frontend/onnx/test_forward.py | 168 +++++++++++++++++++++ 2 files changed, 253 insertions(+) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index ffd31317e9f5..19854072d853 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -5091,6 +5091,90 @@ def _impl_v10(cls, inputs, attr, params): return out +class QGemm(OnnxOpConverter): + """Operator converter for QGemm.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + # https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.QGemm + + a = inputs[0] + a_scale = get_scalar(inputs[1], params) + a_zp = get_scalar(inputs[2], params, "int32") + + b = inputs[3] + # must be a scalar or 1D tensor which means a per-tensor or per-column quantization + # If 1-D tensor, number of elements should be equal to columns elements of input B + b_scale = get_scalar_or_1d_tensor(inputs[4], params) + b_zp = get_scalar_or_1d_tensor(inputs[5], params, "int32") + + # note that if optional and not provided then value will be None. 
+ C = inputs[6] + # must be null or a scalar or 1D tensor of size 1 + y_scale = inputs[7] + # must be null or a scalar or 1D tensor of size 1 + y_zp = get_scalar(inputs[8], params, "int32") + + assert len(infer_shape(a)) == 2 + assert len(infer_shape(b)) == 2 + # zero point and scale of input b should have same shape size + assert infer_shape(b_scale) == infer_shape(b_zp) + + alpha = float(attr.get("alpha", 1.0)) + transA = int(attr.get("transA", 0)) + transB = int(attr.get("transB", 0)) + + # get number of channels + channels = infer_channels(b, not transB) + a_dtype = infer_type(a).checked_type.dtype + + if transA: + a = _op.transpose(a, axes=(1, 0)) + if not transB: + b = _op.transpose(b, axes=(1, 0)) + + result = _qnn.op.dense( + a, + b, + a_zp, + b_zp, + a_scale, + b_scale, + channels, + ) + + if C: + result = _op.add(result, C) + + requantize_scale = _op.multiply(a_scale, b_scale) + if alpha != 1.0: + requantize_scale *= _expr.const(alpha, dtype="float32") + requantize_zp = _op.const(0, dtype="int32") + + if y_scale: + # requantize requires y_scale to be constant, + # if y_scale is not constant, doing dequantize -> quantize + if isinstance(y_scale, _expr.Constant): + y = _qnn.op.requantize( + result, + requantize_scale, + requantize_zp, + y_scale, + y_zp, + axis=-1, + rounding="TONEAREST", + out_dtype=a_dtype, + ) + else: + result_deq = _qnn.op.dequantize(result, requantize_scale, requantize_zp, axis=0) + + y = _qnn.op.quantize(result_deq, y_scale, y_zp, axis=0, out_dtype=a_dtype) + else: + y = _op.multiply(_op.cast(result, "float32"), requantize_scale) + + return y + + class QLinearAdd(OnnxOpConverter): """Operator converter for QLinearAdd from Microsoft onnxruntime contrib opset.""" @@ -6337,6 +6421,7 @@ def _get_convert_map(opset): "DequantizeLinear": DequantizeLinear.get_converter(opset), "DynamicQuantizeLinear": DynamicQuantizeLinear.get_converter(opset), "ReverseSequence": ReverseSequence.get_converter(opset), + "QGemm": QGemm.get_converter(opset), "QLinearConv": QLinearConv.get_converter(opset), "QLinearConcat": QLinearConcat.get_converter(opset), "QLinearAdd": QLinearAdd.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index f5b5f7c65cb5..a8aa4331d8c4 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -6213,6 +6213,174 @@ def verify_skiplayernormalization(input_, skip, gamma, beta, bias): verify_skiplayernormalization(input_array, skip, gamma, beta, bias) +@tvm.testing.known_failing_targets("cuda") +@tvm.testing.parametrize_targets +def test_qgemm(target, dev): + """test_qgemm""" + + def verify_qgemm( + a_shape, + b_shape, + y_shape, + C=False, + y_zp=False, + b_per_tensor_quantization=False, + alpha=1.0, + transA=0, + transB=1, + ): + a_array = np.random.randint(low=0, high=255, size=a_shape).astype("uint8") + b_array = np.random.uniform(low=0, high=255, size=b_shape).astype("uint8") + + input_nodes = [ + helper.make_tensor_value_info("a", TensorProto.UINT8, list(a_shape)), + helper.make_tensor_value_info("b", TensorProto.UINT8, list(b_shape)), + ] + + initializer = [ + helper.make_tensor("a_scale", TensorProto.FLOAT, (), [np.random.rand()]), + helper.make_tensor("a_zero_point", TensorProto.UINT8, (), [np.random.randint(0, 255)]), + ] + + input_names = [ + "a", + "a_scale", + "a_zero_point", + "b", + "b_scale", + "b_zero_point", + ] + input_values = [a_array, b_array] + + if b_per_tensor_quantization: + initializer.append( + 
helper.make_tensor("b_scale", TensorProto.FLOAT, (), [np.random.rand()]) + ) + initializer.append( + helper.make_tensor( + "b_zero_point", TensorProto.UINT8, (), [np.random.randint(0, 255)] + ) + ) + else: # per_colume_quantization + shape_value = b_shape[0] if transB else b_shape[1] + b_scale_array = np.random.random(shape_value).astype("float32") + w_zero_point_array = np.random.randint(0, 255, size=shape_value).astype("uint8") + initializer.append( + helper.make_tensor( + "b_scale", TensorProto.FLOAT, list(b_scale_array.shape), b_scale_array + ) + ) + initializer.append( + helper.make_tensor( + "b_zero_point", + TensorProto.UINT8, + list(w_zero_point_array.shape), + w_zero_point_array, + ) + ) + + output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, list(y_shape)) + + if C is True: + C_shape = (b_shape[0] if transB else b_shape[1],) + C_array = np.random.randint(low=0, high=65536, size=C_shape).astype("int32") + input_nodes.append(helper.make_tensor_value_info("C", TensorProto.INT32, list(C_shape))) + input_names.append("C") + input_values.append(C_array) + + if y_zp is True: + input_names.append("y_scale") + initializer.append( + helper.make_tensor("y_scale", TensorProto.FLOAT, (), [np.random.rand()]) + ) + + input_names.append("y_zero_point") + initializer.append( + helper.make_tensor( + "y_zero_point", TensorProto.UINT8, (), [np.random.randint(0, 255)] + ) + ) + + output_tensor = helper.make_tensor_value_info( + "output", TensorProto.UINT8, list(y_shape) + ) + + kwargs = {} + kwargs["alpha"] = alpha + kwargs["transA"] = transA + kwargs["transB"] = transB + + node = helper.make_node( + "QGemm", + inputs=input_names, + outputs=["output"], + domain="com.microsoft", + # Default values for other attributes: + **kwargs, + ) + + graph = helper.make_graph( + [node], + "QGemm", + inputs=input_nodes, + outputs=[output_tensor], + initializer=initializer, + ) + model = helper.make_model( + graph, + producer_name="QGemm", + opset_imports=[ + onnx.helper.make_opsetid("com.microsoft", 1), + ], + ) + + verify_with_ort_with_inputs(model, input_values, target=target, dev=dev) + + # B per tensor quantization + verify_qgemm( + (20, 30), + (50, 30), + (20, 50), + True, + True, + True, + ) + + # B per column quantization + verify_qgemm( + (20, 30), + (50, 30), + (20, 50), + True, + True, + False, + ) + + # test alpha + verify_qgemm( + (20, 30), + (50, 30), + (20, 50), + True, + True, + True, + 0.5, + ) + + # test transpose A + verify_qgemm( + (20, 50), + (20, 80), + (50, 80), + True, + True, + True, + 0.5, + 1, + 0, + ) + + @tvm.testing.known_failing_targets("cuda") @tvm.testing.parametrize_targets def test_qlinearconv(target, dev): From 4f1f8c593beca530c9274be9bea0d765e69b657b Mon Sep 17 00:00:00 2001 From: Alan MacDonald Date: Thu, 26 Jan 2023 13:12:56 -0800 Subject: [PATCH 228/286] [microTVM] Fix host-driven AOT memory workspaces (#13807) When using host-driven AOT with memory pools enabled, the workspace and constant memory were not properly supported. In order for them to work properly, the _run function (typically tvmgen_default_run()) needed to be called instead of tvmgen_default___tvm_main__() in order to properly setup the memory workspace pointers. 
fixes #13777 --- src/runtime/crt/aot_executor/aot_executor.c | 13 +------------ src/target/source/source_module.cc | 16 +++++++++++++--- tests/python/unittest/test_crt.py | 18 +++++------------- .../test_micro_model_library_format.py | 1 - 4 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/runtime/crt/aot_executor/aot_executor.c b/src/runtime/crt/aot_executor/aot_executor.c index a40c1d530fa9..ae007037e6cc 100644 --- a/src/runtime/crt/aot_executor/aot_executor.c +++ b/src/runtime/crt/aot_executor/aot_executor.c @@ -83,7 +83,7 @@ int TVMAotExecutor_GetInputIndex(TVMAotExecutor* executor, const char* name) { } int TVMAotExecutor_Run(TVMAotExecutor* executor) { - const char* tvm_main_suffix = "___tvm_main__"; + const char* tvm_main_suffix = "_run"; char tvm_main_name[TVM_CRT_MAX_STRLEN_FUNCTION_NAME]; { @@ -203,17 +203,6 @@ int TVMAotExecutor_Init(TVMAotExecutor* executor, TVMModuleHandle module_handle, TVMNDArray_IncrementReference(array); } - for (i = 0; i < md->num_workspace_pools; ++i) { - LOG_DEBUG("pools allocate[%d]: %s\n", i, md->workspace_pools[i].name); - - status = TVMNDArray_Empty(md->workspace_pools[i].num_shape, md->workspace_pools[i].shape, - md->workspace_pools[i].dtype, executor->device, - &executor->args[arg_idx++]); - if (status != 0) { - return status; - } - } - CHECK_EQ(0, md->num_constant_pools, "Constant pools not supported"); return status; } diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index ccc15fc1ee49..ee5a7cd33de9 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -929,11 +929,21 @@ runtime::Module CreateCSourceCrtMetadataModule(const Array& mod relay::backend::ExecutorCodegenMetadata metadata, runtime::metadata::Metadata aot_metadata) { Array final_modules(modules); - if (aot_metadata.defined()) { - final_modules.push_back(CreateAotMetadataModule(aot_metadata, true)); + Array func_names; + + if (metadata.defined()) { + if (metadata->executor == "aot") { + if (aot_metadata.defined()) { + final_modules.push_back(CreateAotMetadataModule(aot_metadata, true)); + } + + // add the run function (typically "tvmgen_default_run") to function registry + // when using AOT executor + std::string run_func = runtime::get_name_mangled(metadata->mod_name, "run"); + func_names.push_back(run_func); + } } - Array func_names; for (runtime::Module mod : final_modules) { auto pf_funcs = mod.GetFunction("get_func_names"); if (pf_funcs != nullptr) { diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 83fa91af06c9..3309aad0a5db 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -229,15 +229,10 @@ def do_test(): do_test() -enable_usmp, expect_exception = tvm.testing.parameters((True, True), (False, False)) - - @tvm.testing.requires_micro -def test_aot_executor_usmp_const_pool(enable_usmp, expect_exception): - """Test the AOT executor with microTVM using usmp. 
- Test should fail if const pool is supplied to executor - as these are currently not supported - """ +def test_aot_executor_usmp_const_pool(): + """Test the AOT executor with microTVM using USMP to generate a constant data pool.""" + ws_root = pathlib.Path(os.path.dirname(__file__) + "/micro-workspace-usmp") if ws_root.exists(): shutil.rmtree(ws_root) @@ -260,7 +255,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8], %c : Tensor[(1 C_np = np.array([[8, 9]], dtype="uint8").astype(type_dict["c"]) params = {"c": C_np} with tvm.transform.PassContext( - opt_level=3, config={"tir.disable_vectorize": True, "tir.usmp.enable": enable_usmp} + opt_level=3, config={"tir.disable_vectorize": True, "tir.usmp.enable": True} ): factory = tvm.relay.build( relay_mod, @@ -278,10 +273,7 @@ def do_test(): ) ) except tvm._ffi.base.TVMError as e: - if expect_exception: - return - else: - raise e + raise e assert aot_executor.get_input_index("a") == 0 assert aot_executor.get_input_index("b") == 1 diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py index e664c2ebb858..39919f337197 100644 --- a/tests/python/unittest/test_micro_model_library_format.py +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -618,7 +618,6 @@ def test_multiple_relay_modules_aot_graph(): assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib0.c")) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib1.c")) - assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib2.c")) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib0.c")) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib1.c")) From d2bacff6b730f5b7b42d1218aa64a203aaad811e Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 27 Jan 2023 08:54:33 +0900 Subject: [PATCH 229/286] [ROCM] Fixes compiling on ROCM 5 and accuracy on dense op (#13847) Fixes https://github.com/apache/tvm/issues/13666 * Some bitcode files need to be updated. * There is a strange, device-dependent accuracy issue when using the default topi `dense` op schedule on AMDGPU (see the issue above). I confirmed that the other schedule works fine.
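A rough reproduction sketch of the accuracy check (shapes, tolerances, and the device index are illustrative, not taken from the issue):

```python
# Compare the dense op built for ROCm against a NumPy reference.
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

data = relay.var("data", shape=(64, 128), dtype="float32")
weight = relay.var("weight", shape=(256, 128), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight))

d_np = np.random.randn(64, 128).astype("float32")
w_np = np.random.randn(256, 128).astype("float32")

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="rocm")

dev = tvm.device("rocm", 0)
rt = graph_executor.GraphModule(lib["default"](dev))
rt.set_input("data", d_np)
rt.set_input("weight", w_np)
rt.run()
# On the affected cards, the small-batch schedule produced mismatches here.
np.testing.assert_allclose(rt.get_output(0).numpy(), d_np @ w_np.T, rtol=1e-4, atol=1e-4)
```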
--- python/tvm/contrib/rocm.py | 4 ++-- python/tvm/relay/op/strategy/cuda.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index 4f62f1a8da26..372281dbabf1 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -141,9 +141,7 @@ def callback_rocm_bitcode_path(rocdl_dir=None): bitcode_names = [ "oclc_daz_opt_on", "ocml", - "hc", "irif", # this does not exist in rocm 3.9, drop eventually - "ockl", "oclc_correctly_rounded_sqrt_off", "oclc_correctly_rounded_sqrt_on", "oclc_daz_opt_off", @@ -152,9 +150,11 @@ def callback_rocm_bitcode_path(rocdl_dir=None): "oclc_isa_version_803", # todo (t-vi): an alternative might be to scan for the "oclc_isa_version_900", # isa version files (if the linker throws out "oclc_isa_version_906", # the unneeded ones or we filter for the arch we need) + "oclc_isa_version_1030", "oclc_unsafe_math_off", "oclc_unsafe_math_on", "oclc_wavefrontsize64_on", + "oclc_abi_version_500", ] bitcode_files = [] diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index cc438092666a..fa295c93a19f 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -918,13 +918,16 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): name="dense_int8.cuda", ) else: - strategy.add_implementation( - wrap_compute_dense(topi.gpu.dense_small_batch), - wrap_topi_schedule(topi.gpu.schedule_dense_small_batch), - name="dense_small_batch.gpu", - ) + # Some AMDGPU cards have accuracy issues with this schedule + # See https://github.com/apache/tvm/issues/13666 + if target.kind.name != "rocm": + strategy.add_implementation( + wrap_compute_dense(topi.gpu.dense_small_batch), + wrap_topi_schedule(topi.gpu.schedule_dense_small_batch), + name="dense_small_batch.gpu", + ) - with SpecializedCondition(b >= 32): + with SpecializedCondition(target.kind.name == "rocm" or b >= 32): strategy.add_implementation( wrap_compute_dense(topi.gpu.dense_large_batch), wrap_topi_schedule(topi.gpu.schedule_dense_large_batch), From 9c382e17d2bba2ad8066a1fd0c7ad57464574468 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 27 Jan 2023 15:01:07 +0900 Subject: [PATCH 230/286] [MetaSchedule] Fix for RewriteLayout + AllocateConst when the rank of the rewritten weight doesn't change (#13851) [MetaSchedule] Fix for RewriteLayout + AllocateConst when the rank of the rewritten weight doesn't change --- src/relay/backend/te_compiler_cache.cc | 21 +++++- .../test_meta_schedule_relay_integration.py | 74 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 154101fc94fe..c680c5a77e04 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -576,7 +576,26 @@ class ScheduleBuilder : public ExprVisitor { << "Only one layout-free constant is supported by RewriteLayout for now"; auto constant = const_collector.constants[0]; - if (constant.Shape().size() == index_map->initial_indices.size()) { + auto is_constant_transformed = [index_map](runtime::NDArray c) { + if (c.Shape().size() != index_map->initial_indices.size()) { + return true; + } + size_t src_size_1d = 1; + Array orig_shape; + for (size_t i = 0; i < c.Shape().size(); ++i) { + src_size_1d *= c->shape[i]; + orig_shape.push_back(PrimExpr(static_cast((c->shape[i])))); + } + auto dst_shape = index_map->MapShape(orig_shape); + std::vector dst_shape_int; 
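+          // The layout rewrite may keep the constant's rank unchanged, so rank alone
+          // cannot tell whether it was already rewritten; compare the flattened element
+          // count of the original shape against the mapped shape instead.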
+ size_t dst_size_1d = 1; + for (size_t i = 0; i < dst_shape.size(); ++i) { + dst_size_1d *= dst_shape[i].as()->value; + } + return src_size_1d != dst_size_1d; + }; + + if (!is_constant_transformed(constant)) { // This is the first case, reached during the MetaScheduleLayoutRewrite pass. // // A layout-free constant having the same rank as an input to the index map diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index 795890de083e..8cd58e5a6f36 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -880,5 +880,79 @@ def test_disabled_pass_param(): pytest.fail("'disabled_pass' argument does not work") +def test_rewrite_layout_link_params_1x1_conv2d(): + I, O, H, W = 32, 16, 256, 256 + kH = kW = 1 + + strides = (1, 1) + padding = (0, 0) + + data_shape = (1, H, W, I) + w_shape = (kH, kW, I, O) + + data = relay.var("data", shape=data_shape, dtype="float32") + weight = relay.var("weight", shape=w_shape, dtype="float32") + + conv = relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=(kH, kW), + channels=O, + padding=padding, + strides=strides, + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="float32", + ) + + mod = tvm.IRModule.from_expr(conv) + + weight_np = np.random.randn(*w_shape).astype("float32") + + params = {"weight": weight_np} + + data_np = np.random.randn(*data_shape).astype("float32") + + ref = ( + relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm") + .evaluate()(*[data_np, weight_np]) + .numpy() + ) + + link_params = True + + target = "llvm --num-cores=4" + + executor = relay.backend.Executor("graph", {"link-params": link_params}) + mod = mod.with_attr("executor", executor) + + with tempfile.TemporaryDirectory() as work_dir: + database = ms.relay_integration.tune_relay( + mod=mod, + target=target, + params=params, + work_dir=work_dir, + max_trials_global=8, + strategy="replay-trace", + ) + + lib = ms.relay_integration.compile_relay( + database=database, + mod=mod, + target=target, + params=params, + ) + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + runtime.set_input("data", data_np) + runtime.run() + + out = runtime.get_output(0).numpy() + + np.testing.assert_allclose(ref, out, rtol=1e-4, atol=1e-4) + + if __name__ == "__main__": tvm.testing.main() From f2b62836bbc743d230044c3cdbd2a973ab6474b2 Mon Sep 17 00:00:00 2001 From: krishnaraj36 Date: Fri, 27 Jan 2023 14:09:22 +0530 Subject: [PATCH 231/286] [CLML][RUNTIME] Enable more ops in CLML runtime (#13834) Enable the DepthToSpace and Resize bilinear operator in CLML runtime and bug fix in concat layer --- python/tvm/relay/op/contrib/clml.py | 16 +++- src/runtime/contrib/clml/clml_runtime.cc | 67 +++++++++++++- tests/python/contrib/test_clml/test_ops.py | 102 +++++++++++++++++++++ 3 files changed, 183 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 1ec9b298abe5..ec8cbb63209f 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -316,6 +316,18 @@ def check_softmax_op(extract): return False return True + def check_upsampling_op(extract): + call = extract + if call.attrs["method"] != "bilinear": + return False + return True + + def check_concat_op(extract): + call = extract + if call.attrs["axis"] != 1: + return False + return True + def 
check_default_op(extract): return True @@ -324,7 +336,7 @@ def check_default_op(extract): ("clml.conv2d", conv_pattern(), check_conv), ("clml.dense", dense_pattern(), check_default_op), ("clml.pad", pad_pattern(), check_pad_op), - ("clml.concat", concat_pattern(), check_default_op), + ("clml.concat", concat_pattern(), check_concat_op), ("clml.batch_norm", batch_norm_pattern(), check_default_op), ("clml.add", is_op("add")(wildcard(), wildcard()), check_binary_op), ("clml.subtract", is_op("subtract")(wildcard(), wildcard()), check_binary_op), @@ -341,6 +353,8 @@ def check_default_op(extract): ("clml.relu", is_op("nn.relu")(wildcard()), check_default_op), ("clml.clip", is_op("clip")(wildcard()), check_default_op), ("clml.batch_flatten", is_op("nn.batch_flatten")(wildcard()), check_default_op), + ("clml.depth_to_space", is_op("nn.depth_to_space")(wildcard()), check_default_op), + ("clml.upsampling", is_op("nn.upsampling")(wildcard()), check_upsampling_op), ] diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 1fb694a91201..0987eefdc9c0 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -430,6 +430,14 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreateBinaryLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("nn.depth_to_space" == op_name) { + auto out = CreateDepthToSpaceLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); + } else if ("nn.upsampling" == op_name) { + auto out = CreateResizeLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -1151,13 +1159,14 @@ class CLMLRuntime : public JSONRuntimeBase { cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); int inputSize = input_.size(); auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + cl_uint axis = std::stoi(node.GetAttr>("axis")[0]); cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize]; for (int i = 0; i < inputSize; i++) { auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); concatInputs[i] = input->tensor; } - cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)inputSize, cl_arithmetic_mode}; + cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs, output->tensor, &op, tuning_cache); @@ -1301,6 +1310,62 @@ class CLMLRuntime : public JSONRuntimeBase { return output; } + /*! + * \brief Create a DepthToSpace(X) layer. + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output. + * \param node The JSON representation of the operator. 
+ */ + std::shared_ptr CreateDepthToSpaceLayer( + CachedLayer* layer, const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + cl_uint block_size = std::stoi(node.GetAttr>("block_size")[0]); + + cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode}; + result = h_ClmlIntf->clCreateMLOpDepthToSpaceQCOM( + workspace->context, 0, &dtos_desc, input->tensor, output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "DepthToSpace Layer Error:" << result; + + layer_.func_ins.push_back(input); + layer->function.push_back(op); + return output; + } + + /*! + * \brief Create a Resize(X) layer. + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output. + * \param node The JSON representation of the operator. + */ + std::shared_ptr CreateResizeLayer(CachedLayer* layer, + const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + cl_bool align_corners = std::stoi(node.GetAttr>("align_corners")[0]); + + cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode}; + result = h_ClmlIntf->clCreateMLOpResizeBilinearQCOM( + workspace->context, 0, &resize_desc, input->tensor, output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "Resize Layer Error:" << result; + + layer_.func_ins.push_back(input); + layer->function.push_back(op); + return output; + } + /*! * \brief The network layers represented by acl functions. * \note Currently only supports a single layer. diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py index c4ec2603249b..b8177435a0dc 100644 --- a/tests/python/contrib/test_clml/test_ops.py +++ b/tests/python/contrib/test_clml/test_ops.py @@ -574,5 +574,107 @@ def _verify(out, params, inputs): _verify(*(_get_model((1, 16), relay.nn.relu))) +@pytest.mark.parametrize("dtype", ["float32", "float16"]) +@tvm.testing.requires_openclml +def test_depth_to_space(device, dtype): + def _get_model(a_shape, block_size): + a = relay.var("a", shape=(a_shape), dtype=dtype) + out = relay.nn.depth_to_space(a, block_size) + inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))} + params = {} + return out, params, inputs + + def _verify(out, params, inputs): + mod = IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + # Check to make sure these ops are offloaded to CLML instead of TVM. 
+ exp_codegen = [ + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(inputs["a"].shape)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "block_size": [[str(int(out.attrs.block_size))]], + "layout": [["NCHW"]], + "mode": [["DCR"]], + "dtype": [[dtype]], + "num_inputs": "1", + "num_outputs": "1", + "shape": [[list(clml_out[0].shape)]], + }, + "inputs": [[0, 0, 0]], + "name": "nn.depth_to_space", + "op": "kernel", + }, + ] + verify_codegen(out, exp_codegen, device, params) + + _verify(*(_get_model((1, 64, 8, 8), 4))) + _verify(*(_get_model((1, 64, 8, 8), 8))) + + +@pytest.mark.parametrize("dtype", ["float32", "float16"]) +@tvm.testing.requires_openclml +def test_resize_bilinear(device, dtype): + def _get_model(a_shape, scale, align_corners): + a = relay.var("a", shape=(a_shape), dtype=dtype) + out = relay.nn.upsampling( + a, scale_h=scale[0], scale_w=scale[1], method="bilinear", align_corners=align_corners + ) + inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))} + params = {} + return out, params, inputs + + def _verify(out, params, inputs): + mod = IRModule.from_expr(out) + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + # Check to make sure these ops are offloaded to CLML instead of TVM. + exp_codegen = [ + { + "attrs": { + "dtype": [[dtype]], + "shape": [[list(inputs["a"].shape)]], + }, + "name": "", + "op": "input", + }, + { + "attrs": { + "scale_h": [[str(int(out.attrs.scale_h))]], + "scale_w": [[str(int(out.attrs.scale_w))]], + "layout": [["NCHW"]], + "method": [[out.attrs.method]], + "align_corners": [[str(out.attrs.align_corners)]], + "dtype": [[dtype]], + "num_inputs": "1", + "num_outputs": "1", + "shape": [[list(clml_out[0].shape)]], + }, + "inputs": [[0, 0, 0]], + "name": "nn.upsampling", + "op": "kernel", + }, + ] + verify_codegen(out, exp_codegen, device, params) + + _verify(*(_get_model((1, 16, 8, 8), (2, 2), False))) + _verify(*(_get_model((1, 16, 7, 7), (2, 2), True))) + + if __name__ == "__main__": tvm.testing.main() From 08d202fdef6ef024c5eae246ab58e22bae1ac3af Mon Sep 17 00:00:00 2001 From: balaram-cadence <76058410+balaram-cadence@users.noreply.github.com> Date: Fri, 27 Jan 2023 03:09:56 -0600 Subject: [PATCH 232/286] [Relay] Convert negative axes to positive when importing ONNX Unsqueeze (#13846) --- python/tvm/relay/frontend/onnx.py | 5 ++- tests/python/frontend/onnx/test_forward.py | 39 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 19854072d853..ed99176282de 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -2540,6 +2540,8 @@ class Unsqueeze(OnnxOpConverter): def run_calculation(cls, tensor, axes): axes = sorted(axes) for axis in axes: + if axis < 0 and isinstance(tensor, _expr.Var): + axis = len(tensor.type_annotation.concrete_shape) + len(axes) + axis tensor = _op.expand_dims(tensor, axis=axis, num_newaxis=1) return tensor @@ -2558,6 +2560,7 @@ def _impl_v13(cls, inputs, attr, params): num_new_axis = int(infer_type(inputs[1]).checked_type.shape[0]) axes = relay.sort(inputs[1]) axes = relay.split(axes, num_new_axis).astuple() + rank_output = rank_input + num_new_axis result = inputs[0] # TODO (AndrewZhaoLuo): investigate performance issues with 
consecutive @@ -2567,7 +2570,7 @@ def _impl_v13(cls, inputs, attr, params): # Unpack scalar axis = relay.reshape(axis, []) axis = relay.where( - axis >= relay.const(0, "int64"), axis, axis + relay.const(rank_input, "int64") + axis >= relay.const(0, "int64"), axis, axis + relay.const(rank_output, "int64") ) result = _op.expand_dims(result, axis) return result diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index a8aa4331d8c4..ebb682190140 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -676,6 +676,45 @@ def test_unsqueeze(target, dev): verify_with_ort(model, [in_shape], target=target, dev=dev, opset=11) +@tvm.testing.parametrize_targets +def test_unsqueeze_with_neg_axes(target, dev): + def verify_unsqueeze_with_neg_axes(opset=11): + in_shape = (2, 3, 4) + axis = (-2, -1) + out_shape = (2, 3, 4, 1, 1) + if opset < 13: + y = helper.make_node("Unsqueeze", ["in"], ["out"], axes=list(axis)) + nodes = [y] + else: + axes = np.array(list(axis)).astype(np.int64) + axes = helper.make_node( + "Constant", + inputs=[], + outputs=["axes"], + value=onnx.helper.make_tensor( + name="const_axes", + data_type=onnx.TensorProto.INT64, + dims=axes.shape, + vals=axes.flatten().astype(int), + ), + ) + y = helper.make_node("Unsqueeze", ["in", "axes"], ["out"]) + nodes = [axes, y] + + graph = helper.make_graph( + nodes, + "squeeze_test", + inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], + ) + + model = helper.make_model(graph, producer_name="squeeze_test") + verify_with_ort(model, [in_shape], target=target, dev=dev, opset=opset) + + verify_unsqueeze_with_neg_axes() + verify_unsqueeze_with_neg_axes(opset=13) + + @tvm.testing.parametrize_targets def test_gather(target, dev): """test_gather""" From 56209267f780aa2290137f5b934a9e9e17faa3ee Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 27 Jan 2023 09:20:26 +0000 Subject: [PATCH 233/286] [ETHOSN] Apply FoldConstant before NPU partitioning (#13848) Introduced FoldConstant before NPU partitioning. Added a qnn.add test where both inputs are constants. Updated the number of operators remaining in the host code for ssd_mobilenet_v1 as the FoldConstant reduces the number of operators. 
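A minimal sketch of the folding behaviour (shapes and quantization parameters are made up):

```python
# With fold_qnn=True, a qnn.add whose operands are both constants is folded
# away before partitioning, so the NPU never sees it.
import numpy as np
import tvm
from tvm import relay

lhs = relay.const(np.ones((1, 4, 4, 8), dtype="uint8"))
rhs = relay.const(np.full((1, 1, 1, 8), 3, dtype="uint8"))
add = relay.qnn.op.add(
    lhs,
    rhs,
    lhs_scale=relay.const(0.5),
    lhs_zero_point=relay.const(0),
    rhs_scale=relay.const(0.5),
    rhs_zero_point=relay.const(0),
    output_scale=relay.const(0.5),
    output_zero_point=relay.const(0),
)
mod = tvm.IRModule.from_expr(add)
mod = relay.transform.FoldConstant(fold_qnn=True)(mod)
print(mod)  # the qnn.add has been replaced by a single constant
```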
--- python/tvm/relay/op/contrib/ethosn.py | 1 + .../contrib/test_ethosn/test_addition.py | 68 +++++++++++++++---- .../contrib/test_ethosn/test_networks.py | 2 +- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 3e10f3d60415..7acaee9706c2 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -129,6 +129,7 @@ def partition_for_ethosn(mod, params=None, **opts): passes = [ transform.InferType(), + transform.FoldConstant(fold_qnn=True), transform.MergeComposite(pattern_table()), transform.AnnotateTarget("ethos-n"), transform.MergeCompilerRegions(), diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py index 9841e798aff4..5813ef7b9d44 100644 --- a/tests/python/contrib/test_ethosn/test_addition.py +++ b/tests/python/contrib/test_ethosn/test_addition.py @@ -41,20 +41,28 @@ def _get_model( ): """Return a model and any parameters it may have""" - iinfo = np.iinfo(dtype) - data_min = iinfo.min - data_max = iinfo.max + def create_or_assign_constant(shape, dtype, default_data): + """Creates new numpy array or assigns default_data if available.""" + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + + nparray = None + if default_data: + nparray = np.array(default_data, dtype=dtype).reshape(shape) + else: + nparray = np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype) + + return relay.const(nparray, dtype=dtype) if lhs_is_constant: - a_data = np.array(constant_data, dtype=dtype).reshape(lhs_shape) - a = relay.const(a_data, dtype=dtype) + a = create_or_assign_constant(lhs_shape, dtype, constant_data) else: a = relay.var("a", shape=lhs_shape, dtype=dtype) if rhs_is_constant: - b_data = np.array(constant_data, dtype=dtype).reshape(rhs_shape) - np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype) - b = relay.const(b_data, dtype=dtype) + b = create_or_assign_constant(rhs_shape, dtype, constant_data) else: b = relay.var("b", shape=rhs_shape, dtype=dtype) @@ -125,6 +133,46 @@ def test_addition(dtype, shape): tei.verify(outputs, dtype, 1) +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "lhs_shape,lhs_is_constant,rhs_shape,rhs_is_constant", + [ + ((1, 4, 4, 8), True, (1, 1, 1, 8), True), + ((4,), True, (1, 16, 12, 4), True), + ((1, 1, 1, 8), True, (1, 4, 4, 8), True), + ((1, 16, 12, 4), True, (4,), True), + ], +) +def test_addition_both_inputs_constants( + dtype, lhs_shape, lhs_is_constant, rhs_shape, rhs_is_constant +): + """Check if addition is simplified when both inputs are constants.""" + np.random.seed(0) + + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) + + model = _get_model( + lhs_shape, + rhs_shape, + lhs_zp, + lhs_sc, + rhs_zp, + rhs_sc, + out_zp, + out_sc, + dtype, + lhs_is_constant=lhs_is_constant, + rhs_is_constant=rhs_is_constant, + ) + from tvm.relay.op.contrib import partition_for_ethosn # pylint: disable=import-outside-toplevel + + mod = tei.make_module(model, {}) + assert "qnn.add" in mod.astext(False) + mod = partition_for_ethosn(mod, {}) + assert "qnn.add" not in mod.astext(False) + + @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) @pytest.mark.parametrize( @@ -145,9 +193,6 @@ def test_addition_to_depthwise(dtype, lhs_shape, lhs_is_constant, rhs_shape, rhs data_max = iinfo.max lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = 
_get_addition_qnn_params(dtype)

-    constant_shape = lhs_shape if lhs_is_constant else rhs_shape
-    constant_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype)
-
     model = _get_model(
         lhs_shape,
         rhs_shape,
@@ -160,7 +205,6 @@ def test_addition_to_depthwise(dtype, lhs_shape, lhs_is_constant, rhs_shape, rhs
         dtype,
         lhs_is_constant=lhs_is_constant,
         rhs_is_constant=rhs_is_constant,
-        constant_data=constant_data,
     )
     input_shape = rhs_shape if lhs_is_constant else lhs_shape
     input_name = "b" if lhs_is_constant else "a"
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 23ff5207fbcd..dfbd262abf96 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -218,6 +218,6 @@ def test_ssd_mobilenet_v1():
         input_dict={"normalized_input_image_tensor": (1, 300, 300, 3)},
         compile_hash=_compile_hash,
         output_count=4,
-        host_ops=26,
+        host_ops=14,
         npu_partitions=1,
     )

From 4a992b5116651161aef142951476e416d2dd77c0 Mon Sep 17 00:00:00 2001
From: joshherr-quic <95375797+joshherr-quic@users.noreply.github.com>
Date: Fri, 27 Jan 2023 12:56:04 -0600
Subject: [PATCH 234/286] [Hexagon][CI] Updated sha for builder LLVM (#13418)

Updated the SHA to deal with some codegen issues that came up with the
last version.

---
 docker/install/ubuntu_install_hexagon.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docker/install/ubuntu_install_hexagon.sh b/docker/install/ubuntu_install_hexagon.sh
index 722cfaa40cb3..57807398a7f7 100755
--- a/docker/install/ubuntu_install_hexagon.sh
+++ b/docker/install/ubuntu_install_hexagon.sh
@@ -21,7 +21,7 @@ set -o pipefail

 # Install LLVM/clang
 CLANG_LLVM_HOME=/opt/clang-llvm
-LLVM_SHA=361a27c155ec8b222e3318488a208c0eb39624c8
+LLVM_SHA=a9871772a8b13c1240a95a84a3327f84bb67dddc

 mkdir llvm-hexagon
 pushd llvm-hexagon
@@ -37,8 +37,7 @@ cmake \
   -DCMAKE_INSTALL_PREFIX=${CLANG_LLVM_HOME} \
   -DLLVM_ENABLE_ASSERTIONS=ON \
   -DLLVM_TARGETS_TO_BUILD:STRING="Hexagon;X86" \
-  -DLLVM_ENABLE_PROJECTS:STRING="clang;llvm" \
-  -DTARGET_TRIPLE=x86_64-unknown-linux-gnu \
+  -DLLVM_ENABLE_PROJECTS:STRING="llvm" \
   -DLLVM_DEFAULT_TARGET_TRIPLE=x86_64-unknown-linux-gnu \
   ../llvm
 ninja install

From 998f8941c306d905c960c5962a1d7918b6844dd7 Mon Sep 17 00:00:00 2001
From: Mehrdad Hessar
Date: Fri, 27 Jan 2023 14:13:21 -0800
Subject: [PATCH 235/286] [microTVM] Update tutorials (#13845)

This PR updates the microTVM tutorials to use the updated APIs. It also
adds an ordering to the tutorials that is useful for first-time users.
The RVM tutorial is removed, as it is no longer supported.
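As a minimal sketch of the updated target API the tutorials now follow (the
board name is only an example; any Zephyr-supported board in boards.json
works):

    import tvm.micro.testing

    # Host-emulated C runtime: no board argument is needed.
    target = tvm.micro.testing.get_target("crt")

    # Zephyr hardware: get_target looks up the board's model in the
    # platform's boards.json, so tutorials no longer parse that file
    # themselves.
    target = tvm.micro.testing.get_target("zephyr", "nucleo_l4r5zi")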
--- docs/conf.py | 12 +- docs/topic/microtvm/index.rst | 11 +- .../how_to/work_with_microtvm/micro_aot.py | 17 +- .../work_with_microtvm/micro_autotune.py | 13 +- .../how_to/work_with_microtvm/micro_ethosu.py | 6 +- .../work_with_microtvm/micro_mlperftiny.py | 7 +- .../work_with_microtvm/micro_pytorch.py | 18 +- .../work_with_microtvm/micro_reference_vm.py | 159 ------------------ .../how_to/work_with_microtvm/micro_tflite.py | 72 ++++---- .../how_to/work_with_microtvm/micro_train.py | 9 +- .../how_to/work_with_microtvm/micro_tvmc.sh | 43 ++--- python/tvm/micro/testing/utils.py | 8 +- tests/scripts/request_hook/request_hook.py | 2 +- 13 files changed, 105 insertions(+), 272 deletions(-) delete mode 100644 gallery/how_to/work_with_microtvm/micro_reference_vm.py diff --git a/docs/conf.py b/docs/conf.py index eb2b39d4b1fd..8d24f05b9b3e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -511,15 +511,15 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): "use_pass_instrument.py", "bring_your_own_datatypes.py", ], - "micro": [ - "micro_train.py", - "micro_autotune.py", - "micro_reference_vm.py", - "micro_tflite.py", - "micro_ethosu.py", + "work_with_microtvm": [ "micro_tvmc.py", + "micro_tflite.py", "micro_aot.py", "micro_pytorch.py", + "micro_train.py", + "micro_autotune.py", + "micro_ethosu.py", + "micro_mlperftiny.py", ], } diff --git a/docs/topic/microtvm/index.rst b/docs/topic/microtvm/index.rst index ebcadb34427c..4dd4ab5d511d 100644 --- a/docs/topic/microtvm/index.rst +++ b/docs/topic/microtvm/index.rst @@ -50,13 +50,12 @@ Getting Started with microTVM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Before working with microTVM, we recommend you have a supported development board. Then, follow these -tutorials to get started with microTVM: +tutorials to get started with microTVM. Tutorials are in the order that could help developers to learn +more as they follow through them. Here is a list of tutorials that you can start with: -1. :ref:`Start the microTVM Reference VM `. The microTVM tutorials - depend on Zephyr and on a compiler toolchain for your hardware. The reference VM is a convenient - way to install those dependencies. -2. Try the :ref:`microTVM with TFLite Tutorial `. -3. Try running a more complex `CIFAR10-CNN model `_. +1. Try :ref:`microTVM CLI Tool `. +2. Try the :ref:`microTVM TFLite Tutorial `. +3. Try running a more complex tutorial: :ref:`Creating Your MLPerfTiny Submission with microTVM `. How microTVM Works diff --git a/gallery/how_to/work_with_microtvm/micro_aot.py b/gallery/how_to/work_with_microtvm/micro_aot.py index c1b29ba5c582..f31ffa1570af 100644 --- a/gallery/how_to/work_with_microtvm/micro_aot.py +++ b/gallery/how_to/work_with_microtvm/micro_aot.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. """ -.. _tutorial-micro-AoT: +.. _tutorial-micro-aot: -microTVM Host-Driven AoT -=========================== +3. microTVM Ahead-of-Time (AOT) Compilation +=========================================== **Authors**: `Mehrdad Hessar `_, `Alan MacDonald `_ @@ -59,6 +59,7 @@ import tvm from tvm import relay +import tvm.micro.testing from tvm.relay.backend import Executor, Runtime from tvm.contrib.download import download_testdata @@ -102,8 +103,7 @@ # using AOT host driven executor. We use the host micro target which is for running a model # on x86 CPU using CRT runtime or running a model with Zephyr platform on qemu_x86 simulator # board. 
In the case of a physical microcontroller, we get the target model for the physical -# board (E.g. nucleo_l4r5zi) and pass it to `tvm.target.target.micro` to create a full -# micro target. +# board (E.g. nucleo_l4r5zi) and change `BOARD` to supported Zephyr board. # # Use the C runtime (crt) and enable static linking by setting system-lib to True @@ -111,18 +111,15 @@ # Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc`. # To use physical hardware, replace "host" with something matching your hardware. -TARGET = tvm.target.target.micro("host") +TARGET = tvm.micro.testing.get_target("crt") # Use the AOT executor rather than graph or vm executors. Don't use unpacked API or C calling style. EXECUTOR = Executor("aot") if use_physical_hw: - boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" - with open(boards_file) as f: - boards = json.load(f) BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi") SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None) - TARGET = tvm.target.target.micro(boards[BOARD]["model"]) + TARGET = tvm.micro.testing.get_target("zephyr", BOARD) ###################################################################### # Compile the model diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index 9be257a57ac5..e8c032b70e05 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -18,8 +18,8 @@ """ .. _tutorial-micro-autotune: -Autotuning with microTVM -========================= +6. Model Tuning with microTVM +============================= **Authors**: `Andrew Reusch `_, `Mehrdad Hessar `_ @@ -55,6 +55,7 @@ import tvm from tvm.relay.backend import Runtime +import tvm.micro.testing #################### # Defining the model @@ -102,20 +103,16 @@ # RUNTIME = Runtime("crt", {"system-lib": True}) -TARGET = tvm.target.target.micro("host") +TARGET = tvm.micro.testing.get_target("crt") # Compiling for physical hardware # -------------------------------------------------------------------------- # When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The # STM32L4R5ZI Nucleo target and board is chosen in the example below. if use_physical_hw: - boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" - with open(boards_file) as f: - boards = json.load(f) - BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi") SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None) - TARGET = tvm.target.target.micro(boards[BOARD]["model"]) + TARGET = tvm.micro.testing.get_target("zephyr", BOARD) ######################### diff --git a/gallery/how_to/work_with_microtvm/micro_ethosu.py b/gallery/how_to/work_with_microtvm/micro_ethosu.py index 74a9d59d77c1..e6f47321c812 100644 --- a/gallery/how_to/work_with_microtvm/micro_ethosu.py +++ b/gallery/how_to/work_with_microtvm/micro_ethosu.py @@ -15,8 +15,10 @@ # specific language governing permissions and limitations # under the License. """ -Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN -====================================================================================== +.. _tutorial-micro-ethosu: + +7. 
Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN +========================================================================================= **Author**: `Grant Watson `_ diff --git a/gallery/how_to/work_with_microtvm/micro_mlperftiny.py b/gallery/how_to/work_with_microtvm/micro_mlperftiny.py index 79308e072365..e8c6a253ad2b 100644 --- a/gallery/how_to/work_with_microtvm/micro_mlperftiny.py +++ b/gallery/how_to/work_with_microtvm/micro_mlperftiny.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. """ -.. _tutorial-micro-MLPerfTiny: +.. _tutorial-micro-mlperftiny: -Creating Your MLPerfTiny Submission with microTVM -================================================= +8. Creating Your MLPerfTiny Submission with microTVM +==================================================== **Authors**: `Mehrdad Hessar `_ @@ -69,6 +69,7 @@ from tvm.contrib.download import download_testdata from tvm.micro import export_model_library_format from tvm.micro.model_library_format import generate_c_interface_header +import tvm.micro.testing from tvm.micro.testing.utils import ( create_header_file, mlf_extract_workspace_size_bytes, diff --git a/gallery/how_to/work_with_microtvm/micro_pytorch.py b/gallery/how_to/work_with_microtvm/micro_pytorch.py index 370e4d7e804b..a7f5f1028047 100644 --- a/gallery/how_to/work_with_microtvm/micro_pytorch.py +++ b/gallery/how_to/work_with_microtvm/micro_pytorch.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. """ -.. _tutorial-micro-Pytorch: +.. _tutorial-micro-pytorch: -microTVM PyTorch Tutorial -=========================== +4. microTVM PyTorch Tutorial +============================ **Authors**: `Mehrdad Hessar `_ @@ -46,6 +46,7 @@ from tvm import relay from tvm.contrib.download import download_testdata from tvm.relay.backend import Executor +import tvm.micro.testing ################################## # Load a pre-trained PyTorch model @@ -91,13 +92,14 @@ # and we use `host` micro target. Using this setup, TVM compiles the model # for C runtime which can run on a x86 CPU machine with the same flow that # would run on a physical microcontroller. +# CRT Uses the main() from `src/runtime/crt/host/main.cc` +# To use physical hardware, replace `board` with another physical micro target, e.g. `nrf5340dk_nrf5340_cpuapp` +# or `mps2_an521` and change the platform type to Zephyr. +# See more target examples in :ref:`Training Vision Models for microTVM on Arduino ` +# and :ref:`microTVM TFLite Tutorial`. # - -# Simulate a microcontroller on the host machine. Uses the main() from `src/runtime/crt/host/main.cc` -# To use physical hardware, replace "host" with another physical micro target, e.g. `nrf52840` -# or `mps2_an521`. See more more target examples in micro_train.py and micro_tflite.py tutorials. -target = tvm.target.target.micro("host") +target = tvm.micro.testing.get_target(platform="crt", board=None) # Use the C runtime (crt) and enable static linking by setting system-lib to True runtime = tvm.relay.backend.Runtime("crt", {"system-lib": True}) diff --git a/gallery/how_to/work_with_microtvm/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py deleted file mode 100644 index 3121bca353a5..000000000000 --- a/gallery/how_to/work_with_microtvm/micro_reference_vm.py +++ /dev/null @@ -1,159 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-micro-reference-vm: - -=================================== -microTVM Reference Virtual Machines -=================================== -**Author**: `Andrew Reusch `_ - -This tutorial explains how to launch microTVM Reference Virtual Machines. You can use these to -develop on real physical hardware without needing to individually install the microTVM -dependencies. These are also particularly useful when trying to reproduce behavior with -microTVM, such as when filing bug reports. - -microTVM is the effort to allow TVM to build and execute models on bare-metal microcontrollers. -microTVM aims to be compatible with a wide variety of SoCs and runtime environments (i.e. bare metal, -RTOS, etc). However, some stable software environment is needed to allow developers to share and -reproduce bugs and results. The microTVM Reference Virtual Machines are intended to provide that -environment. - -How it works -============ - -No Virtual Machines are stored in the TVM repository--instead, the files stored in -``apps/microtvm/reference-vm`` describe how to build VMs to the Vagrant_ VM builder tool. - -The Reference VMs are split into two parts: - -1. A Vagrant Base Box, which contains all of the stable dependencies for that platform. Build - scripts are stored in ``apps/microtvm/reference-vm//base-box``. TVM committers run - these when a platform's "stable" dependencies change, and the generated base boxes are stored in - `Vagrant Cloud`_. -2. A per-workspace VM, which users normally build using the Base Box as a starting point. Build - scripts are stored in ``apps/microtvm/reference-vm/`` (everything except ``base-box``). - -.. _Vagrant: https://vagrantup.com -.. _Vagrant Cloud: https://app.vagrantup.com/tlcpack - -Setting up the VM -================= - -Installing prerequisites ------------------------- - -A minimal set of prerequisites are needed: - -1. `Vagrant `__ -2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**). - `VirtualBox `__ is a suggested free hypervisor, but please note - that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox, - also consider installing the `vbguest `_ plugin. - -.. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack - -3. If required for your hypervisor, the - `Vagrant provider plugin `__ (or see `here `__ for VMWare). - -First boot ----------- - -The first time you use a reference VM, you need to create the box locally and then provision it. - -.. code-block:: bash - - # Replace zephyr with the name of a different platform, if you are not using Zephyr. - ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr - # Replace with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop). 
- ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider= - - -This command will take a couple of minutes to run and will require 4 to 5GB of storage on your -machine. It does the following: - -1. Downloads the `microTVM base box`_ and clones it to form a new VM specific to this TVM directory. -2. Mounts your TVM directory (and, if using ``git-subtree``, the original ``.git`` repo) into the - VM. -3. Builds TVM and installs a Python virtualenv with the dependencies corresponding with your TVM - build. - -.. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm - -Connect Hardware to the VM --------------------------- - -Next, you need to configure USB passthrough to attach your physical development board to the virtual -machine (rather than directly to your laptop's host OS). - -It's suggested you setup a device filter, rather than doing a one-time forward, because often the -device may reboot during the programming process and you may, at that time, need to enable -forwarding again. It may not be obvious to the end user when this occurs. Instructions to do that: - - * `VirtualBox `__ - * `Parallels `__ - * `VMWare Workstation `__ - -Rebuilding TVM inside the Reference VM --------------------------------------- - -After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm-zephyr``, -up-to-date when you modify the C++ runtime or checkout a different revision. You can either -re-provision the machine (``vagrant provision`` in the same directory you ran ``vagrant up`` before) -or manually rebuild TVM yourself. - -Remember: the TVM ``.so`` built inside the VM is different from the one you may use on your host -machine. This is why it's built inside the special directory ``build-microtvm-zephyr``. - -Logging in to the VM --------------------- - -The VM should be available to your host only with the hostname ``microtvm``. You can SSH to the VM -as follows: - -.. code-block:: bash - - $ vagrant ssh - -Then ``cd`` to the same path used on your host machine for TVM. For example, on Mac: - -.. code-block:: bash - - $ cd /Users/yourusername/path/to/tvm - -Running tests -============= - -Once the VM has been provisioned, tests can be executed using ``poetry``: - -.. code-block:: bash - - $ cd apps/microtvm/reference-vm/zephyr - $ poetry run python3 ../../../../tests/micro/zephyr/test_zephyr.py --board=stm32f746g_disco - -If you do not have physical hardware attached, but wish to run the tests using the -local QEMU emulator running within the VM, run the following commands instead: - -.. code-block:: bash - - $ cd /Users/yourusername/path/to/tvm - $ cd apps/microtvm/reference-vm/zephyr/ - $ poetry run pytest ../../../../tests/micro/zephyr/test_zephyr.py --board=qemu_x86 - - - -""" diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index 0770d472c9b8..67b3e66e3315 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. """ -.. _microTVM-with-TFLite: +.. _tutorial_micro_tflite: -microTVM with TFLite Models +2. 
microTVM TFLite Tutorial =========================== **Author**: `Tom Gall `_ @@ -55,11 +55,16 @@ import numpy as np import tvm +import tvm.micro +import tvm.micro.testing from tvm import relay import tvm.contrib.utils +from tvm.micro import export_model_library_format from tvm.contrib.download import download_testdata -model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite" +model_url = ( + "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/sine_model.tflite" +) model_file = "sine_model.tflite" model_path = download_testdata(model_url, model_file, module="data") @@ -105,55 +110,44 @@ # # Now we create a build config for relay, turning off two options and then calling relay.build which # will result in a C source file for the selected TARGET. When running on a simulated target of the -# same architecture as the host (where this Python script is executed) choose "host" below for the +# same architecture as the host (where this Python script is executed) choose "crt" below for the # TARGET, the C Runtime as the RUNTIME and a proper board/VM to run it (Zephyr will create the right # QEMU VM based on BOARD. In the example below the x86 arch is selected and a x86 VM is picked up accordingly: # RUNTIME = tvm.relay.backend.Runtime("crt", {"system-lib": True}) -TARGET = tvm.target.target.micro("host") +TARGET = tvm.micro.testing.get_target("crt") -# -# Compiling for physical hardware -# When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The -# STM32F746 Nucleo target and board is chosen in the example below. Another option would be to -# choose the STM32F746 Discovery board instead. Since that board has the same MCU as the Nucleo -# board but a couple of wirings and configs differ, it's necessary to select the "stm32f746g_disco" -# board to generated the right firmware image. -# +# When running on physical hardware, choose a TARGET and a BOARD that describe the hardware. The +# STM32L4R5ZI Nucleo target and board is chosen in the example below. You could change the testing +# board by simply exporting `TVM_MICRO_BOARD` variable with a different Zephyr supported board. if use_physical_hw: - boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" - with open(boards_file) as f: - boards = json.load(f) BOARD = os.getenv("TVM_MICRO_BOARD", default="nucleo_l4r5zi") SERIAL = os.getenv("TVM_MICRO_SERIAL", default=None) - TARGET = tvm.target.target.micro(boards[BOARD]["model"]) + TARGET = tvm.micro.testing.get_target("zephyr", BOARD) +# For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the +# TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board. # -# For some boards, Zephyr runs them emulated by default, using QEMU. For example, below is the -# TARGET and BOARD used to build a microTVM firmware for the mps2-an521 board. Since that board -# runs emulated by default on Zephyr the suffix "-qemu" is added to the board name to inform -# microTVM that the QEMU transporter must be used to communicate with the board. If the board name -# already has the prefix "qemu_", like "qemu_x86", then it's not necessary to add that suffix. 
-# -# TARGET = tvm.target.target.micro("mps2_an521") -# BOARD = "mps2_an521-qemu" +# `mps2_an521 = "mps2_an521"` +# `TARGET = tvm.micro.testing.get_target("zephyr", BOARD)` ###################################################################### -# Now, compile the model for the target: +# Now, compile the model for the target. If you do not specify Executor, +# by default it uses GraphExecutor. -with tvm.transform.PassContext( - opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"] -): +with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): module = relay.build(mod, target=TARGET, runtime=RUNTIME, params=params) +###################################################################### # Inspecting the compilation output # --------------------------------- # # The compilation process has produced some C code implementing the operators in this graph. We # can inspect it by printing the CSourceModule contents (for the purposes of this tutorial, let's # just print the first 10 lines): +# c_source_module = module.get_lib().imported_modules[0] assert c_source_module.type_key == "c", "tutorial is broken" @@ -166,27 +160,23 @@ print("\n".join(first_few_lines)) +###################################################################### # Compiling the generated code # ---------------------------- # # Now we need to incorporate the generated C code into a project that allows us to run inference on the # device. The simplest way to do this is to integrate it yourself, using microTVM's standard output format -# (:doc:`Model Library Format` `). This is a tarball with a standard layout: +# model library format. This is a tarball with a standard layout. # Get a temporary path where we can store the tarball (since this is running as a tutorial). -fd, model_library_format_tar_path = tempfile.mkstemp() -os.close(fd) -os.unlink(model_library_format_tar_path) -tvm.micro.export_model_library_format(module, model_library_format_tar_path) +temp_dir = tvm.contrib.utils.tempdir() +model_tar_path = temp_dir / "model.tar" +export_model_library_format(module, model_tar_path) -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: +with tarfile.open(model_tar_path, "r:*") as tar_f: print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) -# Cleanup for tutorial: -os.unlink(model_library_format_tar_path) - - # TVM also provides a standard way for embedded platforms to automatically generate a standalone # project, compile and flash it to a target, and communicate with it using the standard TVM RPC # protocol. The Model Library Format serves as the model input to this process. When embedded @@ -201,11 +191,8 @@ template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")) project_options = {} # You can use options to provide platform-specific options through TVM. 
-# Compiling for physical hardware (or an emulated board, like the mps_an521) -# -------------------------------------------------------------------------- # For physical hardware, you can try out the Zephyr platform by using a different template project # and options: -# if use_physical_hw: template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) @@ -218,7 +205,6 @@ } # Create a temporary directory - temp_dir = tvm.contrib.utils.tempdir() generated_project_dir = temp_dir / "generated-project" generated_project = tvm.micro.generate_project( diff --git a/gallery/how_to/work_with_microtvm/micro_train.py b/gallery/how_to/work_with_microtvm/micro_train.py index 9b8a9a68dde3..56ff54616f1b 100644 --- a/gallery/how_to/work_with_microtvm/micro_train.py +++ b/gallery/how_to/work_with_microtvm/micro_train.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. """ -.. _microtvm-train-arduino: +.. _tutorial-micro-train-arduino: -Training Vision Models for microTVM on Arduino -============================================== +5. Training Vision Models for microTVM on Arduino +================================================= **Author**: `Gavin Uberti `_ This tutorial shows how MobileNetV1 models can be trained @@ -441,6 +441,7 @@ def representative_dataset(): import shutil import tflite import tvm +import tvm.micro.testing # Method to load model is different in TFLite 1 vs 2 try: # TFLite 2.1 and above @@ -452,7 +453,7 @@ def representative_dataset(): mod, params = tvm.relay.frontend.from_tflite(tflite_model) # Set configuration flags to improve performance -target = tvm.target.target.micro("nrf52840") +target = tvm.micro.testing.get_target("zephyr", "nrf5340dk_nrf5340_cpuapp") runtime = tvm.relay.backend.Runtime("crt") executor = tvm.relay.backend.Executor("aot", {"unpacked-api": True}) diff --git a/gallery/how_to/work_with_microtvm/micro_tvmc.sh b/gallery/how_to/work_with_microtvm/micro_tvmc.sh index 0eaef9c6a836..f7f27ed1828c 100755 --- a/gallery/how_to/work_with_microtvm/micro_tvmc.sh +++ b/gallery/how_to/work_with_microtvm/micro_tvmc.sh @@ -16,30 +16,26 @@ # under the License. : ' -.. _tutorial-micro-tvmc: +.. _tutorial-micro-cli-tool: -Executing a Tiny Model with TVMC Micro -====================================== +1. microTVM CLI Tool +==================== **Author**: `Mehrdad Hessar `_ This tutorial explains how to compile a tiny model for a micro device, build a program on Zephyr platform to execute this model, flash the program and run the model all using `tvmc micro` command. +You need to install python and Zephyr dependencies before processing with this tutorial. ' ###################################################################### -# .. note:: -# This tutorial is explaining using TVMC Mirco on Zephyr platform. You need -# to install Zephyr dependencies before processing with this tutorial. Alternatively, -# you can run this tutorial in one of the following ways which has Zephyr depencencies already installed. # -# * Use `microTVM Reference Virtual Machines `_. -# * Use QEMU docker image provided by TVM. Following these you will download and login to the docker image: +# .. include:: ../../../../gallery/how_to/work_with_microtvm/install_dependencies.rst # -# .. code-block:: bash + +###################################################################### # -# cd tvm -# ./docker/bash.sh tlcpack/ci-qemu +# .. 
include:: ../../../../gallery/how_to/work_with_microtvm/install_zephyr.rst # # bash-ignore @@ -93,7 +89,7 @@ wget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/e # # Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball # containing a file for each piece of the TVM compiler output which can be used on micro targets outside -# TVM environment. Read more about `Model Library Format `_. +# TVM environment. Read more about :ref:`Model Library Format `. # # Here, we generate a MLF file for ``qemu_x86`` Zephyr board. To generate MLF output for the ``magic_wand`` tflite model: # @@ -183,12 +179,17 @@ tvmc run \ --fill-mode ones \ --print-top 4 # bash -# # Output: -# # -# # INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\n' -# # remote: microTVM Zephyr runtime - running -# # INFO:__main__:b'[100%] Built target run\n' -# # [[3. 1. 2. 0. ] -# # [0.47213247 0.41364592 0.07525456 0.03896701]] + +############################################################ +# Specifically, this command sets the input of the model +# to all ones and shows the four values of the output with their indices. +# +# .. code-block:: bash +# +# # Output: +# # INFO:__main__:b'[100%] [QEMU] CPU: qemu32,+nx,+pae\n' +# # remote: microTVM Zephyr runtime - running +# # INFO:__main__:b'[100%] Built target run\n' +# # [[3. 1. 2. 0. ] +# # [0.47213247 0.41364592 0.07525456 0.03896701]] # -# Specifically, this command sets the input of the model to all ones and shows the four values of the output with their indices. diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 170c57631444..43cad7850716 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -47,10 +47,16 @@ def get_supported_boards(platform: str): return json.load(f) -def get_target(platform: str, board: str) -> tvm.target.Target: +def get_target(platform: str, board: str = None) -> tvm.target.Target: """Intentionally simple function for making Targets for microcontrollers. If you need more complex arguments, one should call target.micro directly. 
Note that almost all, but not all, supported microcontrollers are Arm-based.""" + if platform == "crt": + return tvm.target.target.micro("host") + + if not board: + raise ValueError(f"`board` type is required for {platform} platform.") + model = get_supported_boards(platform)[board]["model"] return tvm.target.target.micro(model, options=["-device=arm_cpu"]) diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py index b033f1ca8457..f093ab789b87 100644 --- a/tests/scripts/request_hook/request_hook.py +++ b/tests/scripts/request_hook/request_hook.py @@ -152,7 +152,7 @@ "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": f"{BASE}/vta_cat.jpg", "https://objects.githubusercontent.com/github-production-release-asset-2e65be/130932608/4b196a8a-4e2d-11e8-9a11-be3c41846711?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221004%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221004T170456Z&X-Amz-Expires=300&X-Amz-Signature=0602b68e8864b9b01c9142eee22aed3543fe98a5482686eec33d98e2617a2295&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=130932608&response-content-disposition=attachment%3B%20filename%3Dmobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5&response-content-type=application%2Foctet-stream": f"{BASE}/2022-10-05/aws-mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5", "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip": f"{BASE}/oneflow/resnet18.zip", - "https://people.linaro.org/~tom.gall/sine_model.tflite": f"{BASE}/sine_model.tflite", + "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/sine_model.tflite": f"{BASE}/tlc-pack/web-data/testdata/microTVM/model/sine_model.tflite", "https://pjreddie.com/media/files/yolov3-tiny.weights?raw=true": f"{BASE}/yolov3-tiny.weights", "https://pjreddie.com/media/files/yolov3.weights": f"{BASE}/yolov3.weights", "https://raw.githubusercontent.com/Cadene/pretrained-models.pytorch/master/data/imagenet_classes.txt": f"{BASE}/2022-10-05/imagenet_classes.txt", From d239d5858ebf084fc23401d884fa36bdce23a5af Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sat, 28 Jan 2023 03:22:43 +0400 Subject: [PATCH 236/286] [ONNX] Support Bernoulli op on ONNX front-end (#13802) * add Bernoulli converter for onnx front-end * test for bernoulli was implemented * fix tuple split. update test for stability with different seed on ort and tvm sides * check that output values are 0 or 1 * remove std check as meaningless * calculate theoretical mean and compare with result, remove ort for comparison. 
clean code * add customized input as arg * add test with input sequence of 0 and 1 * pylint fix * fix inputs-shape issue * add binomial test * fix input type * small fix * update 0-1 check * init arrays in numpy style * check result determinism for fixed seed * fix inputs issue * modify binomial test * pylint fix --------- Co-authored-by: Valery Chernov --- python/tvm/relay/frontend/onnx.py | 31 ++++ tests/python/frontend/onnx/test_forward.py | 159 +++++++++++++++++++++ 2 files changed, 190 insertions(+) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index ed99176282de..7b35d4a48135 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -5669,6 +5669,36 @@ def _impl_v16(cls, inputs, attr, params): ) +class Bernoulli(OnnxOpConverter): + """Operator converter for Bernoulli""" + + @classmethod + def _impl_v15(cls, inputs, attr, params): + in_dtype = infer_type(inputs[0]).checked_type.dtype + assert in_dtype in [ + "float32", + "float64", + ], "Only float input tensor is currently supported." + # The data type for the elements of the output tensor. + # if not specified, we will use the data type of the input tensor + out_dtype = attr.get("dtype", None) + if out_dtype is None: + out_dtype = in_dtype + else: + out_dtype = get_type(out_dtype) + + seed = attr.get("seed", None) + if seed is None: + seed = np.random.randint(1e6) + else: + seed = int(seed) + + key = _random.threefry_key(seed) + inter_outputs = _op.random.uniform(key, infer_shape(inputs[0]), in_dtype) + _, uniform_nums = _expr.TupleWrapper(inter_outputs, 2) + return _op.cast(_op.less(uniform_nums, inputs[0]), out_dtype) + + class RandomNormal(OnnxOpConverter): """Operator converter for random_normal""" @@ -6436,6 +6466,7 @@ def _get_convert_map(opset): "QLinearGlobalAveragePool": QLinearGlobalAveragePool.get_converter(opset), "QLinearLeakyRelu": QLinearLeakyRelu.get_converter(opset), # Random number generation. 
+ "Bernoulli": Bernoulli.get_converter(opset), "RandomNormal": RandomNormal.get_converter(opset), "RandomNormalLike": RandomNormalLike.get_converter(opset), "RandomUniform": RandomUniform.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index ebb682190140..4b17cfbbb3a5 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -6914,6 +6914,165 @@ def verify_qlinearsigmoid(a_shape): verify_qlinearsigmoid([]) +@tvm.testing.parametrize_targets("llvm") +def test_random_bernoulli(target, dev): + """test_random_bernoulli""" + + def _get_tvm_output( + inputs, + out_dtype="int32", + seed=None, + target=target, + dev=dev, + use_vm=False, + freeze_params=False, + ): + def get_bernoulli_model(shape, in_dtype="float32", out_dtype="int32", seed=None): + onnx_itype = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(in_dtype)] + onnx_otype = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(out_dtype)] + node = helper.make_node( + "Bernoulli", + ["input"], + ["output"], + ) + dtype_attr = helper.make_attribute("dtype", onnx_otype) + node.attribute.append(dtype_attr) + if seed is not None: + seed_attr = helper.make_attribute("seed", float(seed)) + node.attribute.append(seed_attr) + + graph = helper.make_graph( + [node], + "random_bernoulli_test", + inputs=[helper.make_tensor_value_info("input", onnx_itype, list(shape))], + outputs=[helper.make_tensor_value_info("output", onnx_otype, list(shape))], + ) + return helper.make_model(graph, producer_name="random_bernoulli_test") + + shape = inputs.shape + in_dtype = inputs.dtype + model = get_bernoulli_model(shape, in_dtype, out_dtype, seed) + + if use_vm: + return get_tvm_output_with_vm( + model, + inputs, + target, + dev, + freeze_params=freeze_params, + ) + else: + return get_tvm_output( + model, + inputs, + target, + dev, + ) + + def binom_test(input, ideal_mean, threshold=0.05): + # This test is strictly appropriate when input probabilities are all identical. + # In that case, it should lead to flaky failures in only one run in a million (p>=1e-6). + # The test should be over-conservative when input probabilities are not identical. + # (i.e., It should have a rate of flaky failures lower than one run in a million.) + # If this test starts repeatedly throwing flaky failures, consult a statistician + # in addition to your regular debugging. 
+ bnm_test_res = scipy.stats.binomtest( + k=np.sum(input, dtype="int32"), n=len(input), p=ideal_mean + ) + return bnm_test_res.pvalue > threshold + + def verify_bernoulli( + inputs=None, + shape=[], + in_dtype="float32", + out_dtype="int32", + seed=None, + target=target, + dev=dev, + use_vm=False, + freeze_params=False, + in_out_equal=False, + ): + if inputs is None: + assert len(shape) != 0 + inputs = np.random.uniform(size=shape).astype(in_dtype) + + tvm_out = _get_tvm_output( + inputs, + out_dtype, + seed, + target, + dev, + use_vm, + freeze_params, + ) + + if isinstance(tvm_out, list): + tvm_out = tvm_out[0] + # check that values are 0 or 1 + tvm_flat = tvm_out.flatten() + assert np.array_equal(tvm_flat, tvm_flat.astype("bool")) + if in_out_equal: + tvm.testing.assert_allclose(inputs, tvm_out) + else: + # check that mean value is close to the theoretical one by binomial test + ideal_mean = np.mean(inputs) + repeats = 3 + check = False + for i in range(repeats): + if binom_test(tvm_flat, ideal_mean): + check = True + break + else: + # repeat with new seed + seed = np.random.randint(1e6) + tvm_flat = _get_tvm_output( + inputs, + out_dtype, + seed, + target, + dev, + use_vm, + freeze_params, + ).flatten() + assert check, "Binomial test failed" + + # Test input sequence of 0 and 1 + inputs = np.random.randint(2, size=[10000]).astype("float32") + verify_bernoulli(inputs, in_out_equal=True) + + # Binomial test input with 0.5 values + val_num = 10000 + inputs = np.ones([val_num], dtype="float32") * 0.5 + verify_bernoulli(inputs) + + # Binomial test input with 0.1 values + inputs = np.ones([val_num], dtype="float32") * 0.1 + verify_bernoulli(inputs) + + # Simple test + verify_bernoulli(shape=[val_num]) + + # Floating output type + verify_bernoulli(shape=[val_num], out_dtype="float32") + + # Double input type + verify_bernoulli(shape=[val_num], in_dtype="float64") + + # Test N-D tensor generation + verify_bernoulli(shape=[2, 4, 100, 100]) + + # Test with seed + verify_bernoulli(shape=[val_num], seed=np.random.randint(1e6)) + + # Test result determinism with the same seeds + inputs = np.random.uniform(size=[val_num]) + fixed_seed = np.random.randint(1e6) + tvm_out_1 = _get_tvm_output(inputs, seed=fixed_seed) + tvm_out_2 = _get_tvm_output(inputs, seed=fixed_seed) + tvm.testing.assert_allclose(tvm_out_1, tvm_out_2) + + @tvm.testing.parametrize_targets("llvm") def test_random_uniform(target, dev): """test_random_uniform""" From 40e664900a2e4d34042251465f712da42c321589 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Sat, 28 Jan 2023 13:42:53 +0800 Subject: [PATCH 237/286] [Arith] Support eq in detect_clip_bound (#13746) * Support eq in detect_clip_bound * follow review suggestion --- src/arith/detect_linear_equation.cc | 38 ++++++++++++++----- .../unittest/test_arith_detect_clip_bound.py | 13 +++++++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index 8ea8f168b6ee..576ac1716e69 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -189,6 +189,7 @@ bool DetectClipBound(const PrimExpr& cond, PostOrderVisit(cond, fvisit); if (flag != 1) return false; // canonical form: exp >= 0 + bool is_eq = false; PrimExpr canonical; if (const LTNode* op = cond.as()) { if (!op->a.dtype().is_int()) return false; @@ -202,6 +203,10 @@ bool DetectClipBound(const PrimExpr& cond, } else if (const GENode* op = cond.as()) { if (!op->a.dtype().is_int()) return false; canonical = op->a - op->b; + } 
else if (const EQNode* op = cond.as()) { + if (!op->a.dtype().is_int()) return false; + canonical = op->a - op->b; + is_eq = true; } else { return false; } @@ -210,25 +215,40 @@ bool DetectClipBound(const PrimExpr& cond, if (!LinearEqDetector(var).Detect(canonical, &ret)) return false; ret.coeff = analyzer.Simplify(ret.coeff); IntervalEntry& p = (*bmap)[var.get()]; + + Optional min_value; + Optional max_value; if (is_const_int(ret.coeff, 1)) { // var + shift >=0 -> var >= -shift + min_value = -ret.base; + if (is_eq) { + max_value = min_value; + } + } else if (is_const_int(ret.coeff, -1)) { + // -var + shift >=0 -> var <= shift + max_value = ret.base; + if (is_eq) { + min_value = max_value; + } + } + if (!min_value.defined() && !max_value.defined()) { + return false; + } + if (min_value.defined()) { if (p.min_value.defined()) { - p.min_value = max(p.min_value, -ret.base); + p.min_value = max(p.min_value, min_value.value()); } else { - p.min_value = -ret.base; + p.min_value = min_value.value(); } - return true; } - if (is_const_int(ret.coeff, -1)) { - // -var + shift >=0 -> var <= shift + if (max_value.defined()) { if (p.max_value.defined()) { - p.max_value = min(p.max_value, ret.base); + p.max_value = min(p.max_value, max_value.value()); } else { - p.max_value = ret.base; + p.max_value = max_value.value(); } - return true; } - return false; + return true; } template diff --git a/tests/python/unittest/test_arith_detect_clip_bound.py b/tests/python/unittest/test_arith_detect_clip_bound.py index 0a9d75fcea54..03fff11f77e5 100644 --- a/tests/python/unittest/test_arith_detect_clip_bound.py +++ b/tests/python/unittest/test_arith_detect_clip_bound.py @@ -39,5 +39,18 @@ def test_basic(): tvm.testing.assert_prim_expr_equal(m[2], 4) +def test_trivial_eq(): + a = te.var("a") + b = te.var("b") + m = tvm.arith.detect_clip_bound(b == 3, [a, b]) + tvm.testing.assert_prim_expr_equal(m[2], 3) + tvm.testing.assert_prim_expr_equal(m[3], 3) + m = tvm.arith.detect_clip_bound(tvm.tir.all(a == 4, b == 3), [a, b]) + tvm.testing.assert_prim_expr_equal(m[0], 4) + tvm.testing.assert_prim_expr_equal(m[1], 4) + tvm.testing.assert_prim_expr_equal(m[2], 3) + tvm.testing.assert_prim_expr_equal(m[3], 3) + + if __name__ == "__main__": test_basic() From ed25753a60902c0fe693835d2476a9dc70a5dc02 Mon Sep 17 00:00:00 2001 From: TerranceLiang <11499470+terrance-liang@users.noreply.github.com> Date: Sat, 28 Jan 2023 17:04:54 +0800 Subject: [PATCH 238/286] [topi] remove comment redundancy in resize.py (#13860) Co-authored-by: Terrance Liang --- python/tvm/topi/image/resize.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py index 51ce204cf6a8..1973e0543a15 100644 --- a/python/tvm/topi/image/resize.py +++ b/python/tvm/topi/image/resize.py @@ -813,12 +813,6 @@ def resize2d( layout: string, optional "NCHW", "NHWC", or "NCHWc". - coordinate_transformation_mode: string, optional - Describes how to transform the coordinate in the resized tensor - to the coordinate in the original tensor. - Refer to the ONNX Resize operator specification for details. - Available options are "half_pixel", "align_corners" and "asymmetric". 
-
     method: string, optional
         method of interpolation ("nearest", "linear", "bicubic")

From 6563998d100bbc9309c2f21587697e56a9653f38 Mon Sep 17 00:00:00 2001
From: Eye <380614540@qq.com>
Date: Sun, 29 Jan 2023 23:45:33 +0800
Subject: [PATCH 239/286] [WASM][FIX] test tests/node/websock_rpc_test.py (#13862)

Co-authored-by: rqg
---
 web/emcc/tvmjs_support.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index aa9546f3b71a..6395bfbb08f4 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -162,7 +162,7 @@ class AsyncLocalSession : public LocalSession {
       // pass the callback as the last argument.
       setter(num_args, packed_callback);

-      auto* pf = static_cast(func);
+      auto* pf = static_cast(func);
       pf->CallPacked(TVMArgs(values.data(), type_codes.data(), num_args + 1), &temp);
     } else if (func == get_time_eval_placeholder_.get()) {
       // special handle time evaluator.

From e5cc9ca7e2699e782312d6797e9e477fe15af332 Mon Sep 17 00:00:00 2001
From: Siva
Date: Mon, 30 Jan 2023 11:07:12 +0530
Subject: [PATCH 240/286] [RUNTIME][CLML] OpenCLML tuning and profiling enhanced (#13843)

* [RUNTIME][CLML] OpenCLML tuning and profiling enhanced

The tuning cache bin is serialized through dmlc::Stream to support
multiple CLML subgraphs within a TVM module. Individual tuning cache
blobs are saved to the same output file.

A new API on OpenCLWorkspace enables or disables profiling on the
command queue, rather than doing this only when a Timer is invoked.
This is required to perform CLML operator tuning. CLML layer profiling
now uses the OpenCL Timer interface.

This PR also avoids offloading the pad operator at the very first layer
(to be specific, before at least one convolution layer), because the
CLML pad operator has limitations concerning layout. Please refer to
the CLML SDK documentation for more details.

* Update src/runtime/opencl/opencl_common.h

Co-authored-by: Egor Churaev

* * review comments

---------

Co-authored-by: Egor Churaev
---
 src/runtime/contrib/clml/clml_runtime.cc | 143 ++++++++++++++---------
 src/runtime/opencl/opencl_common.h       |  40 +++----
 2 files changed, 111 insertions(+), 72 deletions(-)

diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 0987eefdc9c0..6d79196a08e6 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -29,16 +29,26 @@
 #endif
 #include
 #include
+#include
 #include
 #include
 #include
 #include

+#include "../../file_utils.h"
 #include "../../opencl/opencl_common.h"
 #include "../json/json_node.h"
 #include "../json/json_runtime.h"

+#define CAT_I(a, b) a##b
+#define CAT(a, b) CAT_I(a, b)
+#define GET_ML_INTERFACE CAT(CAT(clGetMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
+#define GET_ML_API_INTERFACE CAT(CAT(CLMLInterfaceV, CL_QCOM_ML_OPS_H_MAJOR_VERSION), QCOM)
+
+/*!
\brief Magic number for CLML Tuning cache entry */ +static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45; + namespace tvm { namespace runtime { namespace contrib { @@ -58,7 +68,7 @@ class CLMLRuntime : public JSONRuntimeBase { */ explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json, const Array& const_names) - : JSONRuntimeBase(symbol_name, graph_json, const_names) {} + : JSONRuntimeBase(symbol_name, graph_json, const_names), clml_symbol(symbol_name) {} ~CLMLRuntime() { #ifdef TVM_GRAPH_EXECUTOR_CLML @@ -153,40 +163,59 @@ class CLMLRuntime : public JSONRuntimeBase { ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result; for (cl_uint i = 0; i < numVersions; ++i) { -#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2 - if (majorVersions[i] == 2) { - h_ClmlIntf = clGetMLInterfaceV2QCOM(0); - LOG(WARNING) << "CLML Target version:" << majorVersions[i]; - break; - } -#endif -#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3 - if (majorVersions[i] == 3) { - h_ClmlIntf = clGetMLInterfaceV3QCOM(0); + if (majorVersions[i] == CL_QCOM_ML_OPS_H_MAJOR_VERSION) { + h_ClmlIntf = GET_ML_INTERFACE(0); LOG(WARNING) << "CLML Target version:" << majorVersions[i]; break; } -#endif } ICHECK(h_ClmlIntf != NULL) << "clGetMLInterfaceVxQCOM:" << result << " Perhaps there is mispatch between CLML SDK version to target supported version:" << majorVersions[numVersions - 1]; char* tune_flag; - if ((tune_flag = getenv("CLML_IS_TUNNING_RUN"))) + if ((tune_flag = getenv("CLML_IS_TUNING_RUN"))) this->is_tuning_run = std::stoi(tune_flag); else this->is_tuning_run = 0; - if (!(tuning_file = getenv("CLML_TUNNING_CACHE"))) this->is_tuning_run = 0; + if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0; // A Tuning run, so create the cache from scratch result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache); ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result; if (!this->is_tuning_run && this->tuning_file) { - std::vector buffer; - buffer = readBinFile(this->tuning_file); - result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, buffer.size(), buffer.data()); - ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result; + std::vector tune_buffer; + std::string tune_blob; + LoadBinaryFromFile(this->tuning_file, &tune_blob); + dmlc::MemoryStringStream mstrm(const_cast(&tune_blob)); + dmlc::Stream* strm = &mstrm; + + uint64_t header, reserve; + std::string tune_symbol; + while (strm->Read(&header)) { + if (header != kTVMCLMLTuningCacheMagic) break; + if (!strm->Read(&reserve)) break; + if (!strm->Read(&tune_symbol)) break; + LOG(INFO) << "Tuning Cache Symbol:" << tune_symbol; + if (tune_symbol == clml_symbol) { + strm->Read(&tune_buffer); + break; + } else { + std::vector tmp_buf; + if (!strm->Read(&tmp_buf)) break; + } + } + + if (tune_buffer.size()) { + LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol + << " size:" << tune_buffer.size(); + result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, tune_buffer.size(), + tune_buffer.data()); + ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result; + } else { + LOG(WARNING) << "Tuning cache not found for symbol:" << clml_symbol << " in file " + << this->tuning_file; + } } } @@ -281,32 +310,33 @@ class CLMLRuntime : public JSONRuntimeBase { } } + int64_t duration = 0; for (size_t i = 0; i < this->layer_.function.size(); ++i) { // Make CLML subgraphs accounted by OpenCLTimerNode.
- if (getenv("CLML_PROFILING") || workspace->IsProfiling(tentry->device)) { + + if (getenv("CLML_PROFILING")) { + Timer t; + auto f = Registry::Get(std::string("profiling.timer.opencl")); + t = f->operator()(tentry->device); + t->Start(); + queue = workspace->GetQueue(tentry->device); evts.resize(evts.size() + 1); cl_event* evt = &(evts.back()); + result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], this->layer_.descriptorSet, 0, NULL, evt); + t->Stop(); + duration += t->SyncAndGetElapsedNanos(); + LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] + << " Duration:" << t->SyncAndGetElapsedNanos(); } else { result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], this->layer_.descriptorSet, 0, NULL, NULL); } ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result; } - if (getenv("CLML_PROFILING")) { - cl_ulong start, end; - cl_ulong duration = 0; - clWaitForEvents(1, &(evts.back())); - for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) { - clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, - nullptr); - clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr); - duration += (end - start); - LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start); - } - LOG(WARNING) << "Total Duration:" << duration; + LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration; } for (size_t i = 0; i < outputs_.size(); ++i) { @@ -484,30 +514,42 @@ class CLMLRuntime : public JSONRuntimeBase { if (this->is_tuning_run) { LOG(WARNING) << "CLML Tunning In Progress:"; + // Let the command queue be recreated in profiling mode. + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, true); for (size_t i = 0; i < this->layer_.function.size(); ++i) { - LOG(WARNING) << "CLML Tunning:" << i; + LOG(WARNING) << "CLML Tuning:" << this->layer_.layer_names[i]; result = h_ClmlIntf->clTuneMLOpQCOM(workspace->GetQueue(tentry->device), this->layer_.function[i], this->layer_.descriptorSet, this->tuning_cache, NULL); ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result; } + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(tentry->device, false); - size_t cacheLenBytes = 0; - size_t lenRet = 0; - result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes); + size_t cache_len_bytes = 0; + size_t len_ret = 0; + result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cache_len_bytes); ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result; - std::vector savedCache(cacheLenBytes, 0); - result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(), - savedCache.data(), &lenRet); - assert(result == CL_SUCCESS); - - std::ofstream cache_out(tuning_file, std::ios_base::binary); - if (cache_out) { - cache_out.write(reinterpret_cast(savedCache.data()), savedCache.size()); - cache_out.close(); - } - LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file + std::vector saved_cache(cache_len_bytes, 0); + result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, saved_cache.size(), + saved_cache.data(), &len_ret); + ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result; + + std::string tune_str; + dmlc::MemoryStringStream mstrm(&tune_str); + dmlc::Stream* strm = &mstrm; + uint64_t header = kTVMCLMLTuningCacheMagic; + uint64_t reserved = 0x0; + strm->Write(header); + strm->Write(reserved); + strm->Write(clml_symbol); + strm->Write(saved_cache); + + std::ofstream fs(tuning_file, std::ios::app | std::ios::binary); + ICHECK(!fs.fail()) << "Cannot open " << tuning_file; + fs.write(&tune_str[0], tune_str.length()); + LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file << " size:" << tune_str.length() + << " with tuning blob len " << saved_cache.size(); } } @@ -1373,12 +1415,7 @@ class CLMLRuntime : public JSONRuntimeBase { CachedLayer layer_; // CLML Context -#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 2 - CLMLInterfaceV2QCOM* h_ClmlIntf = NULL; -#endif -#if CL_QCOM_ML_OPS_H_MAJOR_VERSION == 3 - CLMLInterfaceV3QCOM* h_ClmlIntf = NULL; -#endif + GET_ML_API_INTERFACE* h_ClmlIntf = NULL; cl::OpenCLWorkspace* workspace = NULL; cl::OpenCLThreadEntry* tentry = NULL; cl_ml_tuningcache_qcom tuning_cache = NULL; @@ -1395,6 +1432,8 @@ class CLMLRuntime : public JSONRuntimeBase { << "Please build with USE_CLML_GRAPH_EXECUTOR."; } #endif + /*! CLML subgraph symbol in TVM main module */ + std::string clml_symbol; }; runtime::Module CLMLRuntimeCreate(const String& symbol_name, const String& graph_json, diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 7bbb358f8f92..c172a0f94539 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -284,6 +284,24 @@ class OpenCLWorkspace : public DeviceAPI { return prop & CL_QUEUE_PROFILING_ENABLE; } + // Enable queue profiling, recreate if required + void EnableQueueProfiling(Device dev, bool enable) { + bool is_enabled = cl::OpenCLWorkspace::Global()->IsProfiling(dev); + if (is_enabled == enable) { + return; + } + cl_command_queue_properties prop = (enable) ? CL_QUEUE_PROFILING_ENABLE : 0; + auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev); + OPENCL_CALL(clFlush(queue)); + OPENCL_CALL(clFinish(queue)); + OPENCL_CALL(clReleaseCommandQueue(queue)); + cl_int err_code; + cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev.device_id]; + auto profiling_queue = + clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did, prop, &err_code); + OPENCL_CHECK_ERROR(err_code); + cl::OpenCLWorkspace::Global()->queues[dev.device_id] = profiling_queue; + } // override device API void SetDevice(Device dev) final; @@ -508,26 +526,8 @@ class OpenCLTimerNode : public TimerNode { Device dev_; void recreateCommandQueue() { - cl_command_queue_properties prop; - - if (!cl::OpenCLWorkspace::Global()->IsProfiling(dev_)) { - prop = CL_QUEUE_PROFILING_ENABLE; - } else { - prop = 0; - } - - auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev_); - - OPENCL_CALL(clFlush(queue)); - OPENCL_CALL(clFinish(queue)); - OPENCL_CALL(clReleaseCommandQueue(queue)); - - cl_int err_code; - cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev_.device_id]; - auto profiling_queue = - clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did, prop, &err_code); - OPENCL_CHECK_ERROR(err_code); - cl::OpenCLWorkspace::Global()->queues[dev_.device_id] = profiling_queue; + cl::OpenCLWorkspace::Global()->EnableQueueProfiling( + dev_, !cl::OpenCLWorkspace::Global()->IsProfiling(dev_)); } }; } // namespace runtime From 5b669b5a476d9b6faabbc9377d22e653b7dc7aad Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 30 Jan 2023 13:29:04 +0400 Subject: [PATCH 241/286] [ONNX] Support SequenceLength op (#13863) * add SequenceLength op * add SequenceLength test * graph fix --------- Co-authored-by: Valery Chernov --- python/tvm/relay/frontend/onnx.py | 10 ++++++++ tests/python/frontend/onnx/test_forward.py | 21 +++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 7b35d4a48135..6e0c7cc2dd3f 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -6148,6 +6148,15 @@ def _impl_v11(cls, inputs, attr, params): return _expr.Tuple(inputs) +class SequenceLength(OnnxOpConverter): + """Operator converter for sequence length op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + # Get length of input sequence + return _expr.const(len(inputs[0]), dtype="int64") + + class SequenceInsert(OnnxOpConverter): """Operator converter for sequence insert op.""" @@ -6483,6 +6492,7 @@ def _get_convert_map(opset): "LinearRegressor": LinearRegressor.get_converter(opset), # Sequence operators "SequenceConstruct": SequenceConstruct.get_converter(opset), + "SequenceLength": SequenceLength.get_converter(opset), "SequenceInsert": SequenceInsert.get_converter(opset), "ConcatFromSequence": ConcatFromSequence.get_converter(opset), "SplitToSequence": SplitToSequence.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 4b17cfbbb3a5..6a780a632fb7 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -7760,10 +7760,16 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= "SplitToSequence", inputs=["concat_sequence"], outputs=["split_sequence"], axis=axis ) + # Test tensor extraction from sequence at_node = helper.make_node( "SequenceAt", inputs=["split_sequence", "position"], outputs=["output"] ) + # Test sequence length + length_node = helper.make_node( + "SequenceLength", inputs=["split_sequence"], outputs=["output_2"] + ) + if new_axis is not None: new_axis_attr = helper.make_attribute("new_axis", new_axis) concat_node.attribute.append(new_axis_attr) @@ -7781,9 +7787,20 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= output_shape[axis] = num_tensors + 1 else: output_shape[axis] = (num_tensors + 1) * output_shape[axis] - graph_outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape)] + graph_outputs = [ + helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape), + helper.make_tensor_value_info("output_2", TensorProto.INT64, []), + ] - graph_nodes = [position_node, construct_node, insert_node, concat_node, split_node, at_node] + graph_nodes = [ + position_node, + construct_node, + insert_node, + concat_node, + split_node, + at_node, + length_node, + ] graph = helper.make_graph( graph_nodes, From e11e8d4919d14ac240d03f86e796dfeaea99f36b Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Mon, 30 Jan 2023 18:01:20 +0300 Subject: [PATCH 242/286] Enable C++17 for cmake modules (#13869) Moved the lines that add C++17 to CXX_FLAGS so that they come before the module includes. After this change, C++17 features are also supported in the source code of the modules. --- CMakeLists.txt | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 278afbe23563..acd78999a051 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -478,6 +478,17 @@ endif(USE_KALLOC_ALIGNMENT) # need to be re-compiled every time. Using ccache 4.0+ can resolve this issue.
include(cmake/utils/CCache.cmake) +include(CheckCXXCompilerFlag) +if(NOT MSVC) + check_cxx_compiler_flag("-std=c++17" SUPPORT_CXX17) + set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}") + set(CMAKE_CUDA_STANDARD 17) +else() + check_cxx_compiler_flag("/std:c++17" SUPPORT_CXX17) + set(CMAKE_CXX_FLAGS "/std:c++17 ${CMAKE_CXX_FLAGS}") + set(CMAKE_CUDA_STANDARD 17) +endif() + # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) @@ -526,17 +537,6 @@ include(cmake/modules/Git.cmake) include(cmake/modules/LibInfo.cmake) include(cmake/modules/RustExt.cmake) -include(CheckCXXCompilerFlag) -if(NOT MSVC) - check_cxx_compiler_flag("-std=c++17" SUPPORT_CXX17) - set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}") - set(CMAKE_CUDA_STANDARD 17) -else() - check_cxx_compiler_flag("/std:c++17" SUPPORT_CXX17) - set(CMAKE_CXX_FLAGS "/std:c++17 ${CMAKE_CXX_FLAGS}") - set(CMAKE_CUDA_STANDARD 17) -endif() - set(LIBINFO_FILE ${CMAKE_CURRENT_LIST_DIR}/src/support/libinfo.cc) add_lib_info(${LIBINFO_FILE}) list(REMOVE_ITEM COMPILER_SRCS ${LIBINFO_FILE}) From b02c5ff547dcb380341d7b7d9368eab64dadec8a Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 30 Jan 2023 12:53:01 -0600 Subject: [PATCH 243/286] [Hexagon][CI] Update the docker image ID to reflect newer LLVM (#13870) * [Hexagon][CI] Update the docker image ID to reflect newer LLVM The latest image ID is ci_hexagon:20230127-185848-95fa22308. * Set C/C++ compilers in /opt/sccache to cc/c++ --- ci/jenkins/docker-images.ini | 2 +- tests/scripts/task_config_build_hexagon.sh | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/jenkins/docker-images.ini b/ci/jenkins/docker-images.ini index 53ad2092ea4f..149ea7b76b28 100644 --- a/ci/jenkins/docker-images.ini +++ b/ci/jenkins/docker-images.ini @@ -21,7 +21,7 @@ ci_arm: tlcpack/ci-arm:20221013-060115-61c9742ea ci_cortexm: tlcpack/ci-cortexm:20230116-133924-dad13d1c1 ci_cpu: tlcpack/ci-cpu:20230110-070003-d00168ffb ci_gpu: tlcpack/ci-gpu:20221128-070141-ae4fd7df7 -ci_hexagon: tlcpack/ci-hexagon:20221013-060115-61c9742ea +ci_hexagon: tlcpack/ci_hexagon:20230127-185848-95fa22308 ci_i386: tlcpack/ci-i386:20221013-060115-61c9742ea ci_lint: tlcpack/ci-lint:20221013-060115-61c9742ea ci_minimal: tlcpack/ci-minimal:20230117-070124-125886350 diff --git a/tests/scripts/task_config_build_hexagon.sh b/tests/scripts/task_config_build_hexagon.sh index 0736ed6b53b8..a3a42f18ee4e 100755 --- a/tests/scripts/task_config_build_hexagon.sh +++ b/tests/scripts/task_config_build_hexagon.sh @@ -31,11 +31,12 @@ echo set\(USE_LLVM "${CLANG_LLVM_HOME}/bin/llvm-config"\) >> config.cmake if [[ ${CI:-false} == "true" ]]; then # sccache needs to be used in CI to speed up builds - echo set\(CMAKE_CXX_COMPILER "/opt/sccache/clang++"\) >> config.cmake + echo set\(CMAKE_C_COMPILER "/opt/sccache/cc"\) >> config.cmake + echo set\(CMAKE_CXX_COMPILER "/opt/sccache/c++"\) >> config.cmake else echo 'Skipping sccache setup for local build' - echo set\(CMAKE_CXX_COMPILER \"/usr/bin/c++\"\) >> config.cmake echo set\(CMAKE_C_COMPILER \"/usr/bin/cc\"\) >> config.cmake + echo set\(CMAKE_CXX_COMPILER \"/usr/bin/c++\"\) >> config.cmake fi echo set\(USE_HEXAGON "ON"\) >> config.cmake From 82fedb36ce1f7b3d9f9833fb5ef16aa02740977b Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 30 Jan 2023 14:15:07 -0600 Subject: [PATCH 244/286] [LLVM] Remove call to EmitDebugLocation from AddAliasInfo (#13872) This function only creates alias metadata, so there isn't anything for it to create debug 
location information for. If `index` is used in executable code, the debug location should be emitted then. --- src/target/llvm/codegen_llvm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 2182ecfa51ce..dcca33732060 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -557,7 +557,6 @@ llvm::Type* CodeGenLLVM::GetLLVMType(const PrimExpr& expr) const { // void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index, DataType access_dtype) { - EmitDebugLocation(index->span); if (alias_var_set_.count(buffer_var) != 0) { // Mark all possibly aliased pointer as same type. llvm::MDNode* meta = md_tbaa_alias_set_; From d8b099177791a221892ef6514a1fbd389c811bd3 Mon Sep 17 00:00:00 2001 From: Ever-Kid Date: Tue, 31 Jan 2023 10:04:07 +0800 Subject: [PATCH 245/286] [TIR][FIX] check args size when creating prim_func by runtime::Registry (#13809) --- src/te/operation/create_primfunc.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 92186a4ffea4..bdcdbc023a1c 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -574,6 +574,7 @@ TVM_REGISTER_GLOBAL("te.CreatePrimFunc").set_body([](TVMArgs args, TVMRetValue* Array arg_list = args[0]; std::optional index_dtype_override{std::nullopt}; // Add conversion to make std::optional compatible with FFI. + ICHECK_EQ(args.size(), 2); if (args[1].type_code() != kTVMNullptr) { index_dtype_override = args[1].operator DataType(); } From 18d7d8267125a984d395a49801c1a578496fb17a Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Tue, 31 Jan 2023 14:49:47 +0800 Subject: [PATCH 246/286] [TVMScript] Robustify the Highlight Printer (#13861) Currently, `cprint` only allows `PrimFunc` and `IRModule` as input. However, with the fragment printing enabled by @junrushao, we can make it robust enough to support any object that has a `script` method. --- python/tvm/script/highlight.py | 13 +++++---- .../test_tvmscript_printer_highlight.py | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/python/tvm/script/highlight.py b/python/tvm/script/highlight.py index 5cf28fff3a4b..80a1f9972248 100644 --- a/python/tvm/script/highlight.py +++ b/python/tvm/script/highlight.py @@ -20,14 +20,11 @@ import os import sys import warnings -from typing import Optional, Union - -from tvm.ir import IRModule -from tvm.tir import PrimFunc +from typing import Any, Optional, Union def cprint( - printable: Union[IRModule, PrimFunc, str], + printable: Union[Any, str], style: Optional[str] = None, black_format: bool = True, ) -> None: @@ -61,8 +58,12 @@ def cprint( The default pygmentize style can also be set with the environment variable "TVM_PYGMENTIZE_STYLE".
""" - if isinstance(printable, (IRModule, PrimFunc)): + if hasattr(printable, "script") and callable(getattr(printable, "script")): printable = printable.script() + elif not isinstance(printable, str): + raise TypeError( + f"Can only print strings or objects with a `script` method, but got: {type(printable)}" + ) if black_format: printable = _format(printable) diff --git a/tests/python/unittest/test_tvmscript_printer_highlight.py b/tests/python/unittest/test_tvmscript_printer_highlight.py index cc3469a2ceea..16e90c3563fc 100644 --- a/tests/python/unittest/test_tvmscript_printer_highlight.py +++ b/tests/python/unittest/test_tvmscript_printer_highlight.py @@ -14,10 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + import pytest import tvm +import tvm.testing +from tvm import relay from tvm.script import tir as T +from tvm.script.highlight import cprint def test_highlight_script(): @@ -45,3 +49,28 @@ def main( # type: ignore Module["main"].show(style="light") Module["main"].show(style="dark") Module["main"].show(style="ansi") + + +def test_cprint(): + # Print string + cprint("a + 1") + + # Print nodes with `script` method, e.g. PrimExpr + cprint(tvm.tir.Var("v", "int32") + 1) + + # Cannot print non-Python-style code if black is installed + try: + import black + + with pytest.raises(ValueError): + cprint("if (a == 1) { a +=1; }") + except ImportError: + pass + + # Cannot print unsupported nodes (nodes without `script` method) + with pytest.raises(TypeError): + cprint(relay.const(1)) + + +if __name__ == "__main__": + tvm.testing.main() From b95130deea1850d273086237cae64d50b6d3da1b Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 31 Jan 2023 11:06:02 +0400 Subject: [PATCH 247/286] [ONNX] Support SequenceErase op (#13865) * SequenceErase was implemented in ONNX front-end * add SequenceErase node to Sequence test * remark from reviewer: fix negative position recalculation * add assert --------- Co-authored-by: Valery Chernov --- python/tvm/relay/frontend/onnx.py | 42 +++++++++++++++++--- tests/python/frontend/onnx/test_forward.py | 10 +++++- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 6e0c7cc2dd3f..93429a863889 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -6148,13 +6148,35 @@ def _impl_v11(cls, inputs, attr, params): return _expr.Tuple(inputs) -class SequenceLength(OnnxOpConverter): - """Operator converter for sequence length op.""" +class SequenceErase(OnnxOpConverter): + """Operator converter for sequence erase op.""" @classmethod def _impl_v11(cls, inputs, attr, params): - # Get length of input sequence - return _expr.const(len(inputs[0]), dtype="int64") + # Erase the tensor at the specified position in the sequence + input_sequence = inputs[0] + + if len(inputs) == 2: + position = inputs[1] + # Non-constant position is not supported. + if isinstance(position, _expr.Constant): + position = position.data.numpy() + elif position.name_hint in params: + position = params[position.name_hint].numpy() + else: + raise NotImplementedError("Position must be a constant.") + else: + position = -1 + + seq_len = len(input_sequence) + assert -seq_len <= position < seq_len, "Position is out of bounds" + + if position < 0: + position = seq_len + position + # Convert sequence to a list, drop the tensor at the erased position, and repackage as Tuple.
+ tensor_list = [input_sequence[i] for i in range(seq_len) if i != position] + # Create new tuple and return. + return _expr.Tuple(tensor_list) class SequenceInsert(OnnxOpConverter): @@ -6188,6 +6210,15 @@ def _impl_v11(cls, inputs, attr, params): return _expr.Tuple(tensor_list) +class SequenceLength(OnnxOpConverter): + """Operator converter for sequence length op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + # Get length of input sequence + return _expr.const(len(inputs[0]), dtype="int64") + + class ConcatFromSequence(OnnxOpConverter): """Operator converter for sequence concatenation op.""" @@ -6492,8 +6523,9 @@ def _get_convert_map(opset): "LinearRegressor": LinearRegressor.get_converter(opset), # Sequence operators "SequenceConstruct": SequenceConstruct.get_converter(opset), - "SequenceLength": SequenceLength.get_converter(opset), + "SequenceErase": SequenceErase.get_converter(opset), "SequenceInsert": SequenceInsert.get_converter(opset), + "SequenceLength": SequenceLength.get_converter(opset), "ConcatFromSequence": ConcatFromSequence.get_converter(opset), "SplitToSequence": SplitToSequence.get_converter(opset), "SequenceAt": SequenceAt.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 6a780a632fb7..3e1af4086784 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -7747,10 +7747,17 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= outputs=["inserted_sequence"], ) + # Test sequence erase. + erase_node = helper.make_node( + "SequenceErase", + inputs=["inserted_sequence", "position"], + outputs=["erased_sequence"], + ) + # Test sequence concatenation. concat_node = helper.make_node( "ConcatFromSequence", - inputs=["inserted_sequence"], + inputs=["erased_sequence"], outputs=["concat_sequence"], axis=axis, ) @@ -7796,6 +7803,7 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= position_node, construct_node, insert_node, + erase_node, concat_node, split_node, at_node, From 23cf2f7b0ceb3c8a5e849e47503a844765f49385 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 31 Jan 2023 15:48:13 +0400 Subject: [PATCH 248/286] [ONNX] Support SequenceEmpty op (#13866) * add SequenceEmpty * add SequenceEmpty test * pylint fix --------- Co-authored-by: Valery Chernov --- python/tvm/relay/frontend/onnx.py | 10 +++++++ tests/python/frontend/onnx/test_forward.py | 32 ++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 93429a863889..8b4a0cc5e8d3 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -6148,6 +6148,15 @@ def _impl_v11(cls, inputs, attr, params): return _expr.Tuple(inputs) +class SequenceEmpty(OnnxOpConverter): + """Operator converter for sequence empty op.""" + + @classmethod + def _impl_v11(cls, inputs, attr, params): + # Construct an empty tuple. 
+ return _expr.Tuple([]) + + class SequenceErase(OnnxOpConverter): """Operator converter for sequence erase op.""" @@ -6523,6 +6532,7 @@ def _get_convert_map(opset): "LinearRegressor": LinearRegressor.get_converter(opset), # Sequence operators "SequenceConstruct": SequenceConstruct.get_converter(opset), + "SequenceEmpty": SequenceEmpty.get_converter(opset), "SequenceErase": SequenceErase.get_converter(opset), "SequenceInsert": SequenceInsert.get_converter(opset), "SequenceLength": SequenceLength.get_converter(opset), diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 3e1af4086784..dd172d1ddea6 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -7829,6 +7829,38 @@ def verify_sequence_ops(tensor_shape, num_tensors, axis=0, position=0, new_axis= verify_sequence_ops((3, 3, 3, 3), 4, axis=2, new_axis=1) +@tvm.testing.parametrize_targets +def test_empty_sequence(target, dev): + """test_empty_sequence""" + + # Test creating an empty tensor sequence. + empty_node = helper.make_node( + "SequenceEmpty", + inputs=[], + outputs=["empty_sequence"], + ) + + length_node = helper.make_node("SequenceLength", inputs=["empty_sequence"], outputs=["output"]) + + graph_outputs = [helper.make_tensor_value_info("output", TensorProto.INT64, [])] + + graph_nodes = [empty_node, length_node] + + graph = helper.make_graph( + graph_nodes, + "Sequence_empty_test", + inputs=[], + outputs=graph_outputs, + ) + + model = helper.make_model( + graph, + producer_name="Sequence_empty_test", + ) + + verify_with_ort_with_inputs(model, [], target=target, dev=dev) + + def test_exporting_node_renamed_model(): """test exproting model when export_node_renamed_model is set""" From f258e2c0488d246f2553f2313b03299a74b42652 Mon Sep 17 00:00:00 2001 From: Liam Sturge <50229489+Liam-Sturge@users.noreply.github.com> Date: Tue, 31 Jan 2023 14:29:04 +0000 Subject: [PATCH 249/286] [CI] NNPACK build issue workaround (#13873) NNPACK build issue workaround There is currently a problem building and installing NNPACK using the script `ubuntu_install_nnpack.sh`. The build fails to complete with CMake errors, complaining about missing libraries. This issue is due to a change in the imported dependency pytorch/cpuinfo that has moved the default branch name to 'main'. This patch has been submitted as a workaround to the issue raised at https://github.com/apache/tvm/issues/13871 --- docker/install/ubuntu_install_nnpack.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh index e4a37f56f7eb..91d153aef840 100755 --- a/docker/install/ubuntu_install_nnpack.sh +++ b/docker/install/ubuntu_install_nnpack.sh @@ -26,7 +26,7 @@ git clone https://github.com/Maratyszcza/NNPACK NNPACK git clone https://github.com/Maratyszcza/pthreadpool NNPACK/pthreadpool # Use specific versioning tag. -(cd NNPACK && git checkout 1e005b0c2) +(cd NNPACK && sed -i 's/GIT_TAG master/GIT_TAG main/g' ./cmake/DownloadCpuinfo.cmake && git checkout 1e005b0c2) (cd NNPACK/pthreadpool && git checkout 13da0b4c) mkdir -p NNPACK/build From 044a6931a4d6125e4517cdada29dd1e25f451b44 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 31 Jan 2023 12:24:41 -0800 Subject: [PATCH 250/286] [microTVM]Refactor test and add skip to current failing tests/boards (#13858) This PR refactors some of the Zephyr tests and adds skip for each test/board that is currently failing. 
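For reference, the refactored tests read board properties straight from the shared ZEPHYR_BOARDS dictionary (loaded once from boards.json in tests/micro/zephyr/utils.py) instead of going through the removed qemu_boards() and has_fpu() helpers. A minimal sketch of the lookup pattern the tests converge on is shown below; the board name and the absolute import path are illustrative only (the tests themselves use a relative `from . import utils`):

    import pytest
    from tests.micro.zephyr import utils  # import path assumed for illustration

    board = "qemu_x86"  # illustrative board name
    props = utils.ZEPHYR_BOARDS[board]  # full property dict parsed from boards.json
    if props["is_qemu"]:
        config_main_stack_size = 1536  # QEMU targets get a larger main stack
    if not props["fpu"]:
        pytest.skip(f"FPU not enabled for {board}")

Keeping all board metadata in one dictionary also avoids re-reading boards.json inside every helper call.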
--- tests/micro/common/test_autotune.py | 3 + tests/micro/common/test_tvmc.py | 3 + tests/micro/zephyr/test_ms_tuning.py | 7 +- tests/micro/zephyr/test_zephyr.py | 70 +++++++------------ tests/micro/zephyr/test_zephyr_aot_exec.py | 13 ++-- .../zephyr/test_zephyr_aot_exec_standalone.py | 9 ++- tests/micro/zephyr/test_zephyr_armv7m.py | 21 ++---- tests/micro/zephyr/utils.py | 28 +------- 8 files changed, 55 insertions(+), 99 deletions(-) diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py index 46f6d8889a9a..14c0290c7b4f 100644 --- a/tests/micro/common/test_autotune.py +++ b/tests/micro/common/test_autotune.py @@ -31,6 +31,9 @@ @pytest.mark.requires_hardware @tvm.testing.requires_micro +@pytest.mark.skip_boards( + ["nucleo_l4r5zi", "", "nucleo_f746zg", "stm32f746g_disco", "nrf5340dk_nrf5340_cpuapp"] +) def test_kws_autotune_workflow(platform, board, tmp_path): mod, params = fetch_model_from_url( url="https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/examples/micro_speech/micro_speech.tflite", diff --git a/tests/micro/common/test_tvmc.py b/tests/micro/common/test_tvmc.py index 3aa7fec2f299..1e1249b65ed8 100644 --- a/tests/micro/common/test_tvmc.py +++ b/tests/micro/common/test_tvmc.py @@ -128,6 +128,9 @@ def test_tvmc_model_build_only(platform, board, output_dir): "output_dir,", [pathlib.Path("./tvmc_relative_path_test"), pathlib.Path(tempfile.mkdtemp())], ) +@pytest.mark.skip_boards( + ["nucleo_l4r5zi", "", "nucleo_f746zg", "stm32f746g_disco", "nrf5340dk_nrf5340_cpuapp"] +) def test_tvmc_model_run(platform, board, output_dir): target = tvm.micro.testing.get_target(platform, board) diff --git a/tests/micro/zephyr/test_ms_tuning.py b/tests/micro/zephyr/test_ms_tuning.py index 16d48ca4cdd6..560f5e09596a 100644 --- a/tests/micro/zephyr/test_ms_tuning.py +++ b/tests/micro/zephyr/test_ms_tuning.py @@ -22,6 +22,7 @@ import tvm from tvm import relay +import tvm.micro.testing from tvm.relay.backend import Executor from tvm.contrib import graph_executor, utils from tvm import meta_schedule as ms @@ -61,7 +62,7 @@ def create_relay_module(): @tvm.testing.requires_micro -@pytest.mark.skip_boards(["mps2_an521", "mps3_an547"]) +@pytest.mark.skip_boards(["mps2_an521", "mps3_an547", "nucleo_f746zg", "stm32f746g_disco"]) def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test meta-schedule tuning for microTVM Zephyr""" @@ -92,7 +93,7 @@ def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_ boards_file = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) / "boards.json" with open(boards_file) as f: boards = json.load(f) - target = tvm.target.target.micro(model=boards[project_options["board"]]["model"]) + target = tvm.micro.testing.get_target("zephyr", board) runtime = relay.backend.Runtime("crt", {"system-lib": True}) executor = Executor("aot", {"link-params": True}) @@ -158,7 +159,7 @@ def test_ms_tuning_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_ # Build reference model (without tuning) dev = tvm.cpu() - target = tvm.target.target.micro(model="host") + target = tvm.micro.testing.get_target("crt") with tvm.transform.PassContext( opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["AlterOpLayout"] ): diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index f86f4a7a7f3f..89bd9c75fbe9 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -40,8 +40,7 @@ def 
_make_sess_from_op( temp_dir, - model, - zephyr_board, + board, op_name, sched, arg_bufs, @@ -50,23 +49,23 @@ def _make_sess_from_op( serial_number, ): runtime = Runtime("crt", {"system-lib": True}) - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) target = tvm.target.Target(target=target, host=target) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.build(sched, arg_bufs, target=target, runtime=runtime, name=op_name) - return _make_session(temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number) + return _make_session(temp_dir, board, mod, build_config, use_fvp, serial_number) -def _make_session(temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number): +def _make_session(temp_dir, board, mod, build_config, use_fvp, serial_number): config_main_stack_size = None - if utils.qemu_boards(zephyr_board): + if utils.ZEPHYR_BOARDS[board]["is_qemu"]: config_main_stack_size = 1536 project_options = { "project_type": "host_driven", "verbose": bool(build_config.get("debug")), - "board": zephyr_board, + "board": board, "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55", "use_fvp": bool(use_fvp), "serial_number": serial_number, @@ -85,17 +84,14 @@ def _make_session(temp_dir, zephyr_board, mod, build_config, use_fvp, serial_num return tvm.micro.Session(project.transport()) -def _make_add_sess( - temp_dir, model, zephyr_board, build_config, use_fvp, serial_number, dtype="int8" -): +def _make_add_sess(temp_dir, board, build_config, use_fvp, serial_number, dtype="int8"): A = tvm.te.placeholder((2,), dtype=dtype) B = tvm.te.placeholder((1,), dtype=dtype) C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C") sched = tvm.te.create_schedule(C.op) return _make_sess_from_op( temp_dir, - model, - zephyr_board, + board, "add", sched, [A, B, C], @@ -111,8 +107,6 @@ def _make_add_sess( @pytest.mark.xfail_on_fvp() def test_add_uint(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test compiling the on-device runtime.""" - - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -128,7 +122,7 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.numpy() == np.array([6, 7])).all() - with _make_add_sess(workspace_dir, model, board, build_config, use_fvp, serial_number) as sess: + with _make_add_sess(workspace_dir, board, build_config, use_fvp, serial_number) as sess: test_basic_add(sess) @@ -138,8 +132,7 @@ def test_basic_add(sess): @pytest.mark.xfail_on_fvp() def test_add_float(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test compiling the on-device runtime.""" - model = utils.ZEPHYR_BOARDS[board] - if not utils.has_fpu(board): + if not utils.ZEPHYR_BOARDS[board]["fpu"]: pytest.skip(f"FPU not enabled for {board}") build_config = {"debug": microtvm_debug} @@ -159,7 +152,6 @@ def test_basic_add(sess): with _make_add_sess( workspace_dir, - model, board, build_config, use_fvp, @@ -174,8 +166,6 @@ def test_basic_add(sess): @pytest.mark.xfail_on_fvp() def test_platform_timer(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test compiling the on-device runtime.""" - - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. 
@@ -196,7 +186,7 @@ def test_basic_add(sess): assert result.mean > 0 assert len(result.results) == 3 - with _make_add_sess(workspace_dir, model, board, build_config, use_fvp, serial_number) as sess: + with _make_add_sess(workspace_dir, board, build_config, use_fvp, serial_number) as sess: test_basic_add(sess) @@ -205,7 +195,6 @@ def test_basic_add(sess): @pytest.mark.xfail_on_fvp() def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Testing a simple relay graph""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} shape = (10,) dtype = "int8" @@ -218,7 +207,7 @@ def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): ir_mod = tvm.IRModule.from_expr(func) runtime = Runtime("crt", {"system-lib": True}) - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(ir_mod, target=target, runtime=runtime) @@ -239,7 +228,6 @@ def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): @pytest.mark.xfail_on_fvp() def test_onnx(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Testing a simple ONNX model.""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} this_dir = pathlib.Path(os.path.dirname(__file__)) @@ -262,7 +250,7 @@ def test_onnx(workspace_dir, board, microtvm_debug, use_fvp, serial_number): # There is currently a bug preventing the host_driven environment from receiving # the model weights when set using graph_mod.set_input(). # See: https://github.com/apache/tvm/issues/7567 - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): executor = Executor("graph", {"link-params": True}) runtime = Runtime("crt", {"system-lib": True}) @@ -292,8 +280,7 @@ def test_onnx(workspace_dir, board, microtvm_debug, use_fvp, serial_number): def check_result( temp_dir, relay_mod, - model, - zephyr_board, + board, map_inputs, out_shape, result, @@ -304,13 +291,11 @@ def check_result( """Helper function to verify results""" TOL = 1e-5 runtime = Runtime("crt", {"system-lib": True}) - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(relay_mod, target=target, runtime=runtime) - with _make_session( - temp_dir, zephyr_board, mod, build_config, use_fvp, serial_number - ) as session: + with _make_session(temp_dir, board, mod, build_config, use_fvp, serial_number) as session: rt_mod = tvm.micro.create_local_graph_executor( mod.get_graph_json(), session.get_system_lib(), session.device ) @@ -334,7 +319,6 @@ def check_result( @pytest.mark.xfail_on_fvp() def test_byoc_microtvm(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """This is a simple test case to check BYOC capabilities of microTVM""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} x = relay.var("x", shape=(10, 10)) w0 = relay.var("w0", shape=(10, 10)) @@ -387,22 +371,19 @@ def test_byoc_microtvm(workspace_dir, board, microtvm_debug, use_fvp, serial_num ), axis=0, ), - model=model, - zephyr_board=board, + board=board, build_config=build_config, use_fvp=use_fvp, serial_number=serial_number, ) -def _make_add_sess_with_shape( - temp_dir, model, zephyr_board, 
shape, build_config, use_fvp, serial_number -): +def _make_add_sess_with_shape(temp_dir, board, shape, build_config, use_fvp, serial_number): A = tvm.te.placeholder(shape, dtype="int8") C = tvm.te.compute(A.shape, lambda i: A[i] + A[i], name="C") sched = tvm.te.create_schedule(C.op) return _make_sess_from_op( - temp_dir, model, zephyr_board, "add", sched, [A, C], build_config, use_fvp, serial_number + temp_dir, board, "add", sched, [A, C], build_config, use_fvp, serial_number ) @@ -419,7 +400,6 @@ def _make_add_sess_with_shape( @pytest.mark.xfail_on_fvp() def test_rpc_large_array(workspace_dir, board, microtvm_debug, shape, use_fvp, serial_number): """Test large RPC array transfer.""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} # NOTE: run test in a nested function so cPython will delete arrays before closing the session. @@ -432,7 +412,7 @@ def test_tensors(sess): assert (C_data.numpy() == np.zeros(shape)).all() with _make_add_sess_with_shape( - workspace_dir, model, board, shape, build_config, use_fvp, serial_number + workspace_dir, board, shape, build_config, use_fvp, serial_number ) as sess: test_tensors(sess) @@ -445,7 +425,6 @@ def test_autotune_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_n pytest.xfail(f"Autotune fails on {board}.") runtime = Runtime("crt", {"system-lib": True}) - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} # Create a Relay model @@ -473,14 +452,14 @@ def test_autotune_conv2d(workspace_dir, board, microtvm_debug, use_fvp, serial_n ).astype("float32") params = {mod["main"].params[1].name_hint: weight_sample} - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}) with pass_context: tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target) assert len(tasks) > 0 config_main_stack_size = None - if utils.qemu_boards(board): + if utils.ZEPHYR_BOARDS[board]["is_qemu"]: config_main_stack_size = 1536 project_options = { @@ -572,9 +551,10 @@ def test_schedule_build_with_cmsis_dependency(workspace_dir, board, microtvm_deb """Test Relay schedule with CMSIS dependency. This test shows if microTVM Auto tuning with Zephyr breaks if CMSIS dependency was required for a schedule. """ - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} - target = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"]) + target = tvm.target.target.micro( + utils.ZEPHYR_BOARDS[board]["model"], options=["-keys=arm_cpu,cpu"] + ) if not target.features.has_dsp: pytest.skip(f"ISA does not support DSP. target: {target}") diff --git a/tests/micro/zephyr/test_zephyr_aot_exec.py b/tests/micro/zephyr/test_zephyr_aot_exec.py index a67cf0830a70..d42c7a00b40e 100644 --- a/tests/micro/zephyr/test_zephyr_aot_exec.py +++ b/tests/micro/zephyr/test_zephyr_aot_exec.py @@ -19,6 +19,7 @@ import tvm import tvm.testing +import tvm.micro.testing import tvm.relay as relay from tvm.relay.backend import Executor, Runtime from tvm.contrib import utils @@ -26,9 +27,9 @@ from . 
import utils -def _make_session(workspace_dir, zephyr_board, mod, build_config, use_fvp, serial_number): +def _make_session(workspace_dir, board, mod, build_config, use_fvp, serial_number): config_main_stack_size = None - if utils.qemu_boards(zephyr_board): + if utils.ZEPHYR_BOARDS[board]["is_qemu"]: # fyi: qemu_riscv64 seems to be the greediest stack user config_main_stack_size = 4096 else: @@ -38,7 +39,7 @@ def _make_session(workspace_dir, zephyr_board, mod, build_config, use_fvp, seria project_options = { "project_type": "host_driven", "verbose": bool(build_config.get("debug")), - "board": zephyr_board, + "board": board, "arm_fvp_path": "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/FVP_Corstone_SSE-300_Ethos-U55", "use_fvp": bool(use_fvp), "serial_number": serial_number, @@ -63,7 +64,6 @@ def _make_session(workspace_dir, zephyr_board, mod, build_config, use_fvp, seria def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Testing a simple relay graph""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} shape = (10,) dtype = "int8" @@ -77,7 +77,7 @@ def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): runtime = Runtime("crt", {"system-lib": True}) executor = Executor("aot") - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(ir_mod, target=target, runtime=runtime, executor=executor) @@ -98,7 +98,6 @@ def test_relay(workspace_dir, board, microtvm_debug, use_fvp, serial_number): def test_aot_executor(workspace_dir, board, microtvm_debug, use_fvp, serial_number): """Test use of the AOT executor with microTVM.""" - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} shape = (10,) dtype = "int8" @@ -117,7 +116,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), uint8]) { runtime = Runtime("crt", {"system-lib": True}) executor = Executor("aot") - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.relay.build(relay_mod, target=target, runtime=runtime, executor=executor) diff --git a/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py b/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py index 9e015448e91b..16c1f9e30814 100644 --- a/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py +++ b/tests/micro/zephyr/test_zephyr_aot_exec_standalone.py @@ -22,6 +22,7 @@ import tvm import tvm.testing +import tvm.micro.testing from tvm.micro.project_api import server import tvm.relay as relay from tvm.relay.backend import Executor, Runtime @@ -34,7 +35,6 @@ @pytest.mark.skip_boards(["mps2_an521", "mps3_an547"]) def test_tflite(workspace_dir, board, microtvm_debug, serial_number): """Testing a TFLite model.""" - model = utils.ZEPHYR_BOARDS[board] input_shape = (1, 49, 10, 1) output_shape = (1, 12) build_config = {"debug": microtvm_debug} @@ -58,7 +58,7 @@ def test_tflite(workspace_dir, board, microtvm_debug, serial_number): tflite_model, shape_dict={"input_1": input_shape}, dtype_dict={"input_1 ": "int8"} ) - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) executor = Executor( "aot", {"unpacked-api": True, "interface-api": "c", "workspace-byte-alignment": 4} ) @@ -90,10 +90,9 @@ def test_tflite(workspace_dir, board, microtvm_debug, 
serial_number): @pytest.mark.skip_boards(["mps2_an521", "mps3_an547"]) def test_qemu_make_fail(workspace_dir, board, microtvm_debug, serial_number): """Testing QEMU make fail.""" - if board not in ["qemu_x86", "mps2_an521", "mps3_an547"]: + if not utils.ZEPHYR_BOARDS[board]["is_qemu"]: pytest.skip(msg="Only for QEMU targets.") - model = utils.ZEPHYR_BOARDS[board] build_config = {"debug": microtvm_debug} shape = (10,) dtype = "float32" @@ -105,7 +104,7 @@ def test_qemu_make_fail(workspace_dir, board, microtvm_debug, serial_number): func = relay.Function([x], z) ir_mod = tvm.IRModule.from_expr(func) - target = tvm.target.target.micro(model) + target = tvm.micro.testing.get_target("zephyr", board) executor = Executor("aot") runtime = Runtime("crt") with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py index eb709382024d..cd589a19e886 100644 --- a/tests/micro/zephyr/test_zephyr_armv7m.py +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -95,21 +95,12 @@ def _apply_desired_layout_no_simd(relay_mod): @tvm.testing.requires_micro -@pytest.mark.skip_boards(["mps2_an521"]) +@pytest.mark.skip_boards( + ["mps2_an521", "stm32f746g_disco", "nucleo_f746zg", "nucleo_l4r5zi", "nrf5340dk_nrf5340_cpuapp"] +) @pytest.mark.xfail(reason="due https://github.com/apache/tvm/issues/12619") def test_armv7m_intrinsic(workspace_dir, board, microtvm_debug, serial_number): """Testing a ARM v7m SIMD extension.""" - if board not in [ - "mps2_an521", - "stm32f746g_disco", - "nucleo_f746zg", - "nucleo_l4r5zi", - "nrf5340dk_nrf5340_cpuapp", - ]: - pytest.skip(msg="Platform does not support ARM v7m SIMD extension.") - - model = utils.ZEPHYR_BOARDS[board] - build_config = {"debug": microtvm_debug} this_dir = pathlib.Path(os.path.dirname(__file__)) @@ -123,8 +114,10 @@ def test_armv7m_intrinsic(workspace_dir, board, microtvm_debug, serial_number): # kernel layout "HWIO" is not supported by arm_cpu SIMD extension (see tvm\python\relay\op\strategy\arm_cpu.py) relay_mod_no_simd = _apply_desired_layout_no_simd(relay_mod) - target = tvm.target.target.micro(model, options=["-keys=cpu"]) - target_simd = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"]) + target = tvm.target.target.micro(utils.ZEPHYR_BOARDS[board]["model"], options=["-keys=cpu"]) + target_simd = tvm.target.target.micro( + utils.ZEPHYR_BOARDS[board]["model"], options=["-keys=arm_cpu,cpu"] + ) executor = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) runtime = Runtime("crt") diff --git a/tests/micro/zephyr/utils.py b/tests/micro/zephyr/utils.py index bdac4e9c63a7..26f9d6a10e2d 100644 --- a/tests/micro/zephyr/utils.py +++ b/tests/micro/zephyr/utils.py @@ -41,41 +41,19 @@ TEMPLATE_PROJECT_DIR = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) -BOARDS = TEMPLATE_PROJECT_DIR / "boards.json" - _LOG = logging.getLogger(__name__) def zephyr_boards() -> dict: - """Returns a dict mapping board to target model""" - with open(BOARDS) as f: + """Returns Zephyr board properties""" + with open(TEMPLATE_PROJECT_DIR / "boards.json") as f: board_properties = json.load(f) - - boards_model = {board: info["model"] for board, info in board_properties.items()} - return boards_model + return board_properties ZEPHYR_BOARDS = zephyr_boards() -def qemu_boards(board: str): - """Returns True if board is QEMU.""" - with open(BOARDS) as f: - board_properties = json.load(f) - - qemu_boards = [name for name, board in 
board_properties.items() if board["is_qemu"]] - return board in qemu_boards - - -def has_fpu(board: str): - """Returns True if board has FPU.""" - with open(BOARDS) as f: - board_properties = json.load(f) - - fpu_boards = [name for name, board in board_properties.items() if board["fpu"]] - return board in fpu_boards - - def build_project( temp_dir, zephyr_board, mod, build_config, serial_number, simd=False, extra_files_tar=None ): From 14462f74111996c93391ae8255bacd794c6c92ab Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 1 Feb 2023 01:20:58 +0400 Subject: [PATCH 251/286] [TIR][TOPI][CI] Fix number of arguments in calls of llvm_pure_intrin (#13881) fix number of arguments in calls of llvm_pure_intrin Co-authored-by: Valery Chernov --- python/tvm/tir/tensor_intrin/x86.py | 2 +- python/tvm/topi/x86/tensor_intrin.py | 12 ++++++------ .../test_meta_schedule_postproc_rewrite_tensorize.py | 2 +- .../unittest/test_meta_schedule_trace_apply.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index c527d0d21008..b4b6f07cd90e 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -59,7 +59,7 @@ def dot_product_16x4_u8i8i32_vnni( C[T.ramp(T.int32(0), 1, 16)] = T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"), - T.uint32(0), + T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index 3b83fecbf552..519922a2bd4a 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -120,14 +120,14 @@ def _instr(index): pair_reduction = tvm.tir.call_llvm_pure_intrin( int_lx32, pmaddubs, - tvm.tir.const(0, "uint32"), + tvm.tir.const(2, "uint32"), vec_a, vec_b, ) quad_reduction = tvm.tir.call_llvm_pure_intrin( int_32xl, pmaddw, - tvm.tir.const(0, "uint32"), + tvm.tir.const(2, "uint32"), pair_reduction, vec_one, ) @@ -215,7 +215,7 @@ def _instr(index): pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", - tvm.tir.const(0, "uint32"), + tvm.tir.const(2, "uint32"), vec_a, vec_b, ) @@ -309,7 +309,7 @@ def _instr(index): quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.vpdpbusd.512", - tvm.tir.const(0, "uint32"), + tvm.tir.const(3, "uint32"), vec_c, vec_ai32, vec_bi32, @@ -321,14 +321,14 @@ def _instr(index): pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", - tvm.tir.const(0, "uint32"), + tvm.tir.const(2, "uint32"), vec_a, vec_b, ) quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.pmaddw.d.512", - tvm.tir.const(0, "uint32"), + tvm.tir.const(2, "uint32"), pair_reduction, vec_one, ) diff --git a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py index 1d51b932f359..21755e1338eb 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py +++ b/tests/python/unittest/test_meta_schedule_postproc_rewrite_tensorize.py @@ -236,7 +236,7 @@ def main( C_i32x16 = C.vload([0], dtype="int32x16") C[T.ramp(0, 1, 16)] = T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"), - T.uint32(0), + T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py 
b/tests/python/unittest/test_meta_schedule_trace_apply.py index 43b9eb8bbb19..aadc530a9ba8 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -1182,7 +1182,7 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, B_i8x64: T.int8x64 = B[0, 0:64] B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") C_i32x16: T.int32x16 = C[0:16] - C[0:16] = T.call_llvm_pure_intrin(T.uint32(intrin_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + C[0:16] = T.call_llvm_pure_intrin(T.uint32(intrin_id), T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): From 9be25a2d95ab7cba4a8ba796b7c7f6570a0e35ea Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 1 Feb 2023 15:15:16 +0800 Subject: [PATCH 252/286] [ARITH] support floordiv in deduce bound (#13880) * support floordiv in deduce bound * add rule for (x // -positive) * leave todo for x // a == b --- src/arith/bound_deducer.cc | 61 ++++++++++++++++--- .../unittest/test_arith_deduce_bound.py | 38 +++++++++++- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc index d4a3101378b0..7cfe8681bea3 100644 --- a/src/arith/bound_deducer.cc +++ b/src/arith/bound_deducer.cc @@ -94,6 +94,13 @@ class BoundDeducer : public ExprFunctor { void VisitExprDefault_(const Object* op) final { success_ = false; } + SignType GetSignType(const PrimExpr& e) { + if (e.dtype().is_uint()) { + return kPositive; + } + return expr_map_[e].GetSignType(); + } + void VisitExpr_(const VarNode* op) final {} void VisitExpr_(const AddNode* op) final { @@ -119,13 +126,7 @@ class BoundDeducer : public ExprFunctor { PrimExpr operand = left ? op->b : op->a; PrimExpr target_var = left ? op->a : op->b; - SignType sign_operand; - if (operand.dtype().is_uint()) { - sign_operand = kPositive; - } else { - sign_operand = expr_map_[operand].GetSignType(); - } - + SignType sign_operand = GetSignType(operand); if (sign_operand == SignType::kNegative) { comp_op = ReverseOp(comp_op); } else if (sign_operand == SignType::kUnknown) { @@ -162,6 +163,52 @@ class BoundDeducer : public ExprFunctor { this->VisitExpr(left ? op->a : op->b); } + void VisitExpr_(const FloorDivNode* op) final { + if (op->b.get() == path_[iter_]) { + // Skip cases where the var is divisor. 
+ success_ = false; + return; + } + PrimExpr divisor = op->b; + if (analyzer_.CanProveEqual(divisor, 0)) { + // Skip zero divisor + success_ = false; + return; + } + + SignType sign_operand = GetSignType(divisor); + if (sign_operand == SignType::kNegative) { + comp_op = ReverseOp(comp_op); + divisor = -divisor; + result_ = -result_; + } else if (sign_operand == SignType::kUnknown) { + // unable to get the sign of operand + success_ = false; + return; + } + + if (comp_op == kGreater) { + // (x // 6 >= 4 --> x >= 4 * 6) + result_ = result_ * divisor; + } else if (comp_op == kEqual) { + // The bound is not single directional + // (x // 6 == 4 --> 30 > x >= 24) + // TODO(@wrongtest): support bidirectional bound + success_ = false; + return; + } else { + // (x // 6 <= 4 --> x <= 4 * 6 + 5) + result_ = result_ * divisor + divisor - 1; + } + if (sign_operand == SignType::kNegative) { + // (x // -6 >= 4 --> -((x + 6 - 1) // 6) >= 4 + // --> (x + 6 - 1) // 6 <= -4 + result_ = result_ - divisor + 1; + } + + this->VisitExpr(op->a); + } + PrimExpr result_; CompareOp comp_op{kGreater}; bool success_{true}; diff --git a/tests/python/unittest/test_arith_deduce_bound.py b/tests/python/unittest/test_arith_deduce_bound.py index d5e0303b05b2..45ecb6275549 100644 --- a/tests/python/unittest/test_arith_deduce_bound.py +++ b/tests/python/unittest/test_arith_deduce_bound.py @@ -219,7 +219,6 @@ def test_non_support(lhs): res = tvm.arith.deduce_bound(a, lhs < 10, {}, {}) assert res.is_nothing() - test_non_support(tvm.tir.floordiv(a, 16)) test_non_support(tvm.tir.floormod(a, 16)) test_non_support(tvm.tir.Min(a, 16)) test_non_support(tvm.tir.Max(a, 16)) @@ -233,5 +232,42 @@ def test_non_support(lhs): test_non_support(tvm.tir.BufferLoad(decl_buffer([16], "int32"), [a])) +def test_deduce_floordiv(): + def do_test(gen_expr, dom_map, expect_min, expect_max): + a = te.var("a") + expr = gen_expr(a) + res = tvm.arith.deduce_bound(a, expr, dom_map, dom_map) + if isinstance(expect_min, str): + assert str(res.min_value) == expect_min + else: + tvm.testing.assert_prim_expr_equal(res.min_value, expect_min) + if isinstance(expect_max, str): + assert str(res.max_value) == expect_max + else: + tvm.testing.assert_prim_expr_equal(res.max_value, expect_max) + + # test basic cases + do_test(lambda a: a // 8 > 3, {}, 32, "pos_inf") + do_test(lambda a: a // 8 >= 3, {}, 24, "pos_inf") + do_test(lambda a: a // 8 < 3, {}, "neg_inf", 23) + do_test(lambda a: a // 8 <= 3, {}, "neg_inf", 31) + do_test(lambda a: a // 8 == 3, {}, "pos_inf", "neg_inf") + do_test(lambda a: a // 8 > -3, {}, -16, "pos_inf") + do_test(lambda a: a // 8 >= -3, {}, -24, "pos_inf") + do_test(lambda a: a // -8 > 3, {}, "neg_inf", -32) + do_test(lambda a: a // -8 >= 3, {}, "neg_inf", -24) + do_test(lambda a: a // -8 < 3, {}, -23, "pos_inf") + do_test(lambda a: a // -8 <= 3, {}, -31, "pos_inf") + do_test(lambda a: 8 // a >= 2, {}, "pos_inf", "neg_inf") + + # test nested cases + b = te.var("b") + bs = {b: tvm.arith.IntervalSet(2, 6)} + do_test(lambda a: b * 3 + a // 8 < 63, bs, "neg_inf", 359) + do_test(lambda a: b * 3 + a // 8 <= 63, bs, "neg_inf", 367) + do_test(lambda a: b * 3 + a // 8 > 63, bs, 464, "pos_inf") + do_test(lambda a: b * 3 + a // 8 >= 63, bs, 456, "pos_inf") + + if __name__ == "__main__": tvm.testing.main() From e02f3c3816971dfc1471c21f2e0a4ad2f44392b3 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:34:15 +0100 Subject: [PATCH 253/286] CI fixes Remove locals that became unused in stmt.cc and inject_gemmini_pointer_correction.cc, fixing the unused-variable warnings that broke the CI build. --- src/tir/ir/stmt.cc | 2 +- src/tir/transforms/inject_gemmini_pointer_correction.cc | 2
-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index c01e6ccaec5f..ff28121db27d 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -35,11 +35,11 @@ namespace tir { LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { ICHECK(value.defined()); ICHECK(body.defined()); - auto vdtype = value.dtype(); // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { // TODO (FP): Is this check really necessary? + // auto vdtype = value.dtype(); // ICHECK(vdtype.is_handle()); } else { ICHECK_EQ(value.dtype(), var.dtype()); diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc index d73f6b9b63ca..54f5692fee2c 100644 --- a/src/tir/transforms/inject_gemmini_pointer_correction.cc +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -83,10 +83,8 @@ class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprM auto info = GetMemoryInfo(scope); ICHECK(info.defined()) << "Cannot find memory info of " << scope; DataType dtype = Downcast(ptr_type->element_type)->dtype; - int dtype_bits = dtype.bits() * dtype.lanes(); int div = dim_; - const IntImmNode* extent_int = extent.as(); PrimExpr inner_offset = indexmod(offset, extent); PrimExpr outer_offset = offset - inner_offset; From 1fb123169b848f72fecfa73a9f5da5a7d09334a5 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:50:44 +0100 Subject: [PATCH 254/286] LINT changes --- .../template_project/microtvm_api_server.py | 2 +- .../src/{Makefile.template => Makefile} | 0 .../src/{Makefrag => Makefrag.mk} | 0 cmake/modules/contrib/Gemmini.cmake | 24 +++++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) rename apps/microtvm/gemmini/template_project/src/{Makefile.template => Makefile} (100%) rename apps/microtvm/gemmini/template_project/src/{Makefrag => Makefrag.mk} (100%) diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index df2f27d315ea..1f55eedf1e3d 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -110,7 +110,7 @@ def _copy_project_files(self, api_server_dir, project_dir, project_type): else: shutil.copy2(item, dest) - shutil.copy2(project_dir / "src" / "Makefile.template", project_dir / "src" / "Makefile") + shutil.copy2(project_dir / "src" / "Makefrag.mk", project_dir / "src" / "Makefrag") test_name = project_type.replace("_example", "") new_line = f"tests = {test_name}\n" diff --git a/apps/microtvm/gemmini/template_project/src/Makefile.template b/apps/microtvm/gemmini/template_project/src/Makefile similarity index 100% rename from apps/microtvm/gemmini/template_project/src/Makefile.template rename to apps/microtvm/gemmini/template_project/src/Makefile diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag b/apps/microtvm/gemmini/template_project/src/Makefrag.mk similarity index 100% rename from apps/microtvm/gemmini/template_project/src/Makefrag rename to apps/microtvm/gemmini/template_project/src/Makefrag.mk diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake index 0d224c74ea75..757a99217510 100644 --- a/cmake/modules/contrib/Gemmini.cmake +++ b/cmake/modules/contrib/Gemmini.cmake @@ -10,9 +10,9 @@ if(USE_GEMMINI) # Dense example project generation 
"apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dense_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dense_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dense_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dense_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dense_example/include" @@ -20,9 +20,9 @@ if(USE_GEMMINI) # CONV2D example project generation "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/conv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/conv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/conv2d_example/include" @@ -30,9 +30,9 @@ if(USE_GEMMINI) # DW CONV2D example project generation "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/dwconv2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/dwconv2d_example/include" @@ -40,9 +40,9 @@ if(USE_GEMMINI) # ADD example project generation "apps/microtvm/gemmini/template_project/src add.c -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/add_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/add_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/add_example/include" @@ -50,9 +50,9 @@ if(USE_GEMMINI) # Max pooling 2d example project generation "apps/microtvm/gemmini/template_project/src maxpool2d.c -> 
gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/maxpool2d_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/maxpool2d_example/include" @@ -60,9 +60,9 @@ if(USE_GEMMINI) # Mobilenet example project generation "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src Makefile.template -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" - "apps/microtvm/gemmini/template_project/src Makefrag -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests build.sh -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests configure.ac -> gemmini/src/mobilenet_example" "3rdparty/gemmini/software/gemmini-rocc-tests/include *.h -> gemmini/src/mobilenet_example/include" From a83f00db20fdbb54634ff97b6987166ba13f6d8c Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 11:59:48 +0100 Subject: [PATCH 255/286] LINT fix --- src/tir/transforms/inject_gemmini_pointer_correction.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tir/transforms/inject_gemmini_pointer_correction.cc b/src/tir/transforms/inject_gemmini_pointer_correction.cc index 54f5692fee2c..4a9260ff014c 100644 --- a/src/tir/transforms/inject_gemmini_pointer_correction.cc +++ b/src/tir/transforms/inject_gemmini_pointer_correction.cc @@ -82,7 +82,6 @@ class CorrectGemminisScratchpadAndAccumulatorPointersInjector : public StmtExprM auto scope = ptr_type->storage_scope; auto info = GetMemoryInfo(scope); ICHECK(info.defined()) << "Cannot find memory info of " << scope; - DataType dtype = Downcast(ptr_type->element_type)->dtype; int div = dim_; From 6fd52872c39fb53f6dc5580728b45c6c8beaeb20 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 12:59:24 +0100 Subject: [PATCH 256/286] Lint fix --- python/tvm/tir/transform/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 82b162ef7df0..a72390997420 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -222,7 +222,7 @@ def CorrectGemminisScratchpadAndAccumulatorPointers(): fpass : tvm.transform.Pass The result pass """ - return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() + return _ffi_api.CorrectGemminisScratchpadAndAccumulatorPointers() # type: ignore def StorageRewrite(): From ccb5732086f1dfe429d5537e2b32d2761b3aac9f Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 13:26:47 +0100 Subject: [PATCH 257/286] LINT fix --- apps/microtvm/gemmini/template_project/src/Makefile | 1 - 1 file changed, 1 
deletion(-) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index 9368836a8802..8849236926b0 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -57,4 +57,3 @@ run-baremetal: $(runs_baremetal) $(RUNNER)$(abs_top_srcdir)/build/$^ junk += $(tests_baremetal) - From e2cffedf4fa9189997fa751cd8c5986b807aaac5 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 13:50:26 +0100 Subject: [PATCH 258/286] pylint fixes --- python/tvm/contrib/gemmini/__init__.py | 1 - python/tvm/contrib/gemmini/build_module.py | 2 +- python/tvm/contrib/gemmini/environment.py | 4 -- python/tvm/contrib/gemmini/helpers.py | 14 +---- python/tvm/contrib/gemmini/intrin.py | 20 +++---- python/tvm/contrib/gemmini/legalize.py | 11 ---- python/tvm/contrib/gemmini/pattern_table.py | 7 +-- python/tvm/contrib/gemmini/transform.py | 58 +++++++++------------ 8 files changed, 38 insertions(+), 79 deletions(-) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py index 9515769fd641..73c2ce6bfcf1 100644 --- a/python/tvm/contrib/gemmini/__init__.py +++ b/python/tvm/contrib/gemmini/__init__.py @@ -20,7 +20,6 @@ **Author**: `Federico Peccia `_ """ -import sys import tvm._ffi.base from .environment import Environment diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index a094147b7a14..8ef934b02ab3 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -190,7 +190,7 @@ def mem_info_acc_buffer(): Returns: node: The corresponding MemoryInfo node """ - spec = Environment.instance() + Environment.instance() return tvm.ir.make_node( "MemoryInfo", unit_bits=env.inp_bits, diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 7d6350d1ebb9..ac98b2c2e738 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -31,10 +31,6 @@ add_mvout_tensorize, ) import re -from pydevicetree import Devicetree -import os -import tvm -import sys from typing import List, Tuple, Dict, Callable from .utils import counters diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 84c028b3d33c..0d84e3039ffe 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -23,20 +23,8 @@ import numpy as np import pathlib from .environment import Environment - -import abc -import collections -import matplotlib -import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top -import PIL.Image as Image -import PIL.ImageColor as ImageColor -import PIL.ImageDraw as ImageDraw -import PIL.ImageFont as ImageFont -import six from six.moves import range -from six.moves import zip -import tensorflow.compat.v1 as tf -from typing import List, Tuple +from typing import List env = Environment.instance() diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 0909e58a890d..51a0fa7a643e 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -399,13 +399,13 @@ def conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - OC = wgt.shape[3] + wgt.shape[3] KH = wgt.shape[0] KW = wgt.shape[1] - N = inp.shape[0] - IH = inp.shape[1] - IW = 
inp.shape[2] + inp.shape[0] + inp.shape[1] + inp.shape[2] IC = inp.shape[3] ric = te.reduce_axis((0, IC), name="ric") @@ -571,14 +571,14 @@ def dw_conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - OC = wgt.shape[0] + wgt.shape[0] KH = wgt.shape[1] KW = wgt.shape[2] - N = inp.shape[0] - IH = inp.shape[1] - IW = inp.shape[2] - IC = inp.shape[3] + inp.shape[0] + inp.shape[1] + inp.shape[2] + inp.shape[3] rkh = te.reduce_axis((0, KH), name="rkh") rkw = te.reduce_axis((0, KW), name="rkw") @@ -751,7 +751,7 @@ def add_tensorize(env, oshape: Tuple[int, ...]): def intrin_func(ins, outs): """Add intrinsic function""" difm1, difm2 = ins - dout = outs[0] + outs[0] def _body(): irb = tvm.tir.ir_builder.create() diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index 6f279bb512b3..083268d9c469 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ -20,23 +20,12 @@ **Author**: `Federico Peccia `_ """ -from typing import List, Type, Callable -import math - -import numpy as np # type: ignore - import tvm # type: ignore -from tvm import te from tvm import relay from tvm import ir from tvm.relay.dataflow_pattern import DFPatternCallback # type: ignore from tvm.relay.dataflow_pattern import wildcard -from tvm.relay.dataflow_pattern import is_op from tvm.relay.dataflow_pattern import rewrite -from tvm.relay.dataflow_pattern import CallPattern -from tvm.relay.frontend.common import infer_shape as _infer_shape -from tvm.relay.frontend.common import infer_type as _infer_type -from tvm.relay.expr_functor import ExprMutator, ExprVisitor from tvm.relay.op import _make # type: ignore diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index a43f10699c75..9faecbe49d07 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -20,19 +20,15 @@ **Author**: `Federico Peccia `_ """ -from typing import Dict, List, Tuple, Callable, Optional +from typing import Callable, List, Tuple import tvm # type: ignore from tvm import relay -from tvm.target import Target -from tvm.relay.build_module import bind_params_by_name # type: ignore from tvm.relay.op.contrib.register import register_pattern_table # type: ignore from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op from .utils import * -from tvm.topi.utils import const_vector, get_const_int, get_const_float from tvm.relay.frontend.common import infer_shape as _infer_shape -from tvm.relay.frontend.common import infer_type as _infer_type from .environment import Environment @@ -89,7 +85,6 @@ class AddParams: def __init__(self, func_body: tvm.relay.Function): if str(func_body.op) in self.activation_map.keys(): - activation = func_body add_op = func_body.args[0] else: add_op = func_body diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 312217cc8210..22146175ce5a 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -21,13 +21,7 @@ **Author**: `Federico Peccia `_ """ -from numpy import isin import tvm -from tvm import te -from tvm.topi import utils -import numpy as np -from copy import deepcopy -import itertools import ast from tvm.tir.ir_builder import IRBuilder from typing import Dict @@ -273,8 +267,6 @@ def _ftransform(f, mod, ctx): def InsertGemminiFenceOperator(): 
"""Pass to generate the call to the fence instruction at the end of the operator""" - func_name = "" - def _do_fold(stmt): if _match_pragma(stmt, "gemm_end"): irb = tvm.tir.ir_builder.create() @@ -285,7 +277,7 @@ def _do_fold(stmt): return None def _ftransform(f, mod, ctx): - func_name = f.attrs["global_symbol"] + f.attrs["global_symbol"] return f.with_body( tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) ) @@ -303,8 +295,8 @@ def InjectAMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -347,8 +339,8 @@ def InjectAMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -390,8 +382,8 @@ def InjectBMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -433,8 +425,8 @@ def InjectBMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -475,8 +467,8 @@ def InjectDMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -518,8 +510,8 @@ def InjectDMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -561,8 +553,8 @@ def InjectCMVOUTIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -607,8 +599,8 @@ def InjectCMVOUTIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -654,8 +646,8 @@ def InjectCMVINIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -697,8 +689,8 @@ def InjectCMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -740,8 +732,8 @@ def InjectCMVINAccumIntrin(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... 
@@ -782,8 +774,8 @@ def InjectCMVINAccumIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod + tvm.tir.indexdiv + tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... From db3646fb772e64d16454ee24a8f5a886cef09f1b Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 1 Feb 2023 14:25:41 +0100 Subject: [PATCH 259/286] more lint fixes --- python/tvm/relay/backend/contrib/gemmini/gemmini_add.py | 5 +---- .../relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py | 8 ++------ python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py | 5 ++--- .../relay/backend/contrib/gemmini/gemmini_dense_cisc.py | 6 +----- .../contrib/gemmini/gemmini_depthwise_conv2d_cisc.py | 7 +------ .../relay/backend/contrib/gemmini/gemmini_max_pool2d.py | 5 ----- 6 files changed, 7 insertions(+), 29 deletions(-) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index 9f7837c076e5..0be4afebbb9e 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -26,13 +26,10 @@ from tvm import te from tvm import autotvm from tvm import topi -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower from tvm.contrib.gemmini.helpers import get_greater_div -import json env = Environment.instance() @@ -131,7 +128,7 @@ def schedule_add( ifm1, ifm2_op = add_stage.op.input_tensors ifm2, ofm_offset_op = ifm2_op.op.input_tensors - ofm_offset = ofm_offset_op.op.input_tensors[0] + ofm_offset_op.op.input_tensors[0] b, x, y, c = sch[add_stage].op.axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index 6d129a0e8b0f..fdb9213aeb4a 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -28,9 +28,6 @@ from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() @@ -104,7 +101,8 @@ def conv2d_cisc( oshape = (N, OH, OW, OC) if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): - # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions @@ -204,8 +202,6 @@ def _traverse(op): else: pad_data = data - orig_kernel = kernel - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index 03051f193638..d37e1922027d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -26,10 +26,9 @@ from tvm import te from tvm import autotvm from tvm import topi -from 
tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity +from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() @@ -114,7 +113,7 @@ def schedule_gemm( sch = te.create_schedule([x.op for x in outs]) data, weight, bias_op = dense_stage.op.input_tensors - bias = bias_op.op.input_tensors[0] + bias_op.op.input_tensors[0] ##### space definition begin ##### x, y = sch[dense_stage].op.axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 0144563940f9..09097a003ce2 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -21,18 +21,14 @@ **Author**: `Federico Peccia `_ """ -import math -import sys import numpy as np import tvm from tvm import te from tvm import autotvm from tvm import topi -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity +from tvm.autotvm.task.space import OtherOptionEntity from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.contrib.gemmini.intrin import gemm_cisc env = Environment.instance() diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index c67767f783c2..eedbc6b052b0 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -28,9 +28,6 @@ from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() @@ -80,7 +77,7 @@ def depthwise_conv2d_cisc( N = orig_data.shape[0] IH = orig_data.shape[1] IW = orig_data.shape[2] - IC = orig_data.shape[3] + orig_data.shape[3] HSTR = strides[0] WSTR = strides[1] @@ -191,8 +188,6 @@ def _traverse(op): else: pad_data = data - orig_kernel = kernel - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis rkh, rkw = sch[conv2d_stage].op.reduce_axis diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 7d922ddd2db4..292743eff78c 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -21,16 +21,11 @@ **Author**: `Federico Peccia `_ """ -import numpy as np import tvm from tvm import te from tvm import autotvm -from tvm import topi from tvm.contrib.gemmini.environment import Environment -from tvm.contrib.gemmini.build_module import lower -from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity, ReorderEntity -from tvm.contrib.gemmini.helpers import get_greater_div env = Environment.instance() From 9ce2b625a8b34655426bc0dc9742392f6c928f9c Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 08:43:35 +0100 Subject: [PATCH 260/286] Small makefile addition to enable use of math.h functions --- apps/microtvm/gemmini/template_project/src/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index 8849236926b0..b8da778d7eec 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -25,7 +25,6 @@ CFLAGS := $(CFLAGS) \ -fno-common \ -fno-builtin-printf \ -march=rv64gc -Wa,-march=rv64gcxhwacha \ - -lm \ -lgcc \ -I${RISCV_TESTS} \ -I${RISCV_TESTS}/env \ @@ -37,7 +36,6 @@ CFLAGS := $(CFLAGS) \ CFLAGS_BAREMETAL := \ $(CFLAGS) \ - -nostdlib \ -nostartfiles \ -static \ -T $(BENCH_COMMON)/test.ld \ @@ -49,7 +47,7 @@ vpath %.c $(src_dir) %-baremetal: %.c $(GEMMINI_HEADERS) $(CC_BAREMETAL) $(CFLAGS_BAREMETAL) $< $(LFLAGS) -o $@ \ - $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) + $(wildcard $(BENCH_COMMON)/*.c) $(wildcard $(abs_top_srcdir)/model/*.c) $(wildcard $(BENCH_COMMON)/*.S) $(LIBS) -lm run-baremetal: $(runs_baremetal) From cc6ab7214a642cbd8bee3d14d21feaad3679d972 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 08:56:50 +0100 Subject: [PATCH 261/286] moved gemmini how tos to tutorials --- gallery/how_to/work_with_microtvm/micro_gemmini/README.txt | 5 ----- .../micro_gemmini => tutorial}/micro_gemmini_add.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_conv2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_dense.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_dwconv2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_maxpool2d.py | 0 .../micro_gemmini => tutorial}/micro_gemmini_mobilenet.py | 0 7 files changed, 5 deletions(-) delete mode 100644 gallery/how_to/work_with_microtvm/micro_gemmini/README.txt rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_add.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_conv2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_dense.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_dwconv2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_maxpool2d.py (100%) rename gallery/{how_to/work_with_microtvm/micro_gemmini => tutorial}/micro_gemmini_mobilenet.py (100%) diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt b/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt deleted file mode 100644 index 6826cc7ab810..000000000000 --- a/gallery/how_to/work_with_microtvm/micro_gemmini/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -.. _tutorial-micro-gemmini: - -Generate code for the Gemmini accelerator using microTVM ------------------- -These how-tos demonstrate how to deploy models for the Gemmini accelerator using microTVM. 
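The how-tos relocated below drive this template project through the standard microTVM project API. A hedged sketch of that flow ("project_type" is an assumption based on the project_type handling in microtvm_api_server.py, and module stands for the factory object returned by relay.build in the tutorials):

    import pathlib
    import tvm.micro

    template = pathlib.Path("apps/microtvm/gemmini/template_project")
    project = tvm.micro.generate_project(
        template,                       # directory holding microtvm_api_server.py
        module,                         # output of relay.build(...)
        pathlib.Path("dense_project"),  # where the project is generated
        options={"project_type": "dense_example"},  # assumed option name
    )
    project.build()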
diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_add.py rename to gallery/tutorial/micro_gemmini_add.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_conv2d.py rename to gallery/tutorial/micro_gemmini_conv2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dense.py rename to gallery/tutorial/micro_gemmini_dense.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_dwconv2d.py rename to gallery/tutorial/micro_gemmini_dwconv2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_maxpool2d.py rename to gallery/tutorial/micro_gemmini_maxpool2d.py diff --git a/gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py similarity index 100% rename from gallery/how_to/work_with_microtvm/micro_gemmini/micro_gemmini_mobilenet.py rename to gallery/tutorial/micro_gemmini_mobilenet.py From d6dedc957b9e931dccf53cbb75dff70e348f8273 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 13:01:31 +0100 Subject: [PATCH 262/286] Fix docs --- gallery/tutorial/micro_gemmini_add.py | 4 ++-- gallery/tutorial/micro_gemmini_conv2d.py | 4 ++-- gallery/tutorial/micro_gemmini_dense.py | 4 ++-- gallery/tutorial/micro_gemmini_dwconv2d.py | 4 ++-- gallery/tutorial/micro_gemmini_maxpool2d.py | 4 ++-- gallery/tutorial/micro_gemmini_mobilenet.py | 4 ++-- python/tvm/tir/transform/transform.py | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gallery/tutorial/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py index b8521c4b6ae2..c90344aa75f0 100644 --- a/gallery/tutorial/micro_gemmini_add.py +++ b/gallery/tutorial/micro_gemmini_add.py @@ -202,9 +202,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
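The representative_data_gen visible in the hunk headers is the hook TensorFlow Lite uses for post-training int8 quantization, which each tutorial runs before handing the model to the Gemmini backend. A minimal sketch of that step, assuming a Keras model named model (standard TFLite converter API, not code copied from the tutorials):

    import tensorflow as tf

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_data_gen
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    converter.inference_input_type = tf.int8
    converter.inference_output_type = tf.int8
    tflite_model = converter.convert()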
diff --git a/gallery/tutorial/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py index b58881162dcc..14ac6933be98 100644 --- a/gallery/tutorial/micro_gemmini_conv2d.py +++ b/gallery/tutorial/micro_gemmini_conv2d.py @@ -185,9 +185,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py index c9a7caffc71b..22419ad22276 100644 --- a/gallery/tutorial/micro_gemmini_dense.py +++ b/gallery/tutorial/micro_gemmini_dense.py @@ -176,9 +176,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py index 14c39898278e..6030d14ea024 100644 --- a/gallery/tutorial/micro_gemmini_dwconv2d.py +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -175,9 +175,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
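Every underline edit in this patch fixes the same Sphinx rule: a reST section over/underline must be at least as long as the title text, otherwise the build reports "Title underline too short". Because sphinx-gallery scripts keep their titles in comments, the corrected pattern is simply:

    #################################################
    # Exporting and testing the model using microTVM
    # -----------------------------------------------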
diff --git a/gallery/tutorial/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py index 6dbb11695ac2..39f84f88fba5 100644 --- a/gallery/tutorial/micro_gemmini_maxpool2d.py +++ b/gallery/tutorial/micro_gemmini_maxpool2d.py @@ -175,9 +175,9 @@ def representative_data_gen(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py index fdb43096c87d..ca3690fbdb33 100644 --- a/gallery/tutorial/micro_gemmini_mobilenet.py +++ b/gallery/tutorial/micro_gemmini_mobilenet.py @@ -230,9 +230,9 @@ def generate_mobilenet_tflite_model(): with gemmini.build_config(usmp_alg="hill_climb", opt_level=3, disabled_pass=["AlterOpLayout"]): module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME, target=TARGET, params=params) -################################## +################################################# # Exporting and testing the model using microTVM -# -------------------------------- +# ----------------------------------------------- # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. 
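The unchanged context around these hunks shows the compile step all six tutorials share; condensed here, with mod, params, EXECUTOR, RUNTIME and TARGET as defined earlier in each tutorial:

    from tvm import relay
    from tvm.contrib import gemmini

    with gemmini.build_config(usmp_alg="hill_climb", opt_level=3,
                              disabled_pass=["AlterOpLayout"]):
        module = relay.build(mod, executor=EXECUTOR, runtime=RUNTIME,
                             target=TARGET, params=params)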
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index a72390997420..0040f0ae5897 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -217,7 +217,7 @@ def InjectRollingBuffer(): def CorrectGemminisScratchpadAndAccumulatorPointers(): """Corrects the pointer addresses of buffers inside Gemmini's scratchpad and accumulator - Returns: + Returns ------- fpass : tvm.transform.Pass The result pass From 99f69f202161d76e919615c04530ce6f532d6e5f Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 14:54:05 +0100 Subject: [PATCH 263/286] lint changes --- python/tvm/contrib/gemmini/__init__.py | 2 +- python/tvm/contrib/gemmini/build_module.py | 36 +++-- python/tvm/contrib/gemmini/environment.py | 46 +++--- python/tvm/contrib/gemmini/helpers.py | 46 +++--- python/tvm/contrib/gemmini/intrin.py | 140 +++++++++--------- python/tvm/contrib/gemmini/legalize.py | 35 ++--- python/tvm/contrib/gemmini/pattern_table.py | 14 +- python/tvm/contrib/gemmini/transform.py | 58 ++++---- .../backend/contrib/gemmini/gemmini_add.py | 42 +++--- .../contrib/gemmini/gemmini_conv2d_cisc.py | 71 +++++---- .../backend/contrib/gemmini/gemmini_dense.py | 96 ++++++------ .../contrib/gemmini/gemmini_dense_cisc.py | 14 +- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 67 +++++---- .../contrib/gemmini/gemmini_max_pool2d.py | 4 +- .../tvm/relay/backend/contrib/gemmini/op.py | 12 +- 15 files changed, 339 insertions(+), 344 deletions(-) diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py index 73c2ce6bfcf1..02d10645e2a3 100644 --- a/python/tvm/contrib/gemmini/__init__.py +++ b/python/tvm/contrib/gemmini/__init__.py @@ -22,9 +22,9 @@ import tvm._ffi.base +from tvm.relay.backend.contrib.gemmini import * from .environment import Environment from .build_module import build_config, lower, build, preprocess_pass -from tvm.relay.backend.contrib.gemmini import * from .helpers import create_header_file from .utils import * diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index 8ef934b02ab3..fc72a6b03af8 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -21,10 +21,24 @@ """ import tvm - -from .environment import Environment -from .transform import * from tvm import relay +from .environment import Environment +from .transform import ( + InjectAMVINIntrin, + InjectAMVINIntrinTransposed, + InjectBMVINIntrin, + InjectBMVINIntrinTransposed, + InjectCMVOUTIntrin, + InjectCMVOUTIntrinTransposed, + InjectDMVINIntrin, + InjectDMVINIntrinTransposed, + InjectCMVINIntrin, + InjectCMVINIntrinTransposed, + InjectCMVINAccumIntrin, + InjectCMVINAccumIntrinTransposed, + InsertGemminiHeaderOperators, + InsertGemminiFenceOperator, +) from .legalize import LegalizeGemmini @@ -145,7 +159,7 @@ def build(*args, **kwargs): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_scope}") def mem_info_inp_buffer(): """Creates the information about the local.scratchpad memory node @@ -164,7 +178,7 @@ def mem_info_inp_buffer(): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().scr_wgt_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().scr_wgt_scope}") def mem_info_wgt_buffer(): """Creates the information about the 
local.scratchpad_weight memory node @@ -183,7 +197,7 @@ def mem_info_wgt_buffer(): # The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.instance().acc_scope) +@tvm.register_func(f"tvm.info.mem.{Environment.instance().acc_scope}") def mem_info_acc_buffer(): """Creates the information about the local.accumulator memory node @@ -193,9 +207,13 @@ def mem_info_acc_buffer(): Environment.instance() return tvm.ir.make_node( "MemoryInfo", - unit_bits=env.inp_bits, - max_simd_bits=env.DIM, - max_num_bits=int(env.ACC_ROWS * env.DIM * env.inp_bits), + unit_bits=Environment.instance().inp_bits, + max_simd_bits=Environment.instance().DIM, + max_num_bits=int( + Environment.instance().ACC_ROWS + * Environment.instance().DIM + * Environment.instance().inp_bits + ), # head_address=tvm.runtime.const(env.OUT_ACC_BASE_ADDRESS, "uint32"), head_address=None, ) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index ac98b2c2e738..56e8e61b646d 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -22,6 +22,8 @@ """ from __future__ import absolute_import as _abs +import re +from typing import List, Tuple, Dict, Callable from .intrin import ( gemm, gemm_cisc, @@ -30,8 +32,6 @@ add_tensorize, add_mvout_tensorize, ) -import re -from typing import List, Tuple, Dict, Callable from .utils import counters @@ -67,17 +67,17 @@ def init_overwrite( Args: batch (int, optional): Batch size. Defaults to 1. dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. - max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64. - inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8". - wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8". - acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32". - acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. - bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. - bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. - debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False. - enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty. - supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False. + max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. + inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini weight scratchpad. Defaults to "int8". + acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". + acc_rows (int, optional): Rows of the accumulator. Defaults to 4096. + bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. + bank_num (int, optional): Banks for the scratchpad. Defaults to 4. + debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. 
+            supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False.
         """
         inst = Environment.instance()
         inst.init(
@@ -129,17 +129,17 @@ def init(
         Args:
             batch (int, optional): Batch size. Defaults to 1.
             dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32.
-            max_bytes (int, optional): Used to calculate the maximum amount of columns one mvin instruction can generate. Defaults to 64.
-            inp_dtype (str, optional): Type supported by the Gemmini scratchpad. Defaults to "int8".
-            wgt_dtype (str, optional): Type supported by the Gemmini "logical" weight scratchpad. Defaults to "int8".
-            acc_dtype (str, optional): Type supported by the Gemmini accumulator. Defaults to "int32".
+            max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64.
+            inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8".
+            wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. Defaults to "int8".
+            acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32".
             acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096.
             bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192.
             bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4.
-            debug (bool, optional): Adds debug of Gemmini counters to generated code. Defaults to False.
-            enabled_counters (dict, optional): Dictionary of enabled Gemmini counters for debug purposes. Defaults to empty.
-            supports_non_zero_padding (bool, optional): If Gemmini supports instructions with non-zero padding. Defaults to False.
-            use_experimental_qnn_add (bool, optional): Activate pattern matching for qnn.add. Defaults to False.
+            debug (bool, optional): Adds debug of Gemmini counters. Defaults to False.
+            enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty.
+            supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False.
+            use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False.
         """

         assert batch == 1, "Only batch size of 1 is currently supported"
@@ -190,7 +190,9 @@ def init(
         self.scr_scope = "local.scratchpad"
         self.acc_scope = "local.accumulator"

-        # TODO (FP): check this scratchpad_weight. Actually, only one scratchpad should exist, but we do this logical partition to correctly manage the pointers to the buffers stored in this memories. Should see how we can fix this in the future.
+        # Actually, only one scratchpad should exist.
+        # But we do this logical partition to correctly manage the pointers to the buffers stored in these memories.
+        # Should see how we can fix this in the future.
self.scr_wgt_scope = "local.scratchpad_weight" self.A_mvin = "A_mvin" diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 0d84e3039ffe..df3a9bfe9bce 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -20,14 +20,14 @@ **Author**: `Federico Peccia `_ """ -import numpy as np import pathlib -from .environment import Environment -from six.moves import range from typing import List +import numpy as np +from six.moves import range +from .environment import Environment -env = Environment.instance() +ENV = Environment.instance() def create_header_file( @@ -56,42 +56,42 @@ def create_header_file( raw_source_path = file_path.with_suffix(".c").resolve() if tensor_data.dtype == np.float32: - type = "float" + datatype = "float" align = 32 elif tensor_data.dtype == np.int8: - type = "int8_t" + datatype = "int8_t" align = 16 elif tensor_data.dtype == np.uint8: - type = "uint8_t" + datatype = "uint8_t" align = 16 elif tensor_data.dtype == np.uint32: - type = "uint32_t" + datatype = "uint32_t" align = 16 else: - assert False, "Type %s is not supported!" % tensor_data.dtype + assert False, f"Type {tensor_data.dtype} is not supported!" with open(raw_header_path, "a+") as header_file: header_file.write( f"#define {tensor_name}_len {tensor_data.size}\n" - + f"extern {type} {tensor_name}[{tensor_name}_len];\n" + + f"extern {datatype} {tensor_name}[{tensor_name}_len];\n" ) if not raw_source_path.is_file(): with open(raw_source_path, "a+") as source_file: - source_file.write(f"#include \n") + source_file.write("#include \n") with open(raw_source_path, "a+") as source_file: source_file.write( - f'{type} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' + f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' if section - else f"{type} {tensor_name}[] __attribute__((aligned({align}))) = {{" + else f"{datatype} {tensor_name}[] __attribute__((aligned({align}))) = {{" ) data_hexstr = tensor_data.tobytes().hex() flatten = tensor_data.flatten() - if tensor_data.dtype == np.float32 or tensor_data.dtype == np.uint32: - for i in range(0, len(flatten)): - source_file.write(f"{flatten[i]},") + if tensor_data.dtype in (np.float32, np.uint32): + for element in flatten: + source_file.write(f"{element},") source_file.write("};\n\n") else: for i in range(0, len(data_hexstr), 2): @@ -110,20 +110,20 @@ def create_header_file( if debug: source_file.write("/*\n") for n in range(tensor_data.shape[0]): - for ch in range(tensor_data.shape[3]): - source_file.write("Channel %i:\n" % ch) + for i_ch in range(tensor_data.shape[3]): + source_file.write(f"Channel {i_ch}:\n") for row in range(tensor_data.shape[1]): for col in range(tensor_data.shape[2]): - source_file.write(f"{tensor_data[n][row][col][ch]}\t") + source_file.write(f"{tensor_data[n][row][col][i_ch]}\t") source_file.write("\n") source_file.write("*/\n") if weights is not None: source_file.write("/*\n") for o_ch in range(weights.shape[3]): - source_file.write("Output channel %i:\n" % o_ch) + source_file.write(f"Output channel {o_ch}:\n") for i_ch in range(weights.shape[2]): - source_file.write("Input channel %i:\n" % i_ch) + source_file.write(f"Input channel {i_ch}:\n") for row in range(weights.shape[0]): for col in range(weights.shape[1]): source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") @@ -158,14 +158,14 @@ def get_greater_div(x, limit: int = None): int: Greater divisor """ - limit = env.DIM if limit == None else limit 
+ limit = ENV.DIM if limit is None else limit if isinstance(x, int): elements = [x] elif isinstance(x, list): elements = x else: - assert False, "type of x not supported!" + assert False, "datatype of x not supported!" divisors = [] for element in elements: diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 51a0fa7a643e..d8809726555a 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -22,16 +22,16 @@ from __future__ import absolute_import as _abs +from typing import List, Tuple import tvm from tvm import te -from typing import List, Tuple def gemm( env, - I: int, - K: int, - J: int, + dim_i: int, + dim_k: int, + dim_j: int, stride: int = 1, is_depthwise_conv2d: bool = True, mode: int = 1, @@ -41,9 +41,9 @@ def gemm( Args: env (Environment): Environment with configurations - I (int): output first axis dimension - K (int): reduction axis dimension - J (int): output second axis dimension + dim_i (int): output first axis dimension + dim_k (int): reduction axis dimension + dim_j (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. @@ -53,13 +53,13 @@ def gemm( TensorIntrin: gemm tensor intrinsic """ - # TODO (FP): add assertions here for I, K and J? + # TODO (FP): add assertions here for dim_i, dim_k and dim_j? - wgt_shape = (K, J) + wgt_shape = (dim_k, dim_j) - inp_shape = (I, K) + inp_shape = (dim_i, dim_k) - out_shape = (I, J) + out_shape = (dim_i, dim_j) wgt = te.placeholder(wgt_shape, dtype=env.wgt_dtype, name=env.scr_wgt_scope) inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) @@ -125,7 +125,7 @@ def gemm( def intrin_func(ins, outs): """Matrix-matrix multiply intrinsic function""" - dinp, dwgt, dbias = ins + dinp, dwgt, _ = ins dout = outs[0] inp_base_address = tvm.runtime.const(env.INP_SCR_BASE_ADDRESS, "uint32") @@ -142,47 +142,47 @@ def _body(): inp_access_ptr = dinp.access_ptr("r", "uint32") - A_access_ptr = inp_base_address + inp_access_ptr - BD_access_ptr = ( + a_access_ptr = inp_base_address + inp_access_ptr + bd_access_ptr = ( wgt_base_address + wgt_access_ptr if mode == env.WEIGHT_STATIONARY else garbage ) - C_access_ptr = out_base_address + out_access_ptr - DB_access_ptr = ( + c_access_ptr = out_base_address + out_access_ptr + db_access_ptr = ( garbage if mode == env.WEIGHT_STATIONARY else wgt_base_address + wgt_access_ptr ) - A_cols = dinp.shape[1] - A_rows = dinp.shape[0] - BD_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] - BD_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] - C_cols = dout.shape[1] - C_rows = dout.shape[0] - DB_cols = C_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] - DB_rows = C_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] + a_cols = dinp.shape[1] + a_rows = dinp.shape[0] + bd_cols = dwgt.shape[1] if mode == env.WEIGHT_STATIONARY else dout.shape[1] + bd_rows = dwgt.shape[0] if mode == env.WEIGHT_STATIONARY else dout.shape[0] + c_cols = dout.shape[1] + c_rows = dout.shape[0] + db_cols = c_cols if mode == env.WEIGHT_STATIONARY else dwgt.shape[1] + db_rows = c_rows if mode == env.WEIGHT_STATIONARY else dwgt.shape[0] with irb.if_scope(accum_patch == 0): irb.emit( tvm.tir.call_extern( "", "gemmini_extended_preload", - BD_access_ptr, - 
C_access_ptr, - BD_cols, - BD_rows, - C_cols, - C_rows, + bd_access_ptr, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, ) ) irb.emit( tvm.tir.call_extern( "", "gemmini_extended_compute_preloaded", - A_access_ptr, - DB_access_ptr, - A_cols, - A_rows, - DB_cols, - DB_rows, + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, ) ) with irb.else_scope(): @@ -191,23 +191,23 @@ def _body(): "", "gemmini_extended_preload", garbage, - C_access_ptr, - BD_cols, - BD_rows, - C_cols, - C_rows, + c_access_ptr, + bd_cols, + bd_rows, + c_cols, + c_rows, ) ) irb.emit( tvm.tir.call_extern( "", "gemmini_extended_compute_accumulated", - A_access_ptr, - DB_access_ptr, - A_cols, - A_rows, - DB_cols, - DB_rows, + a_access_ptr, + db_access_ptr, + a_cols, + a_rows, + db_cols, + db_rows, ) ) return irb.get() @@ -258,20 +258,20 @@ def gemm_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - K = wgt.shape[0] - J = wgt.shape[1] - I = inp.shape[0] + dim_k = wgt.shape[0] + dim_j = wgt.shape[1] + dim_i = inp.shape[0] - k_ = te.reduce_axis((0, K), name="K") + k_reduce = te.reduce_axis((0, dim_k), name="dim_k") - output_shape = (I, J) + output_shape = (dim_i, dim_j) out = te.compute( output_shape, lambda x_, y_: te.sum( - inp[x_, k_].astype(env.inp_dtype) * wgt[k_, y_].astype(env.inp_dtype) + inp[x_, k_reduce].astype(env.inp_dtype) * wgt[k_reduce, y_].astype(env.inp_dtype) + bias[y_].astype(env.inp_dtype), - axis=[k_], + axis=[k_reduce], ), ) @@ -400,25 +400,25 @@ def conv2d_cisc( bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) wgt.shape[3] - KH = wgt.shape[0] - KW = wgt.shape[1] + k_h = wgt.shape[0] + k_w = wgt.shape[1] inp.shape[0] inp.shape[1] inp.shape[2] - IC = inp.shape[3] + i_c = inp.shape[3] - ric = te.reduce_axis((0, IC), name="ric") - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - HSTR = strides[0] - WSTR = strides[1] + hstr = strides[0] + wstr = strides[1] out = te.compute( out_shape, lambda b_o, i, j, c_o: te.sum( - inp[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) + inp[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(env.inp_dtype) * wgt[rkh, rkw, ric, c_o].astype(env.inp_dtype) + bias[c_o].astype(env.inp_dtype), axis=[rkh, rkw, ric], @@ -572,24 +572,24 @@ def dw_conv2d_cisc( bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) wgt.shape[0] - KH = wgt.shape[1] - KW = wgt.shape[2] + k_h = wgt.shape[1] + k_w = wgt.shape[2] inp.shape[0] inp.shape[1] inp.shape[2] inp.shape[3] - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - HSTR = strides[0] - WSTR = strides[1] + hstr = strides[0] + wstr = strides[1] out = te.compute( out_shape, lambda b_o, i, j, c_o: te.sum( - inp[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) + inp[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(env.inp_dtype) * wgt[c_o, rkh, rkw].astype(env.inp_dtype) + bias[c_o].astype(env.inp_dtype), axis=[rkh, rkw], diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index 083268d9c469..f924f1dfe716 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ 
-20,6 +20,7 @@ **Author**: `Federico Peccia `_ """ +from typing import Tuple import tvm # type: ignore from tvm import relay from tvm import ir @@ -29,11 +30,7 @@ from tvm.relay.op import _make # type: ignore -from .pattern_table import * # type: ignore - -from .environment import Environment - -env = Environment.instance() +from .pattern_table import AddParams, CONV2DParams, GEMMParams, MaxPoolParams # type: ignore def gemmini_gemm( @@ -464,9 +461,7 @@ def callback( class LegalizeAdd: """This is the pass that wraps the AddRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(AddRewriter(), func) mod.update_func(global_var, func) @@ -480,9 +475,7 @@ def __call__(self, *args, **kwargs): class LegalizeMaxPool2D: """This is the pass that wraps the MAXPOOL2DRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(MAXPOOL2DRewriter(), func) mod.update_func(global_var, func) @@ -496,9 +489,7 @@ def __call__(self, *args, **kwargs): class LegalizeGEMM: """This is the pass that wraps the GEMMRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(GEMMRewriter(), func) mod.update_func(global_var, func) @@ -512,9 +503,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2D: """This is the pass that wraps the CONV2DRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DRewriter(), func) mod.update_func(global_var, func) @@ -528,9 +517,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2DExternalPad: """This is the pass that wraps the CONV2DExternalPadRewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DExternalPadRewriter(), func) mod.update_func(global_var, func) @@ -544,9 +531,7 @@ def __call__(self, *args, **kwargs): class LegalizeCONV2DExternalPadAndRelu6: """This is the pass that wraps the CONV2DExternalPadAndRelu6Rewriter""" - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: for global_var, func in mod.functions.items(): func = rewrite(CONV2DExternalPadAndRelu6Rewriter(), func) mod.update_func(global_var, func) @@ -563,9 +548,7 @@ class LegalizeGemmini: operations. """ - def transform_module( - self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext - ) -> tvm.ir.IRModule: + def transform_module(self, mod: tvm.ir.IRModule, _) -> tvm.ir.IRModule: """This is the method that replaces the operations with hardware/codegen supported operations. 
""" diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 9faecbe49d07..46e29ad6ffa6 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -26,13 +26,12 @@ from tvm import relay from tvm.relay.op.contrib.register import register_pattern_table # type: ignore from tvm.relay.dataflow_pattern import is_constant, wildcard, is_op -from .utils import * - from tvm.relay.frontend.common import infer_shape as _infer_shape +from .utils import QDenseArgs, RequantArgs, BinaryElementwiseArgs, QConv2DArgs from .environment import Environment -env = Environment.instance() +ENV = Environment.instance() class GEMMParams: @@ -84,7 +83,7 @@ class AddParams: activation_map = {"clip": "CLIP"} def __init__(self, func_body: tvm.relay.Function): - if str(func_body.op) in self.activation_map.keys(): + if str(func_body.op) in self.activation_map: add_op = func_body.args[0] else: add_op = func_body @@ -421,6 +420,11 @@ def make_maxpool_pattern() -> tvm.relay.dataflow_pattern.DFPattern: @register_pattern_table("gemmini") def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + """Declares Gemminis pattern table + + Returns: + List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: List of pattern, callable tuples + """ pattern_table_filters = [] pattern_table_filters.append( @@ -452,7 +456,7 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal ) ) - if env.use_experimental_qnn_add: + if ENV.use_experimental_qnn_add: pattern_table_filters.append( ( AddParams.composite_name, diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 22146175ce5a..eddd9012ae07 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -21,10 +21,10 @@ **Author**: `Federico Peccia `_ """ -import tvm import ast -from tvm.tir.ir_builder import IRBuilder from typing import Dict +import tvm +from tvm.tir.ir_builder import IRBuilder from .environment import Environment @@ -40,12 +40,12 @@ def _get_counters(irb: IRBuilder): irb.emit(tvm.tir.call_extern("", "counter_snapshot_take")) irb.emit(tvm.tir.call_extern("", "printf", "Counter values:\\r\\n")) counter_vars = [] - for i, (key, value) in enumerate(env.enabled_counters.items()): + for i, (_, value) in enumerate(env.enabled_counters.items()): counter_var = irb.let( value.lower() + "_var", tvm.tir.call_extern("uint32", "counter_read", i) ) counter_vars.append(counter_var) - irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%s," % value))) + irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm(f"{value},"))) irb.emit(tvm.tir.call_extern("", "printf", "\\r\\n")) for c in counter_vars: irb.emit(tvm.tir.call_extern("", "printf", tvm.tir.StringImm("%lu,"), c)) @@ -58,7 +58,7 @@ def _configure_timers(irb: IRBuilder): Args: irb (IRBuilder): IRBuilder """ - for i, (key, value) in enumerate(env.enabled_counters.items()): + for i, (key, _) in enumerate(env.enabled_counters.items()): irb.emit(tvm.tir.call_extern("", "counter_configure", i, key)) @@ -303,7 +303,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("A mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -326,7 +326,7 @@ def _inject_copy(src, dst, pad_before, 
pad_after, pad_value): return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.A_mvin, _inject_copy) @@ -347,7 +347,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("A mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() # TODO (FP): check this pointers types again! @@ -369,7 +369,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.A_mvin + "_t", _inject_copy) @@ -391,7 +391,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): wgt_base_address = tvm.runtime.const(env.WGT_SCR_BASE_ADDRESS, "int32") if dst.scope() == "global": raise RuntimeError("B mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -412,7 +412,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.B_mvin, _inject_copy) @@ -433,7 +433,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("B mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -454,7 +454,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.B_mvin + "_t", _inject_copy) @@ -475,7 +475,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("D mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -497,7 +497,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.D_mvin, _inject_copy) @@ -518,7 +518,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("D mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -540,7 +540,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.D_mvin + 
"_t", _inject_copy) @@ -561,7 +561,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if src.scope() == "global": raise RuntimeError("C mvout should have a local source") - elif dst.scope() == "global": + if dst.scope() == "global": # Store irb = tvm.tir.ir_builder.create() if len(dst.shape) == 1: @@ -586,7 +586,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvout, _inject_copy) @@ -607,7 +607,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if src.scope() == "global": raise RuntimeError("C mvout should have a local source") - elif dst.scope() == "global": + if dst.scope() == "global": # Store irb = tvm.tir.ir_builder.create() # TODO (FP): check this pointers types again! @@ -633,7 +633,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvout + "_t", _inject_copy) @@ -654,7 +654,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -676,7 +676,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin, _inject_copy) @@ -697,7 +697,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -719,7 +719,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin + "_t", _inject_copy) @@ -740,7 +740,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if len(src.shape) == 1: @@ -761,7 +761,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum, _inject_copy) @@ -782,7 +782,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _ = pad_value if dst.scope() == "global": raise RuntimeError("C mvin should have a local destination") - elif src.scope() == "global": + if src.scope() == "global": # Load irb = tvm.tir.ir_builder.create() if 
len(src.shape) == 1: @@ -803,6 +803,6 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): ) return irb.get() else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) + raise RuntimeError(f"Do not support copy {src.scope()}->{dst.scope()}") return tvm.tir.transform.InjectCopyIntrin(env.C_mvin_accum + "_t", _inject_copy) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index 0be4afebbb9e..a561a01d6c32 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -31,7 +31,7 @@ from tvm.contrib.gemmini.helpers import get_greater_div -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.add") @@ -67,7 +67,7 @@ def add( # Derive shapes oshape = topi.utils.get_const_tuple(ifm1.shape) - tensor_type = env.inp_dtype + tensor_type = ENV.inp_dtype ofm_offset_stage = te.compute( oshape, @@ -130,12 +130,10 @@ def schedule_add( ifm2, ofm_offset_op = ifm2_op.op.input_tensors ofm_offset_op.op.input_tensors[0] - b, x, y, c = sch[add_stage].op.axis - # Prepare the scope of each buffer - cifm1 = sch.cache_read(ifm1, env.acc_scope, [add_stage]) - sch[ifm2_op].set_scope(env.acc_scope) - sch[ofm_offset_op].set_scope(env.acc_scope) + cifm1 = sch.cache_read(ifm1, ENV.acc_scope, [add_stage]) + sch[ifm2_op].set_scope(ENV.acc_scope) + sch[ofm_offset_op].set_scope(ENV.acc_scope) # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) @@ -150,23 +148,23 @@ def schedule_add( sch[ofm_offset_op].compute_at(sch[add_stage], y_o) # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) - cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=env.DIM) + cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=ENV.DIM) cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( - sch[cifm1].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[cifm1].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[cifm1].reorder(cifm1_ax_0_1, cifm1_ax_1_1, cifm1_ax_0_2, cifm1_ax_1_2) - cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=env.DIM) + cifm2_ax_0_1, cifm2_ax_0_2 = sch[ifm2_op].split(sch[ifm2_op].op.axis[2], factor=ENV.DIM) cifm2_ax_1_1, cifm2_ax_1_2 = sch[ifm2_op].split( - sch[ifm2_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[ifm2_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[ifm2_op].reorder(cifm2_ax_0_1, cifm2_ax_1_1, cifm2_ax_0_2, cifm2_ax_1_2) cofm_offset_ax_0_1, cofm_offset_ax_0_2 = sch[ofm_offset_op].split( - sch[ofm_offset_op].op.axis[2], factor=env.DIM + sch[ofm_offset_op].op.axis[2], factor=ENV.DIM ) cofm_offset_ax_1_1, cofm_offset_ax_1_2 = sch[ofm_offset_op].split( - sch[ofm_offset_op].op.axis[3], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[ofm_offset_op].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[ofm_offset_op].reorder( cofm_offset_ax_0_1, cofm_offset_ax_1_1, cofm_offset_ax_0_2, cofm_offset_ax_1_2 @@ -175,26 +173,26 @@ def schedule_add( # Set pragmas to insert mvin instructions oshape = (x_factor, y_factor) if x_factor == 1: - sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin + "_t") - sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum + "_t") + 
sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin + "_t") + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum + "_t") else: - sch[cifm1].pragma(cifm1_ax_0_2, env.C_mvin) - sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, env.C_mvin_accum) + sch[cifm1].pragma(cifm1_ax_0_2, ENV.C_mvin) + sch[ofm_offset_op].pragma(cofm_offset_ax_0_2, ENV.C_mvin_accum) # Tensorize - sch[ifm2_op].tensorize(cifm2_ax_0_2, env.add_tensorize(oshape)) - sch[add_stage].tensorize(x_i, env.add_mvout_tensorize(oshape)) + sch[ifm2_op].tensorize(cifm2_ax_0_2, ENV.add_tensorize(oshape)) + sch[add_stage].tensorize(x_i, ENV.add_mvout_tensorize(oshape)) # Create configuration dictionary config_dict = {} config_dict["A_size"] = int(ifm1.shape[3]) config_dict["B_size"] = int(ifm2.shape[3]) config_dict["C_size"] = int(output.shape[3]) - config_dict["A_private_stride"] = env.DIM - config_dict["B_private_stride"] = env.DIM + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM config_dict["execution_stride"] = 1 config_dict["activation"] = 0 - config_dict["mode"] = env.WEIGHT_STATIONARY + config_dict["mode"] = ENV.WEIGHT_STATIONARY config_dict["max_pixels_per_row"] = 1 config_dict["ifm1_scale"] = float(add_stage.op.attrs["ifm1_scale"]) config_dict["ifm2_scale"] = float(add_stage.op.attrs["ifm2_scale"]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index fdb9213aeb4a..f82bea64a51d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -29,7 +29,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.conv2d_cisc") @@ -75,32 +75,32 @@ def conv2d_cisc( orig_data.shape[1] == orig_data.shape[2] ), "GEMMINIs Conv2d CISC schedule only supports square inputs!" 
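Both the add schedule above and the GEMM and convolution schedules in this series lean on the same TE buffering pattern: stage an operand into a faster memory scope with cache_read, tile by the array dimension, and attach the staged copy at the tile loop. A minimal, self-contained sketch of that pattern; the generic "local" scope and DIM = 16 are illustrative stand-ins for the Gemmini scopes and systolic-array size, which only exist once the environment registers them:

    import tvm
    from tvm import te

    DIM = 16  # illustrative stand-in for the systolic-array dimension

    x = te.placeholder((1, 8, 32, 64), dtype="int8", name="x")
    y = te.compute(x.shape, lambda b, h, w, c: x[b, h, w, c] + 1, name="y")
    sch = te.create_schedule(y.op)

    # Stage x into a faster scope for the consumer y.
    cx = sch.cache_read(x, "local", [y])

    # Tile the two innermost axes by DIM and attach the staged copy at the
    # tile loop, so each copy matches one hardware-sized block.
    b, h, w, c = sch[y].op.axis
    w_o, w_i = sch[y].split(w, factor=DIM)
    c_o, c_i = sch[y].split(c, factor=DIM)
    sch[y].reorder(b, h, w_o, c_o, w_i, c_i)
    sch[cx].compute_at(sch[y], c_o)

    print(tvm.lower(sch, [x, y], simple_mode=True))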
- OC = kernel.shape[3] - KH = kernel.shape[0] - KW = kernel.shape[1] + o_c = kernel.shape[3] + k_h = kernel.shape[0] + k_w = kernel.shape[1] - N = orig_data.shape[0] - IH = orig_data.shape[1] - IW = orig_data.shape[2] - IC = orig_data.shape[3] + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] + i_c = orig_data.shape[3] - HSTR = strides[0] - WSTR = strides[1] - TOP_PAD = padding[0] - LEFT_PAD = padding[1] - BOTTOM_PAD = padding[2] - RIGHT_PAD = padding[3] + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] - OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) - OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) - ric = te.reduce_axis((0, IC), name="ric") - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + ric = te.reduce_axis((0, i_c), name="ric") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - oshape = (N, OH, OW, OC) + oshape = (n, o_h, o_w, o_c) - if len(set(padding)) == 1 and (env.supports_non_zero_padding or ifm_offset == 0): + if len(set(padding)) == 1 and (ENV.supports_non_zero_padding or ifm_offset == 0): # If the padding is the same for all borders, there is no need to use topi.nn.pad, # because Gemminis CISC instructions support equal padding data = orig_data @@ -108,8 +108,8 @@ def conv2d_cisc( # If not, then pad before calling Gemminis functions data = topi.nn.pad( orig_data, - [0, TOP_PAD, LEFT_PAD, 0], - [0, BOTTOM_PAD, RIGHT_PAD, 0], + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], pad_value=ifm_offset, name="pad_data", ) @@ -117,16 +117,16 @@ def conv2d_cisc( res = te.compute( oshape, lambda b_o, i, j, c_o: te.sum( - data[b_o, i * HSTR + rkh, j * WSTR + rkw, ric].astype(env.inp_dtype) - * kernel[rkh, rkw, ric, c_o].astype(env.inp_dtype) - + bias[c_o].astype(env.inp_dtype), + data[b_o, i * hstr + rkh, j * wstr + rkw, ric].astype(ENV.inp_dtype) + * kernel[rkh, rkw, ric, c_o].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), axis=[rkh, rkw, ric], ), name="res", tag="conv2d", attrs={ "activation": activation, - "strides": [HSTR, WSTR], + "strides": [hstr, wstr], "padding": padding, "padding_value": ifm_offset, "scale": gemmini_scale, @@ -138,9 +138,9 @@ def conv2d_cisc( ) cfg.add_flop( - np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW * IC + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w * i_c + np.prod(topi.utils.get_const_tuple(oshape)) - * (KH * KW * IC - 1) # Multiplications and additions needed + * (k_h * k_w * i_c - 1) # Multiplications and additions needed + np.prod( # Additions needed topi.utils.get_const_tuple(oshape) ) # Output scaling multiplications @@ -202,28 +202,27 @@ def _traverse(op): else: pad_data = data - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis - rkh, rkw, ric = sch[conv2d_stage].op.reduce_axis + x_bo, _, _, _ = sch[conv2d_stage].op.axis x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) axis_for_start = x_bo_o # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
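The o_h / o_w computation above is the standard convolution output-size formula. As a quick standalone check of the arithmetic (plain Python, no TVM needed):

    def conv_out_dim(i, k, pad_a, pad_b, stride):
        # o = (i + pad_a + pad_b - k) // stride + 1, as in the o_h / o_w lines above
        return (i + pad_a + pad_b - k) // stride + 1

    # 224x224 input, 3x3 kernel, stride 2, padding 1 on every border -> 112x112
    assert conv_out_dim(224, 3, 1, 1, 2) == 112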
- padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] - padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 # Apply tensorization sch[conv2d_stage].tensorize( x_bo_i, - env.conv2d_cisc( + ENV.conv2d_cisc( pad_data.shape, kernel.shape, bias.shape, conv2d_stage.shape, conv2d_stage.op.attrs["strides"], - padding_for_C_code, - padding_value_for_C_code, + padding, + padding_value, conv2d_stage.op.attrs["activation"], conv2d_stage.op.attrs["scale"], conv2d_stage.op.attrs["pool_size"], diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d37e1922027d..d52557d8b703 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -31,7 +31,7 @@ from tvm.contrib.gemmini.environment import Environment from tvm.contrib.gemmini.helpers import get_greater_div -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.gemm") @@ -66,7 +66,7 @@ def gemm( bias_stage = te.compute( oshape, - lambda x_o, y_o: bias[y_o].astype(env.inp_dtype), + lambda x_o, y_o: bias[y_o].astype(ENV.inp_dtype), name="bias.local.accumulator", tag="bias_add", ) @@ -74,8 +74,8 @@ def gemm( res = te.compute( oshape, lambda x_o, y_o: te.sum( - data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) - + bias_stage[x_o, y_o].astype(env.inp_dtype), + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias_stage[x_o, y_o].astype(ENV.inp_dtype), axis=[k_o], ), name="res", @@ -127,8 +127,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(data.shape[0])) - if (data.shape[0] >= env.DIM) - else ax.size[-1] <= env.DIM + if (data.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -139,8 +139,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(weight.shape[1])) - if (weight.shape[1] >= env.DIM) - else ax.size[-1] <= env.DIM + if (weight.shape[1] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -151,8 +151,8 @@ def schedule_gemm( policy="power2", filter=lambda ax: ( ax.size[-1] == get_greater_div(int(weight.shape[0])) - if (weight.shape[0] >= env.DIM) - else ax.size[-1] <= env.DIM + if (weight.shape[0] >= ENV.DIM) + else ax.size[-1] <= ENV.DIM ), ) @@ -167,7 +167,7 @@ def schedule_gemm( # WS/OS # 0: Gemmini will be configured as output stationary # 1: Gemmini will be configured as weight stationary - cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) # mvout_big_block # False: generate mvout instructions moving as maximum DIM columns # True: generate mvout instructions moving more than DIM columns @@ -180,14 +180,14 @@ def schedule_gemm( cfg["accumulate_multiple_patches"] = OtherOptionEntity(0) cfg["exchange_axis"] = OtherOptionEntity(False) cfg["mvout_big_block"] = OtherOptionEntity(True) - cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) ###### space definition end ###### - cdata = sch.cache_read(data, env.scr_scope, [dense_stage]) - cweight = sch.cache_read(weight, env.scr_wgt_scope, 
[dense_stage]) - dense_stage_acc = sch.cache_write(output, env.acc_scope) - sch[bias_op].set_scope(env.acc_scope) + cdata = sch.cache_read(data, ENV.scr_scope, [dense_stage]) + cweight = sch.cache_read(weight, ENV.scr_wgt_scope, [dense_stage]) + dense_stage_acc = sch.cache_write(output, ENV.acc_scope) + sch[bias_op].set_scope(ENV.acc_scope) (x_, y_) = sch[dense_stage_acc].op.axis (z_,) = sch[dense_stage_acc].op.reduce_axis @@ -215,8 +215,8 @@ def schedule_gemm( sch[dense_stage_acc].compute_at(sch[output], axis_for_output) # # Split loops to generate the inner dimensions specified by knob tile_zo - xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=env.DIM) - yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=env.DIM) + xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=ENV.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=ENV.DIM) b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_) # Apply the exchange_axis knob @@ -242,20 +242,20 @@ def schedule_gemm( if cfg["axis_for_cdata"].val == 0: assert ( cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] - <= env.INP_SCR_ROWS * env.DIM + <= ENV.INP_SCR_ROWS * ENV.DIM ), "Data matrix will not fit in scratchpad!" elif cfg["axis_for_cdata"].val == 1: assert ( - cfg["tile_xo"].size[2] * data.shape[1] <= env.INP_SCR_ROWS * env.DIM + cfg["tile_xo"].size[2] * data.shape[1] <= ENV.INP_SCR_ROWS * ENV.DIM ), "Data matrix will not fit in scratchpad!" if cfg["axis_for_cweight"].val == 0: assert ( cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] * weight.shape[0] - <= env.WGT_SCR_ROWS * env.DIM + <= ENV.WGT_SCR_ROWS * ENV.DIM ), "Weight matrix will not fit in scratchpad!" elif cfg["axis_for_cweight"].val == 1: assert ( - cfg["tile_yo"].size[2] * weight.shape[0] <= env.WGT_SCR_ROWS * env.DIM + cfg["tile_yo"].size[2] * weight.shape[0] <= ENV.WGT_SCR_ROWS * ENV.DIM ), "Weight matrix will not fit in scratchpad!" # And here we assert that there is enough place available in the accumulator @@ -265,12 +265,12 @@ def schedule_gemm( * cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] - <= env.ACC_ROWS * env.DIM + <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" elif cfg["accumulate_multiple_patches"].val == 1: assert ( cfg["tile_xo"].size[2] * cfg["tile_yo"].size[1] * cfg["tile_yo"].size[2] - <= env.ACC_ROWS * env.DIM + <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" 
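The asserts above all apply the same capacity rule: a staged tile fits if its rows times columns does not exceed the memory's row count times DIM. A rough standalone illustration; every number below is made up for the example and is not a Gemmini default:

    DIM = 16              # systolic-array dimension (illustrative)
    INP_SCR_ROWS = 4096   # scratchpad rows reserved for the input (illustrative)

    tile_rows, tile_cols = 64, 512            # footprint of one staged tile
    footprint = tile_rows * tile_cols         # elements the tile occupies
    capacity = INP_SCR_ROWS * DIM             # elements the scratchpad can hold
    assert footprint <= capacity, "Data matrix will not fit in scratchpad!"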
# Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs @@ -282,21 +282,21 @@ def schedule_gemm( ) # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN - cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=env.DIM) + cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=ENV.DIM) cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( - sch[cdata].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + sch[cdata].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM ) sch[cdata].reorder(cdata_ax_0_1, cdata_ax_1_1, cdata_ax_0_2, cdata_ax_1_2) - cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=env.DIM) + cweight_ax_0_1, cweight_ax_0_2 = sch[cweight].split(sch[cweight].op.axis[0], factor=ENV.DIM) cweight_ax_1_1, cweight_ax_1_2 = sch[cweight].split( - sch[cweight].op.axis[1], factor=env.MAX_BLOCK_LEN * env.DIM + sch[cweight].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM ) sch[cweight].reorder(cweight_ax_0_1, cweight_ax_1_1, cweight_ax_0_2, cweight_ax_1_2) - cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=env.DIM) + cbias_ax_0_1, cbias_ax_0_2 = sch[bias_op].split(sch[bias_op].op.axis[0], factor=ENV.DIM) cbias_ax_1_1, cbias_ax_1_2 = sch[bias_op].split( - sch[bias_op].op.axis[1], factor=env.MAX_BLOCK_LEN_ACC * env.DIM + sch[bias_op].op.axis[1], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM ) sch[bias_op].reorder(cbias_ax_0_1, cbias_ax_1_1, cbias_ax_0_2, cbias_ax_1_2) @@ -319,34 +319,34 @@ def schedule_gemm( fused_x = xi fused_y = yi - fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=env.DIM) + fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=ENV.DIM) fused_y_1, fused_y_2 = sch[output].split( - fused_y, factor=env.MAX_BLOCK_LEN * env.DIM if cfg["mvout_big_block"].val else env.DIM + fused_y, factor=ENV.MAX_BLOCK_LEN * ENV.DIM if cfg["mvout_big_block"].val else ENV.DIM ) sch[output].reorder(fused_x_1, fused_y_1, fused_x_2, fused_y_2) # Tag loops with pragmas, in order to insert the move in and move out instructions - sch[cweight].pragma(cweight_ax_0_2, env.B_mvin) + sch[cweight].pragma(cweight_ax_0_2, ENV.B_mvin) if data.shape[0] == 1 and weight.shape[1] > 1: - sch[cdata].pragma(cdata_ax_0_2, env.A_mvin + "_t") - sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin + "_t") - sch[output].pragma(fused_x_2, env.C_mvout + "_t") + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin + "_t") + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin + "_t") + sch[output].pragma(fused_x_2, ENV.C_mvout + "_t") else: - sch[cdata].pragma(cdata_ax_0_2, env.A_mvin) - sch[bias_op].pragma(cbias_ax_0_2, env.D_mvin) - sch[output].pragma(fused_x_2, env.C_mvout) + sch[cdata].pragma(cdata_ax_0_2, ENV.A_mvin) + sch[bias_op].pragma(cbias_ax_0_2, ENV.D_mvin) + sch[output].pragma(fused_x_2, ENV.C_mvout) # Apply tensorize - I = data.shape[0] if data.shape[0] < env.DIM else cfg["tile_xo"].size[-1] - K = weight.shape[0] if weight.shape[0] < env.DIM else cfg["tile_zo"].size[-1] - J = weight.shape[1] if weight.shape[1] < env.DIM else cfg["tile_yo"].size[-1] + dim_i = data.shape[0] if data.shape[0] < ENV.DIM else cfg["tile_xo"].size[-1] + dim_k = weight.shape[0] if weight.shape[0] < ENV.DIM else cfg["tile_zo"].size[-1] + dim_j = weight.shape[1] if weight.shape[1] < ENV.DIM else cfg["tile_yo"].size[-1] sch[dense_stage_acc].tensorize( xi_o if cfg["exchange_axis"].val else yi_o, - env.gemm( - I, - K, - J, + ENV.gemm( + dim_i, + dim_k, + 
dim_j, mode=cfg["WS/OS"].val, accum_patch=tvm.tir.IntImm("uint8", 0) if cfg["exchange_axis"].val or cfg["tile_zo"].size[1] != 1 @@ -359,8 +359,8 @@ def schedule_gemm( config_dict["A_size"] = int(data.shape[1]) config_dict["B_size"] = int(weight.shape[1]) config_dict["C_size"] = int(output.shape[1]) - config_dict["A_private_stride"] = env.DIM - config_dict["B_private_stride"] = env.DIM + config_dict["A_private_stride"] = ENV.DIM + config_dict["B_private_stride"] = ENV.DIM config_dict["execution_stride"] = 1 config_dict["activation"] = 0 config_dict["mode"] = cfg["WS/OS"].val diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 09097a003ce2..a3978fe5b63d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -30,7 +30,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.gemm_cisc") @@ -66,8 +66,8 @@ def gemm_cisc( res = te.compute( oshape, lambda x_o, y_o: te.sum( - data[x_o, k_o].astype(env.inp_dtype) * weight[k_o, y_o].astype(env.inp_dtype) - + bias[y_o].astype(env.inp_dtype), + data[x_o, k_o].astype(ENV.inp_dtype) * weight[k_o, y_o].astype(ENV.inp_dtype) + + bias[y_o].astype(ENV.inp_dtype), axis=[k_o], ), name="res", @@ -108,11 +108,11 @@ def schedule_gemm_cisc( # WS/OS # 0: Gemmini will be configured as output stationary # 1: Gemmini will be configured as weight stationary - cfg.define_knob("WS/OS", [env.WEIGHT_STATIONARY, env.OUTPUT_STATIONARY]) + cfg.define_knob("WS/OS", [ENV.WEIGHT_STATIONARY, ENV.OUTPUT_STATIONARY]) if cfg.is_fallback: - cfg["WS/OS"] = OtherOptionEntity(env.WEIGHT_STATIONARY) + cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) - x_, y_ = sch[dense_stage].op.axis + x_, _ = sch[dense_stage].op.axis x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0]) @@ -121,7 +121,7 @@ def schedule_gemm_cisc( # Apply tensorization sch[dense_stage].tensorize( x_i, - env.gemm_cisc( + ENV.gemm_cisc( data.shape, weight.shape, bias.shape, dense_stage.op.attrs["scale"], cfg["WS/OS"].val ), ) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index eedbc6b052b0..d15392efeb32 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -29,7 +29,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.depthwiseconv2d_cisc") @@ -68,58 +68,58 @@ def depthwise_conv2d_cisc( orig_data.shape[1] == orig_data.shape[2] ), "GEMMINIs depthwise conv2d CISC schedule only supports square inputs!" 
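The WS/OS knob in schedule_gemm_cisc above is ordinary AutoTVM usage: define a two-valued knob, let the tuner measure both dataflows, and pin one value on the fallback path. A sketch under the assumption that 1 and 0 stand in for the ENV stationary constants:

    from tvm import autotvm
    from tvm.autotvm.task.space import OtherOptionEntity

    WEIGHT_STATIONARY, OUTPUT_STATIONARY = 1, 0  # stand-ins for the ENV constants

    def define_dataflow_knob(cfg):
        # The tuner explores both dataflows; without tuning logs, fall back to
        # weight-stationary, mirroring the schedule above.
        cfg.define_knob("WS/OS", [WEIGHT_STATIONARY, OUTPUT_STATIONARY])
        if cfg.is_fallback:
            cfg["WS/OS"] = OtherOptionEntity(WEIGHT_STATIONARY)
        return cfg["WS/OS"].val

A template would call this as define_dataflow_knob(autotvm.get_config()) and branch its schedule on the returned value.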
- OC = orig_kernel.shape[0] - KH = orig_kernel.shape[1] - KW = orig_kernel.shape[2] + o_c = orig_kernel.shape[0] + k_h = orig_kernel.shape[1] + k_w = orig_kernel.shape[2] kernel = orig_kernel - N = orig_data.shape[0] - IH = orig_data.shape[1] - IW = orig_data.shape[2] + n = orig_data.shape[0] + i_h = orig_data.shape[1] + i_w = orig_data.shape[2] orig_data.shape[3] - HSTR = strides[0] - WSTR = strides[1] - TOP_PAD = padding[0] - LEFT_PAD = padding[1] - BOTTOM_PAD = padding[2] - RIGHT_PAD = padding[3] + hstr = strides[0] + wstr = strides[1] + top_pad = padding[0] + left_pad = padding[1] + bottom_pad = padding[2] + right_pad = padding[3] - OH = topi.utils.get_const_int(tvm.tir.div((IH + (TOP_PAD + BOTTOM_PAD) - KH), HSTR) + 1) - OW = topi.utils.get_const_int(tvm.tir.div((IW + (LEFT_PAD + RIGHT_PAD) - KW), WSTR) + 1) + o_h = topi.utils.get_const_int(tvm.tir.div((i_h + (top_pad + bottom_pad) - k_h), hstr) + 1) + o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) - if len(set(padding)) == 1 and env.supports_non_zero_padding: + if len(set(padding)) == 1 and ENV.supports_non_zero_padding: # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions data = topi.nn.pad( orig_data, - [0, TOP_PAD, LEFT_PAD, 0], - [0, BOTTOM_PAD, RIGHT_PAD, 0], + [0, top_pad, left_pad, 0], + [0, bottom_pad, right_pad, 0], pad_value=ifm_offset, name="pad_data", ) - rkh = te.reduce_axis((0, KH), name="rkh") - rkw = te.reduce_axis((0, KW), name="rkw") + rkh = te.reduce_axis((0, k_h), name="rkh") + rkw = te.reduce_axis((0, k_w), name="rkw") - oshape = (N, OH, OW, OC) + oshape = (n, o_h, o_w, o_c) res = te.compute( oshape, lambda b_o, i, j, c_o: te.sum( - data[b_o, i * HSTR + rkh, j * WSTR + rkw, c_o].astype(env.inp_dtype) - * kernel[c_o, rkh, rkw].astype(env.inp_dtype) - + bias[c_o].astype(env.inp_dtype), + data[b_o, i * hstr + rkh, j * wstr + rkw, c_o].astype(ENV.inp_dtype) + * kernel[c_o, rkh, rkw].astype(ENV.inp_dtype) + + bias[c_o].astype(ENV.inp_dtype), axis=[rkh, rkw], ), name="res", tag="conv2d", attrs={ "activation": activation, - "strides": [HSTR, WSTR], + "strides": [hstr, wstr], "padding": padding, "padding_value": ifm_offset, "scale": gemmini_scale, @@ -127,9 +127,9 @@ def depthwise_conv2d_cisc( ) cfg.add_flop( - np.prod(topi.utils.get_const_tuple(oshape)) * KH * KW + np.prod(topi.utils.get_const_tuple(oshape)) * k_h * k_w + np.prod(topi.utils.get_const_tuple(oshape)) - * (KH * KW - 1) # Multiplications and additions needed + * (k_h * k_w - 1) # Multiplications and additions needed + np.prod(topi.utils.get_const_tuple(oshape)) # Output scaling factor multiplications ) @@ -188,28 +188,27 @@ def _traverse(op): else: pad_data = data - x_bo, x_i, x_j, x_co = sch[conv2d_stage].op.axis - rkh, rkw = sch[conv2d_stage].op.reduce_axis + x_bo, _, _, _ = sch[conv2d_stage].op.axis x_bo_o, x_bo_i = sch[conv2d_stage].split(x_bo, factor=pad_data.shape[0]) axis_for_start = x_bo_o # If topi.nn.pad was added, its because the padding was not equal in all dimensions. 
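When the four borders differ, the schedule falls back to an explicit pad stage, as above. A standalone sketch of that fallback path; the shape and border values are invented for the example:

    import tvm
    from tvm import te, topi

    x = te.placeholder((1, 28, 28, 8), dtype="int8", name="x")
    top, left, bottom, right = 1, 0, 2, 1  # deliberately unequal borders

    # Pad H and W only; batch and channel get zero padding on both sides.
    padded = topi.nn.pad(
        x, [0, top, left, 0], [0, bottom, right, 0],
        pad_value=tvm.tir.const(0, "int8"), name="pad_data",
    )
    # padded has shape (1, 28 + 1 + 2, 28 + 0 + 1, 8) == (1, 31, 29, 8)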
- padding_for_C_code = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] - padding_value_for_C_code = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 + padding = conv2d_stage.op.attrs["padding"] if pad_data == data else [0, 0, 0, 0] + padding_value = conv2d_stage.op.attrs["padding_value"] if pad_data == data else 0 # Apply tensorization sch[conv2d_stage].tensorize( x_bo_i, - env.dw_conv2d_cisc( + ENV.dw_conv2d_cisc( pad_data.shape, kernel.shape, bias.shape, conv2d_stage.shape, conv2d_stage.op.attrs["strides"], - padding_for_C_code, - padding_value_for_C_code, + padding, + padding_value, conv2d_stage.op.attrs["activation"], conv2d_stage.op.attrs["scale"], ), diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 292743eff78c..c1c83f8956f7 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -27,7 +27,7 @@ from tvm.contrib.gemmini.environment import Environment -env = Environment.instance() +ENV = Environment.instance() @autotvm.register_topi_compute("contrib.gemmini.max_pool2d") @@ -61,7 +61,7 @@ def max_pool2d( def irb_builder_func(ins, outs): irb = tvm.tir.ir_builder.create() - if env.supports_non_zero_padding: + if ENV.supports_non_zero_padding: irb.emit( tvm.tir.call_extern( "", diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py index 6ca41c66d139..a37ef10428bf 100644 --- a/python/tvm/relay/backend/contrib/gemmini/op.py +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -24,24 +24,16 @@ from __future__ import absolute_import as _abs import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -from tvm.relay.op import op as reg from tvm.relay.op import strategy as _strategy -from tvm.relay.op.op import OpPattern, OpStrategy - +from tvm.relay.op.op import OpStrategy +from tvm.contrib.gemmini.environment import Environment from .gemmini_dense import gemm, schedule_gemm from .gemmini_dense_cisc import gemm_cisc, schedule_gemm_cisc from .gemmini_conv2d_cisc import conv2d_cisc, schedule_conv2d_cisc from .gemmini_depthwise_conv2d_cisc import depthwise_conv2d_cisc, schedule_depthwise_conv2d_cisc from .gemmini_add import add, schedule_add from .gemmini_max_pool2d import max_pool2d, schedule_max_pool2d -from tvm.contrib.gemmini.environment import Environment - -from tvm.topi.utils import const_vector, get_const_int, get_const_float -import numpy as np ENV = Environment.instance() From 6483608f761654cccd89ae41bdcd5830b9e62e8b Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 16:32:41 +0100 Subject: [PATCH 264/286] More lint improvements --- python/tvm/contrib/gemmini/build_module.py | 3 +- python/tvm/contrib/gemmini/environment.py | 12 ++--- python/tvm/contrib/gemmini/helpers.py | 30 ++++++------ python/tvm/contrib/gemmini/intrin.py | 14 +----- python/tvm/contrib/gemmini/pattern_table.py | 3 -- python/tvm/contrib/gemmini/transform.py | 25 ---------- .../backend/contrib/gemmini/gemmini_add.py | 1 - .../backend/contrib/gemmini/gemmini_dense.py | 47 +++++++++---------- .../contrib/gemmini/gemmini_dense_cisc.py | 4 +- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 1 - .../contrib/gemmini/gemmini_max_pool2d.py | 2 +- 11 files changed, 51 insertions(+), 91 deletions(-) diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index 
fc72a6b03af8..bf2ff9832309 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -76,7 +76,6 @@ def internal_build_configs(usmp_alg=""): Returns: dict: configurations """ - enable_usmp = False if usmp_alg == "" else True pass_list = [ (0, tvm.tir.transform.StorageFlatten(16)), (1, InjectAMVINIntrin()), @@ -101,7 +100,7 @@ def internal_build_configs(usmp_alg=""): "tir.add_lower_pass": pass_list, "tir.disable_vectorize": True, # "tir.CorrectGemminisScratchpadAndAccumulatorPointers": {"dim": env.DIM} - "tir.usmp.enable": enable_usmp, + "tir.usmp.enable": bool(usmp_alg), "tir.usmp.algorithm": usmp_alg, } diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 56e8e61b646d..37e18987883b 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -58,7 +58,7 @@ def init_overwrite( bank_rows=8192, bank_num=4, debug=False, - enabled_counters: Dict = {}, + enabled_counters: Dict = None, supports_non_zero_padding: bool = False, use_experimental_qnn_add: bool = False, ): @@ -75,7 +75,7 @@ def init_overwrite( bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. """ @@ -120,7 +120,7 @@ def init( bank_rows=4096, bank_num=4, debug=False, - enabled_counters: Dict = {}, + enabled_counters: Dict = None, supports_non_zero_padding: bool = False, use_experimental_qnn_add: bool = False, ): @@ -137,7 +137,7 @@ def init( bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to empty. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. """ @@ -212,11 +212,11 @@ def init( self.supports_non_zero_padding = supports_non_zero_padding self.use_experimental_qnn_add = use_experimental_qnn_add - self.enabled_counters = enabled_counters if bool(enabled_counters) else counters + self.enabled_counters = enabled_counters if enabled_counters is not None else counters # Check that all enabled counters exist in the actual counters from Gemmini for key, value in self.enabled_counters.items(): assert ( - self.enabled_counters[key] == counters[key] + value == counters[key] ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!" 
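The Dict = {} to Dict = None change above removes Python's shared-mutable-default pitfall: a default {} is created once at function definition time and then shared across every call. The usual idiom, shown in isolation:

    from typing import Dict, Optional

    def configure(enabled_counters: Optional[Dict] = None) -> Dict:
        # Resolve the default inside the call, so each invocation gets a fresh
        # dict instead of sharing one instance created at definition time.
        counters = enabled_counters if enabled_counters is not None else {}
        return counters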
def gemm( diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index df3a9bfe9bce..0bc3b4f8f386 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -50,6 +50,9 @@ def create_header_file( debug (bool, optional): enable debug. Defaults to False. weights (bool, optional): For debug purposes. Defaults to None. """ + if debug: + assert weights is not None, "When passing the debug flag as True, the weights parameter must be given!" + file_path = pathlib.Path(f"{output_path}/" + name).resolve() # Create header file with npy_data as a C array raw_header_path = file_path.with_suffix(".h").resolve() @@ -70,16 +73,16 @@ def create_header_file( else: assert False, f"Type {tensor_data.dtype} is not supported!" - with open(raw_header_path, "a+") as header_file: + with open(raw_header_path, "a+", encoding="utf8") as header_file: header_file.write( f"#define {tensor_name}_len {tensor_data.size}\n" + f"extern {datatype} {tensor_name}[{tensor_name}_len];\n" ) if not raw_source_path.is_file(): - with open(raw_source_path, "a+") as source_file: + with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write("#include \n") - with open(raw_source_path, "a+") as source_file: + with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write( f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' @@ -118,17 +121,16 @@ def create_header_file( source_file.write("\n") source_file.write("*/\n") - if weights is not None: - source_file.write("/*\n") - for o_ch in range(weights.shape[3]): - source_file.write(f"Output channel {o_ch}:\n") - for i_ch in range(weights.shape[2]): - source_file.write(f"Input channel {i_ch}:\n") - for row in range(weights.shape[0]): - for col in range(weights.shape[1]): - source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") - source_file.write("\n") - source_file.write("*/\n") + source_file.write("/*\n") + for o_ch in range(weights.shape[3]): + source_file.write(f"Output channel {o_ch}:\n") + for i_ch in range(weights.shape[2]): + source_file.write(f"Input channel {i_ch}:\n") + for row in range(weights.shape[0]): + for col in range(weights.shape[1]): + source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") + source_file.write("\n") + source_file.write("*/\n") def get_divisors(x: int) -> List[int]: diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index d8809726555a..65c27caf119c 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -392,20 +392,16 @@ def conv2d_cisc( Returns: TensorIntrin: CONV2D CISC tensor intrinsic """ - + _ = pool_dilation # TODO (FP): add assertions here for the supported parameters? 
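For reference, create_header_file above boils down to formatting a numpy array as a C definition plus a matching extern declaration. Roughly like this; the tensor name, section, and alignment below are invented for the illustration, not the exact generated output:

    import numpy as np

    data = np.arange(4, dtype=np.int8)
    tensor_name, section, align = "input_0", ".gemmini_data", 16  # invented values

    header = (
        f"#define {tensor_name}_len {data.size}\n"
        f"extern int8_t {tensor_name}[{tensor_name}_len];\n"
    )
    values = ", ".join(str(v) for v in data)
    source = (
        f'int8_t {tensor_name}[] __attribute__((section("{section}"), '
        f"aligned({align}))) = {{{values}}};\n"
    )
    print(header)
    print(source)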
wgt = te.placeholder(wgt_shape, dtype=env.inp_dtype, name=env.scr_wgt_scope) inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - wgt.shape[3] k_h = wgt.shape[0] k_w = wgt.shape[1] - inp.shape[0] - inp.shape[1] - inp.shape[2] i_c = inp.shape[3] ric = te.reduce_axis((0, i_c), name="ric") @@ -571,15 +567,9 @@ def dw_conv2d_cisc( inp = te.placeholder(inp_shape, dtype=env.inp_dtype, name=env.scr_scope) bias = te.placeholder(bias_shape, dtype=env.acc_dtype, name=env.scr_scope) - wgt.shape[0] k_h = wgt.shape[1] k_w = wgt.shape[2] - inp.shape[0] - inp.shape[1] - inp.shape[2] - inp.shape[3] - rkh = te.reduce_axis((0, k_h), name="rkh") rkw = te.reduce_axis((0, k_w), name="rkw") @@ -751,7 +741,7 @@ def add_tensorize(env, oshape: Tuple[int, ...]): def intrin_func(ins, outs): """Add intrinsic function""" difm1, difm2 = ins - outs[0] + _ = outs def _body(): irb = tvm.tir.ir_builder.create() diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 46e29ad6ffa6..37a93b8a51bb 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -248,9 +248,6 @@ class DepthwiseCONV2DParams(CONV2DParams): composite_name = "gemmini.depthwiseconv2d" activation_map = {"clip": "CLIP"} - def __init__(self, func_body: tvm.relay.Function): - super().__init__(func_body) - class MaxPoolParams: """ diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index eddd9012ae07..41455bb8d283 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -277,7 +277,6 @@ def _do_fold(stmt): return None def _ftransform(f, mod, ctx): - f.attrs["global_symbol"] return f.with_body( tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) ) @@ -295,8 +294,6 @@ def InjectAMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -339,8 +336,6 @@ def InjectAMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -382,8 +377,6 @@ def InjectBMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -425,8 +418,6 @@ def InjectBMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -467,8 +458,6 @@ def InjectDMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -510,8 +499,6 @@ def InjectDMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -553,8 +540,6 @@ def InjectCMVOUTIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... 
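All of the _inject_copy callbacks in this file share one skeleton: a schedule pragma names the copy, and InjectCopyIntrin hands each matched copy loop to the callback, which emits the accelerator's move instruction in its place. A stripped-down sketch; the extern name example_mvin is invented here, while the real passes emit gemmini_extended_mvin and friends:

    import tvm

    def inject_example_mvin(pragma_name):
        def _inject_copy(src, dst, pad_before, pad_after, pad_value):
            # src/dst are tvm.tir.Buffer views of the matched copy; padding is
            # unused in this sketch.
            _ = (pad_before, pad_after, pad_value)
            irb = tvm.tir.ir_builder.create()
            irb.emit(
                tvm.tir.call_extern(
                    "", "example_mvin",
                    src.access_ptr("r", "uint32"),
                    dst.access_ptr("w", "uint32"),
                )
            )
            return irb.get()

        return tvm.tir.transform.InjectCopyIntrin(pragma_name, _inject_copy)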
@@ -599,8 +584,6 @@ def InjectCMVOUTIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -646,8 +629,6 @@ def InjectCMVINIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -689,8 +670,6 @@ def InjectCMVINIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -732,8 +711,6 @@ def InjectCMVINAccumIntrin(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... @@ -774,8 +751,6 @@ def InjectCMVINAccumIntrinTransposed(): fpass : tvm.transform.Pass The pass """ - tvm.tir.indexdiv - tvm.tir.indexmod def _inject_copy(src, dst, pad_before, pad_after, pad_value): # TODO (FP): add padding support... diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index a561a01d6c32..f324b8f9732d 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -128,7 +128,6 @@ def schedule_add( ifm1, ifm2_op = add_stage.op.input_tensors ifm2, ofm_offset_op = ifm2_op.op.input_tensors - ofm_offset_op.op.input_tensors[0] # Prepare the scope of each buffer cifm1 = sch.cache_read(ifm1, ENV.acc_scope, [add_stage]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d52557d8b703..d43bdc8fc5b7 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -113,11 +113,10 @@ def schedule_gemm( sch = te.create_schedule([x.op for x in outs]) data, weight, bias_op = dense_stage.op.input_tensors - bias_op.op.input_tensors[0] ##### space definition begin ##### x, y = sch[dense_stage].op.axis - (z,) = sch[dense_stage].op.reduce_axis + (z_axis,) = sch[dense_stage].op.reduce_axis # TODO (FP): add limits for scratchpad and accumulator sizes perhaps? 
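The tile_* definitions in the surrounding hunk use AutoTVM's split primitive. A compact sketch of the same reduction-axis split on a toy GEMM; DIM = 16 is again an illustrative array size, not a Gemmini default:

    from tvm import autotvm, te

    DIM = 16  # illustrative array size

    A = te.placeholder((64, 64), name="A")
    B = te.placeholder((64, 64), name="B")
    k = te.reduce_axis((0, 64), name="k")
    C = te.compute((64, 64), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

    cfg = autotvm.get_config()
    # Three-way power-of-two split of the reduction axis; the filter keeps the
    # innermost factor within the systolic-array size, as the schedules above do.
    cfg.define_split(
        "tile_zo", k, num_outputs=3, policy="power2",
        filter=lambda ax: ax.size[-1] <= DIM,
    )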
cfg.define_split( @@ -146,7 +145,7 @@ def schedule_gemm( cfg.define_split( "tile_zo", - z, + z_axis, num_outputs=3, policy="power2", filter=lambda ax: ( @@ -188,26 +187,26 @@ def schedule_gemm( cweight = sch.cache_read(weight, ENV.scr_wgt_scope, [dense_stage]) dense_stage_acc = sch.cache_write(output, ENV.acc_scope) sch[bias_op].set_scope(ENV.acc_scope) - (x_, y_) = sch[dense_stage_acc].op.axis - (z_,) = sch[dense_stage_acc].op.reduce_axis + (x_axis, y_axis) = sch[dense_stage_acc].op.axis + (z_axis_int,) = sch[dense_stage_acc].op.reduce_axis # Split loops to generate the inner dimensions specified by knobs tile_xo and tile_yo - b_y, yo, yi = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) - b_x, xo, xi = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) + b_y, yo_axis, yi_axis = cfg["tile_yo"].apply(sch, output, sch[output].op.axis[1]) + b_x, xo_axis, xi_axis = cfg["tile_xo"].apply(sch, output, sch[output].op.axis[0]) # Apply the exchange_axis knob if cfg["exchange_axis"].val: - sch[output].reorder(b_y, b_x, yo, xo, yi, xi) + sch[output].reorder(b_y, b_x, yo_axis, xo_axis, yi_axis, xi_axis) else: - sch[output].reorder(b_x, b_y, xo, yo, xi, yi) + sch[output].reorder(b_x, b_y, xo_axis, yo_axis, xi_axis, yi_axis) # Apply the accumulate_multiple_patches knob if cfg["accumulate_multiple_patches"].val == 0: axis_for_output = b_x if cfg["exchange_axis"].val else b_y elif cfg["accumulate_multiple_patches"].val == 1: - axis_for_output = yo if cfg["exchange_axis"].val else xo + axis_for_output = yo_axis if cfg["exchange_axis"].val else xo_axis else: - axis_for_output = xo if cfg["exchange_axis"].val else yo + axis_for_output = xo_axis if cfg["exchange_axis"].val else yo_axis axis_gemm_start = b_y if cfg["exchange_axis"].val else b_x @@ -215,9 +214,9 @@ def schedule_gemm( sch[dense_stage_acc].compute_at(sch[output], axis_for_output) # # Split loops to generate the inner dimensions specified by knob tile_zo - xo_o, xi_o = sch[dense_stage_acc].split(x_, factor=ENV.DIM) - yo_o, yi_o = sch[dense_stage_acc].split(y_, factor=ENV.DIM) - b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_) + xo_o, xi_o = sch[dense_stage_acc].split(x_axis, factor=ENV.DIM) + yo_o, yi_o = sch[dense_stage_acc].split(y_axis, factor=ENV.DIM) + b_z, zo_o, zi_o = cfg["tile_zo"].apply(sch, dense_stage_acc, z_axis_int) # Apply the exchange_axis knob if cfg["exchange_axis"].val: @@ -302,22 +301,22 @@ def schedule_gemm( # Mvout preparation if cfg["exchange_axis"].val: - sch[output].reorder(yo, yi, xo, xi) + sch[output].reorder(yo_axis, yi_axis, xo_axis, xi_axis) else: - sch[output].reorder(xo, xi, yo, yi) + sch[output].reorder(xo_axis, xi_axis, yo_axis, yi_axis) if cfg["accumulate_multiple_patches"].val == 0: - fused_x = sch[output].fuse(xo, xi) - fused_y = sch[output].fuse(yo, yi) + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = sch[output].fuse(yo_axis, yi_axis) elif cfg["accumulate_multiple_patches"].val == 1: if cfg["exchange_axis"].val: - fused_x = sch[output].fuse(xo, xi) - fused_y = yi + fused_x = sch[output].fuse(xo_axis, xi_axis) + fused_y = yi_axis else: - fused_x = xi - fused_y = sch[output].fuse(yo, yi) + fused_x = xi_axis + fused_y = sch[output].fuse(yo_axis, yi_axis) else: - fused_x = xi - fused_y = yi + fused_x = xi_axis + fused_y = yi_axis fused_x_1, fused_x_2 = sch[output].split(fused_x, factor=ENV.DIM) fused_y_1, fused_y_2 = sch[output].split( diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py 
b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index a3978fe5b63d..8fdc12e5d8d2 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -112,9 +112,9 @@ def schedule_gemm_cisc( if cfg.is_fallback: cfg["WS/OS"] = OtherOptionEntity(ENV.WEIGHT_STATIONARY) - x_, _ = sch[dense_stage].op.axis + x_axis, _ = sch[dense_stage].op.axis - x_o, x_i = sch[dense_stage].split(x_, factor=data.shape[0]) + x_o, x_i = sch[dense_stage].split(x_axis, factor=data.shape[0]) axis_for_start = x_o diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index d15392efeb32..b25893bc9bd0 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -77,7 +77,6 @@ def depthwise_conv2d_cisc( n = orig_data.shape[0] i_h = orig_data.shape[1] i_w = orig_data.shape[2] - orig_data.shape[3] hstr = strides[0] wstr = strides[1] diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index c1c83f8956f7..bd71705be711 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -115,7 +115,7 @@ def irb_builder_func(ins, outs): return irb.get() res = te.extern( - (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" + (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" # pylint: disable=W0108 ) # TODO (FP): add correct FLOPS From dcb2845abb6f5c99414741a30c5f73679128c093 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 16:52:41 +0100 Subject: [PATCH 265/286] Fixed ALL pylint "Line too long" --- python/tvm/contrib/gemmini/environment.py | 33 ++++++++++++------- python/tvm/contrib/gemmini/helpers.py | 7 ++-- python/tvm/contrib/gemmini/intrin.py | 15 ++++++--- python/tvm/contrib/gemmini/legalize.py | 12 ++++--- python/tvm/contrib/gemmini/pattern_table.py | 3 +- .../backend/contrib/gemmini/gemmini_add.py | 6 ++-- .../backend/contrib/gemmini/gemmini_dense.py | 12 ++++--- .../gemmini/gemmini_depthwise_conv2d_cisc.py | 3 +- .../contrib/gemmini/gemmini_max_pool2d.py | 10 ++++-- 9 files changed, 69 insertions(+), 32 deletions(-) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 37e18987883b..1fa94acd9efe 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -75,9 +75,12 @@ def init_overwrite( bank_rows (int, optional): Rows of each bank in the scratchpad. Defaults to 8192. bank_num (int, optional): Banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. - supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. 
+ use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. """ inst = Environment.instance() inst.init( @@ -131,15 +134,20 @@ def init( dim (int, optional): Gemminis systolic array dimensions (DIM). Defaults to 32. max_bytes (int, optional): Limits maximum amount of mvin columns. Defaults to 64. inp_dtype (str, optional): Type of the Gemmini scratchpad. Defaults to "int8". - wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. Defaults to "int8". + wgt_dtype (str, optional): Type of the Gemmini "logical" weight scratchpad. + Defaults to "int8". acc_dtype (str, optional): Type of the Gemmini accumulator. Defaults to "int32". acc_rows (int, optional): Amount of rows of the accumulator. Defaults to 4096. - bank_rows (int, optional): Amount of rows of each bank in the scratchpad. Defaults to 8192. + bank_rows (int, optional): Amount of rows of each bank in the scratchpad. + Defaults to 8192. bank_num (int, optional): Amount of banks for the scratchpad. Defaults to 4. debug (bool, optional): Adds debug of Gemmini counters. Defaults to False. - enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. Defaults to None. - supports_non_zero_padding (bool, optional): Gemmini supports instructions with non-zero padding. Defaults to False. - use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. Defaults to False. + enabled_counters (dict, optional): Enabled Gemmini counters for debug purposes. + Defaults to None. + supports_non_zero_padding (bool, optional): Gemmini supports instructions + with non-zero padding. Defaults to False. + use_experimental_qnn_add (bool, optional): Pattern matching for qnn.add. + Defaults to False. """ assert batch == 1, "Only batch size of 1 is currently supported" @@ -191,7 +199,8 @@ def init( self.scr_scope = "local.scratchpad" self.acc_scope = "local.accumulator" # Actually, only one scratchpad should exist. - # But we do this logical partition to correctly manage the pointers to the buffers stored in this memories. + # But we do this logical partition to correctly manage the pointers + # to the buffers stored in this memories. # Should see how we can fix this in the future. self.scr_wgt_scope = "local.scratchpad_weight" @@ -217,7 +226,8 @@ def init( for key, value in self.enabled_counters.items(): assert ( value == counters[key] - ), f"Enabled counter with key {key} does not exist or has a different name in the actual counters dict!" + ), f"Enabled counter with key {key} does not exist \ + or has a different name in the actual counters dict!" def gemm( self, @@ -236,7 +246,8 @@ def gemm( K (int): reduction axis dimension J (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. - is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a + GEMM for a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. accum_patch (_type_, optional): Var of the reduction axis loop. Defaults to None. diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 0bc3b4f8f386..5ebf4c719a06 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -51,7 +51,9 @@ def create_header_file( weights (bool, optional): For debug purposes. Defaults to None. 
""" if debug: - assert weights is not None, "When passing the debug flag as True, the weights parameter must be given!" + assert ( + weights is not None + ), "When passing the debug flag as True, the weights parameter must be given!" file_path = pathlib.Path(f"{output_path}/" + name).resolve() # Create header file with npy_data as a C array @@ -85,7 +87,8 @@ def create_header_file( with open(raw_source_path, "a+", encoding="utf8") as source_file: source_file.write( - f'{datatype} {tensor_name}[] __attribute__((section("{section}"), aligned({align}))) = {{' + f'{datatype} {tensor_name}[] __attribute__((section("{section}"), \ + aligned({align}))) = {{' if section else f"{datatype} {tensor_name}[] __attribute__((aligned({align}))) = {{" ) diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 65c27caf119c..6aa20c2c8198 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -45,7 +45,8 @@ def gemm( dim_k (int): reduction axis dimension dim_j (int): output second axis dimension stride (int, optional): Stride, useful for convolutions. Defaults to 1. - is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for a depthwise convolution. Defaults to False. + is_depthwise_conv2d (bool, optional): Flag to explain if this is a GEMM for + a depthwise convolution. Defaults to False. mode (int, optional): Systolic array mode (WS=1,OS=0). Defaults to 1. accum_patch (tvm.tir.Var, optional): Var of the reduction axis loop. Defaults to None. @@ -137,7 +138,8 @@ def intrin_func(ins, outs): garbage = tvm.runtime.const(0xFFFFFFFF, "uint32") def _body(): - """Generate matrix-matrix multiply Gemmini instruction, without accumulate (garbage address in compute_preloaded)""" + """Generate matrix-matrix multiply Gemmini instruction, + without accumulate (garbage address in compute_preloaded)""" irb = tvm.tir.ir_builder.create() inp_access_ptr = dinp.access_ptr("r", "uint32") @@ -238,7 +240,8 @@ def gemm_cisc( scale: float, matmul_type: int, ): - """Matrix-matrix multiply intrinsic, inserts the calls to the function provided by the Gemmini developers to run matrix multiplication using the loop instructions + """Matrix-matrix multiply intrinsic, inserts the calls to the function + provided by the Gemmini developers to run matrix multiplication using the loop instructions Args: env (Environment): Environment with configurations @@ -371,7 +374,8 @@ def conv2d_cisc( pool_dilation: List[int], pool_padding: List[int], ): - """2D convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D convolution using the loop instructions + """2D convolution intrinsic, inserts the calls to the function provided + by the Gemmini developers to run a 2D convolution using the loop instructions Args: env (Environment): Environment with configurations @@ -543,7 +547,8 @@ def dw_conv2d_cisc( activation: int, scale: float, ): - """2D depthwise convolution intrinsic, inserts the calls to the function provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions + """2D depthwise convolution intrinsic, inserts the calls to the function + provided by the Gemmini developers to run a 2D depthwise convolution using the loop instructions Args: env (Environment): Environment with configurations diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index f924f1dfe716..4d74707e5acf 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ 
b/python/tvm/contrib/gemmini/legalize.py @@ -156,10 +156,14 @@ def gemmini_conv2d( pool_padding (tvm.relay.Expr): Pooling padding in each direction input_req_offset_out (tvm.relay.Expr): Requantize layer output offset has_activation (bool): Has activation? - activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input scaling factor - activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer input offset - activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output scaling factor - activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted and made more simple. Activation layer output offset + activation_scale_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input scaling factor + activation_offset_in (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer input offset + activation_scale_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output scaling factor + activation_offset_out (tvm.relay.Expr): TODO (FP): check if this can be deleted + and made more simple. Activation layer output offset Returns: tvm.relay.Call: Call to the contrib.gemmini.conv2d operator diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 37a93b8a51bb..ddb4b69acef9 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -420,7 +420,8 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal """Declares Gemminis pattern table Returns: - List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: List of pattern, callable tuples + List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Callable]]: + List of pattern, callable tuples """ pattern_table_filters = [] diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index f324b8f9732d..d019fe4cbc3e 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -134,7 +134,8 @@ def schedule_add( sch[ifm2_op].set_scope(ENV.acc_scope) sch[ofm_offset_op].set_scope(ENV.acc_scope) - # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) y_factor = get_greater_div(int(sch[add_stage].op.axis[3].dom.extent)) x_factor = get_greater_div(int(sch[add_stage].op.axis[2].dom.extent)) y_o, y_i = sch[add_stage].split(sch[add_stage].op.axis[3], factor=y_factor) @@ -146,7 +147,8 @@ def schedule_add( sch[ifm2_op].compute_at(sch[add_stage], y_o) sch[ofm_offset_op].compute_at(sch[add_stage], y_o) - # Split axis, taking into account the maximum value of rows and columns that can be moved into Gemminis accumulator (DIM) + # Split axis, taking into account the maximum value of rows and columns + # that can be moved into Gemminis accumulator (DIM) cifm1_ax_0_1, cifm1_ax_0_2 = sch[cifm1].split(sch[cifm1].op.axis[2], factor=ENV.DIM) cifm1_ax_1_1, cifm1_ax_1_2 = sch[cifm1].split( sch[cifm1].op.axis[3], factor=ENV.MAX_BLOCK_LEN_ACC * ENV.DIM diff --git 
a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index d43bdc8fc5b7..dbb7d12f7da5 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -237,7 +237,8 @@ def schedule_gemm( # Compute the move of the bias in the correct loop sch[bias_op].compute_at(sch[output], axis_for_output) - # We assert here that the mvin of data does not use more space than the available one in the scratchpad + # We assert here that the mvin of data does not use more space + # than the available one in the scratchpad if cfg["axis_for_cdata"].val == 0: assert ( cfg["tile_xo"].size[1] * cfg["tile_xo"].size[2] * data.shape[1] @@ -272,7 +273,8 @@ def schedule_gemm( <= ENV.ACC_ROWS * ENV.DIM ), "Result matrix will not fit in accumulator!" - # Move the data and weight move instructions into the correct loops selected by the axis_for_cdata and axis_for_cweight knobs + # Move the data and weight move instructions into the correct loops selected + # by the axis_for_cdata and axis_for_cweight knobs axis_for_cdata = axis_to_input_data[cfg["axis_for_cdata"].val] axis_for_cweight = axis_to_input_weights[cfg["axis_for_cweight"].val] sch[cdata].compute_at(sch[stages_to_input_data[cfg["axis_for_cdata"].val]], axis_for_cdata) @@ -280,7 +282,8 @@ def schedule_gemm( sch[stages_to_input_data[cfg["axis_for_cweight"].val]], axis_for_cweight ) - # Split input moves because Gemmini's mvin only supports mvins with rows <= DIM and cols <= MAX_BLOCK_LEN + # Split input moves because Gemmini's mvin only supports mvins with + # rows <= DIM and cols <= MAX_BLOCK_LEN cdata_ax_0_1, cdata_ax_0_2 = sch[cdata].split(sch[cdata].op.axis[0], factor=ENV.DIM) cdata_ax_1_1, cdata_ax_1_2 = sch[cdata].split( sch[cdata].op.axis[1], factor=ENV.MAX_BLOCK_LEN * ENV.DIM @@ -353,7 +356,8 @@ def schedule_gemm( ), ) - # Generate configuration dictionary, in order to correctly generate the calls to the configuration instructions + # Generate configuration dictionary, in order to correctly generate + # the calls to the configuration instructions config_dict = {} config_dict["A_size"] = int(data.shape[1]) config_dict["B_size"] = int(weight.shape[1]) diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index b25893bc9bd0..d33749823268 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -89,7 +89,8 @@ def depthwise_conv2d_cisc( o_w = topi.utils.get_const_int(tvm.tir.div((i_w + (left_pad + right_pad) - k_w), wstr) + 1) if len(set(padding)) == 1 and ENV.supports_non_zero_padding: - # If the padding is the same for all borders, there is no need to use topi.nn.pad, because Gemminis CISC instructions support equal padding + # If the padding is the same for all borders, there is no need to use topi.nn.pad, + # because Gemminis CISC instructions support equal padding data = orig_data else: # If not, then pad before calling Gemminis functions diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index bd71705be711..2e7880bcbdfe 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -41,7 +41,10 @@ def max_pool2d( pool_dilation: 
tvm.ir.container.Array, pool_padding: tvm.ir.container.Array, ) -> tvm.te.tensor.Tensor: - """Computation definition to run a max pooling layer on Gemmini. Uses a trick: we call a dw convolution + max pooling, but all weights are 1. So the depthwise convolution does nothing, and the Gemmini accelerator takes care internally of applying the max pooling. + """Computation definition to run a max pooling layer on Gemmini. + Uses a trick: we call a dw convolution + max pooling, but all weights are 1. + So the depthwise convolution does nothing, and the Gemmini accelerator takes care + internally of applying the max pooling. Args: cfg (tvm.autotvm.task.space.FallbackConfigEntity): AutoTVM configuration entity @@ -115,7 +118,10 @@ def irb_builder_func(ins, outs): return irb.get() res = te.extern( - (1,), [data, weights], lambda ins, outs: irb_builder_func(ins, outs), dtype="int8" # pylint: disable=W0108 + (1,), + [data, weights], + lambda ins, outs: irb_builder_func(ins, outs), # pylint: disable=W0108 + dtype="int8", ) # TODO (FP): add correct FLOPS From e340fabdedc0145f844b51892307db477fff5318 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 17:14:22 +0100 Subject: [PATCH 266/286] Pending pylint fixes --- python/tvm/contrib/gemmini/environment.py | 6 +++--- python/tvm/contrib/gemmini/helpers.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 1fa94acd9efe..565c1db13f11 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -32,7 +32,7 @@ add_tensorize, add_mvout_tensorize, ) -from .utils import counters +from .utils import COUNTERS class Environment(object): @@ -221,11 +221,11 @@ def init( self.supports_non_zero_padding = supports_non_zero_padding self.use_experimental_qnn_add = use_experimental_qnn_add - self.enabled_counters = enabled_counters if enabled_counters is not None else counters + self.enabled_counters = enabled_counters if enabled_counters is not None else COUNTERS # Check that all enabled counters exist in the actual counters from Gemmini for key, value in self.enabled_counters.items(): assert ( - value == counters[key] + value == COUNTERS[key] ), f"Enabled counter with key {key} does not exist \ or has a different name in the actual counters dict!" 
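The enabled-counters check introduced above is compact enough to isolate. Below is a minimal runnable sketch of the same logic; the three counter entries are the first ones from the table in python/tvm/contrib/gemmini/utils.py, validate_enabled_counters is a hypothetical standalone name for what Environment.init does inline, and dict.get is used here so an unknown key fails the assertion rather than raising KeyError:

    # Sketch of the counter validation performed in Environment.init.
    # COUNTERS here is only a subset of the full table in utils.py.
    COUNTERS = {
        1: "MAIN_LD_CYCLES",
        2: "MAIN_ST_CYCLES",
        3: "MAIN_EX_CYCLES",
    }

    def validate_enabled_counters(enabled_counters=None):
        # Fall back to enabling every known counter, as the real init does.
        enabled = enabled_counters if enabled_counters is not None else COUNTERS
        for key, value in enabled.items():
            assert value == COUNTERS.get(key), (
                f"Enabled counter with key {key} does not exist "
                "or has a different name in the actual counters dict!"
            )
        return enabled

    validate_enabled_counters({1: "MAIN_LD_CYCLES"})  # passes: known key, matching name
    # validate_enabled_counters({9: "BOGUS"})  # would raise AssertionError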
diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 5ebf4c719a06..69dca3a6b0de 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -22,8 +22,8 @@ import pathlib from typing import List -import numpy as np from six.moves import range +import numpy as np from .environment import Environment From 99756b219e94a419608b73547309eb8c8e72331d Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Wed, 8 Feb 2023 17:14:34 +0100 Subject: [PATCH 267/286] Pending pylint fixes --- python/tvm/contrib/gemmini/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py index 1f9d6b26134f..e43d2a7a23f9 100644 --- a/python/tvm/contrib/gemmini/utils.py +++ b/python/tvm/contrib/gemmini/utils.py @@ -22,7 +22,7 @@ from enum import Enum -counters = { +COUNTERS = { 1: "MAIN_LD_CYCLES", 2: "MAIN_ST_CYCLES", 3: "MAIN_EX_CYCLES", From d7e6a93697dc4dacc577339ef58503a1b296cb88 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 08:35:54 +0100 Subject: [PATCH 268/286] Docs fix --- gallery/tutorial/micro_gemmini_dwconv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py index 6030d14ea024..3fbd41a1c21b 100644 --- a/gallery/tutorial/micro_gemmini_dwconv2d.py +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -16,7 +16,7 @@ # under the License. """ Running TVM on the Gemmini accelerator - A single 2d depthwise convolutional layer example -====================================================================================== +=========================================================================================== **Author**: `Federico Peccia `_ From 1eaaee0b0637896888d76f918b8705198ec87863 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 13:10:29 +0100 Subject: [PATCH 269/286] Added missing license text --- apps/microtvm/gemmini/README.md | 17 +++++++++++++++++ .../gemmini/template_project/src/Makefile | 17 +++++++++++++++++ .../gemmini/template_project/src/Makefrag.mk | 17 +++++++++++++++++ cmake/modules/contrib/Gemmini.cmake | 17 +++++++++++++++++ 4 files changed, 68 insertions(+) diff --git a/apps/microtvm/gemmini/README.md b/apps/microtvm/gemmini/README.md index 9b4c45716062..2691844797f5 100644 --- a/apps/microtvm/gemmini/README.md +++ b/apps/microtvm/gemmini/README.md @@ -1,3 +1,20 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + This directory contains code to create code for the Gemmini accelerator using microTVM. These tests are then executed on the Spike RISC-V ISA simulator. In order to use this correctly, the Spike simulator has to be installed. This can be done by following the steps found on the [Chipyard](https://chipyard.readthedocs.io/en/stable/) repository. The instructions to also install the patch of the Spike simulator that adds the Gemmini functional simulator can be found in the [Gemmini](https://github.com/ucb-bar/gemmini) repository. diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index b8da778d7eec..c1badcf1816c 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + include $(abs_top_srcdir)/Makefrag tests_baremetal = $(tests:=-baremetal) diff --git a/apps/microtvm/gemmini/template_project/src/Makefrag.mk b/apps/microtvm/gemmini/template_project/src/Makefrag.mk index a60184526081..cb4e5ee72da9 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefrag.mk +++ b/apps/microtvm/gemmini/template_project/src/Makefrag.mk @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + XLEN ?= 64 CC_BAREMETAL := riscv$(XLEN)-unknown-elf-gcc diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake index 757a99217510..2e5a76bcc06c 100644 --- a/cmake/modules/contrib/Gemmini.cmake +++ b/cmake/modules/contrib/Gemmini.cmake @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ if(USE_GEMMINI) message(STATUS "Add Gemmini for microTVM") From 60cbfd1cafcc952b7e86b96bf8aaf3dc0b30cc13 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Thu, 9 Feb 2023 13:54:12 +0100 Subject: [PATCH 270/286] Small lint fixes --- src/relay/op/contrib/gemmini/convolution.cc | 1 - src/relay/op/contrib/gemmini/depthwise_convolution.cc | 1 - src/relay/op/contrib/gemmini/gemm.cc | 1 - src/relay/op/contrib/gemmini/max_pool2d.cc | 1 - src/tir/ir/stmt.cc | 2 +- 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/relay/op/contrib/gemmini/convolution.cc b/src/relay/op/contrib/gemmini/convolution.cc index 1ac0a3ad0df5..78c7f249c51c 100644 --- a/src/relay/op/contrib/gemmini/convolution.cc +++ b/src/relay/op/contrib/gemmini/convolution.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/depthwise_convolution.cc b/src/relay/op/contrib/gemmini/depthwise_convolution.cc index d9cb264fb514..c956c5e1b815 100644 --- a/src/relay/op/contrib/gemmini/depthwise_convolution.cc +++ b/src/relay/op/contrib/gemmini/depthwise_convolution.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/gemm.cc b/src/relay/op/contrib/gemmini/gemm.cc index 6002e72aaa41..eacbabafdc77 100644 --- a/src/relay/op/contrib/gemmini/gemm.cc +++ b/src/relay/op/contrib/gemmini/gemm.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/relay/op/contrib/gemmini/max_pool2d.cc b/src/relay/op/contrib/gemmini/max_pool2d.cc index 2e435ceea875..082a4492547b 100644 --- a/src/relay/op/contrib/gemmini/max_pool2d.cc +++ b/src/relay/op/contrib/gemmini/max_pool2d.cc @@ -26,7 +26,6 @@ #include "../../../qnn/utils.h" #include "../../op_common.h" -//#include "common.h" namespace tvm { namespace relay { diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index ff28121db27d..250465257301 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -38,7 +38,7 @@ LetStmt::LetStmt(Var var, PrimExpr value, Stmt body, Span span) { // It is still valid to bind a pointer type // var to a value that is of type handle. if (var->type_annotation.as()) { - // TODO (FP): Is this check really necessary? + // TODO(FP): Is this check really necessary? 
// auto vdtype = value.dtype(); // ICHECK(vdtype.is_handle()); } else { From d563c00a3aa7a59cebefe183d21401f7f9776736 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Mon, 27 Mar 2023 13:44:46 +0200 Subject: [PATCH 271/286] Recommended changes for merge --- .../template_project/crt_config/crt_config.h | 57 --------- .../template_project/microtvm_api_server.py | 50 +++++--- .../gemmini/template_project/src/Makefile | 2 + .../src/{ => add_example}/add.c | 11 +- .../src/{ => conv2d_example}/conv2d.c | 10 +- .../src/{ => dense_example}/dense.c | 10 +- .../src/{ => dwconv2d_example}/dwconv2d.c | 10 +- .../src/{ => maxpool2d_example}/maxpool2d.c | 10 +- .../src/{ => mobilenet_example}/mobilenet.c | 4 +- cmake/modules/contrib/Gemmini.cmake | 14 +-- cmake/utils/CRTConfig.cmake | 2 + gallery/tutorial/micro_gemmini_add.py | 64 +++++----- gallery/tutorial/micro_gemmini_conv2d.py | 56 ++++----- gallery/tutorial/micro_gemmini_dense.py | 55 +++++---- gallery/tutorial/micro_gemmini_dwconv2d.py | 50 ++++---- gallery/tutorial/micro_gemmini_maxpool2d.py | 55 +++++---- gallery/tutorial/micro_gemmini_mobilenet.py | 54 ++++----- python/tvm/contrib/gemmini/__init__.py | 8 -- python/tvm/contrib/gemmini/build_module.py | 5 - python/tvm/contrib/gemmini/environment.py | 5 - python/tvm/contrib/gemmini/helpers.py | 111 ------------------ python/tvm/contrib/gemmini/intrin.py | 5 - python/tvm/contrib/gemmini/legalize.py | 5 - python/tvm/contrib/gemmini/pattern_table.py | 7 +- python/tvm/contrib/gemmini/transform.py | 5 - python/tvm/contrib/gemmini/utils.py | 5 - python/tvm/micro/testing/utils.py | 2 +- .../relay/backend/contrib/gemmini/__init__.py | 15 --- .../backend/contrib/gemmini/gemmini_add.py | 5 - .../contrib/gemmini/gemmini_conv2d_cisc.py | 5 - .../backend/contrib/gemmini/gemmini_dense.py | 5 - .../contrib/gemmini/gemmini_dense_cisc.py | 5 - .../gemmini/gemmini_depthwise_conv2d_cisc.py | 5 - .../contrib/gemmini/gemmini_max_pool2d.py | 5 - .../tvm/relay/backend/contrib/gemmini/op.py | 5 - 35 files changed, 232 insertions(+), 490 deletions(-) delete mode 100644 apps/microtvm/gemmini/template_project/crt_config/crt_config.h rename apps/microtvm/gemmini/template_project/src/{ => add_example}/add.c (90%) rename apps/microtvm/gemmini/template_project/src/{ => conv2d_example}/conv2d.c (91%) rename apps/microtvm/gemmini/template_project/src/{ => dense_example}/dense.c (90%) rename apps/microtvm/gemmini/template_project/src/{ => dwconv2d_example}/dwconv2d.c (91%) rename apps/microtvm/gemmini/template_project/src/{ => maxpool2d_example}/maxpool2d.c (90%) rename apps/microtvm/gemmini/template_project/src/{ => mobilenet_example}/mobilenet.c (98%) diff --git a/apps/microtvm/gemmini/template_project/crt_config/crt_config.h b/apps/microtvm/gemmini/template_project/crt_config/crt_config.h deleted file mode 100644 index b3126cfac920..000000000000 --- a/apps/microtvm/gemmini/template_project/crt_config/crt_config.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \brief CRT configuration for the host-linked CRT. - */ -#ifndef TVM_RUNTIME_MICRO_CRT_CONFIG_H_ -#define TVM_RUNTIME_MICRO_CRT_CONFIG_H_ - -/*! Log level of the CRT runtime */ -#define TVM_CRT_LOG_LEVEL TVM_CRT_LOG_LEVEL_DEBUG - -/*! Support low-level debugging in MISRA-C runtime */ -#define TVM_CRT_DEBUG 0 - -/*! Maximum supported dimension in NDArray */ -#define TVM_CRT_MAX_NDIM 6 -/*! Maximum supported arguments in generated functions */ -#define TVM_CRT_MAX_ARGS 10 -/*! Maximum supported string length in dltype, e.g. "int8", "int16", "float32" */ -#define TVM_CRT_MAX_STRLEN_DLTYPE 10 -/*! Maximum supported string length in function names */ -#define TVM_CRT_MAX_STRLEN_FUNCTION_NAME 120 -/*! Maximum supported string length in parameter names */ -#define TVM_CRT_MAX_STRLEN_PARAM_NAME 80 - -/*! Maximum number of registered modules. */ -#define TVM_CRT_MAX_REGISTERED_MODULES 2 - -/*! Size of the global function registry, in bytes. */ -#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512 - -/*! Maximum packet size, in bytes, including the length header. */ -#define TVM_CRT_MAX_PACKET_SIZE_BYTES 8 * 1024 - -/*! \brief Maximum length of a PackedFunc function name. */ -#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30 - -// #define TVM_CRT_FRAMER_ENABLE_LOGS - -#endif // TVM_RUNTIME_MICRO_CRT_CONFIG_H_ diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index 1f55eedf1e3d..036850afe8f2 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-""" -MicroTVM API Server for Gemmini baremetal tests on the Spike simulator -===================== -**Author**: `Federico Peccia `_ -""" import atexit import collections @@ -56,15 +51,28 @@ IS_TEMPLATE = not (API_SERVER_DIR / MODEL_LIBRARY_FORMAT_RELPATH).exists() -PROJECT_TYPES = [ - "dense_example", - "conv2d_example", - "dwconv2d_example", - "add_example", - "maxpool2d_example", - "mobilenet_example", -] +# PROJECT_TYPES = [ +# "dense_example", +# "conv2d_example", +# "dwconv2d_example", +# "add_example", +# "maxpool2d_example", +# "mobilenet_example", +# ] + +PROJECT_TYPES = [] +if IS_TEMPLATE: + for d in (API_SERVER_DIR / "src").iterdir(): + if d.is_dir(): + PROJECT_TYPES.append(d.name) + +PROJECT_OPTIONS = server.default_project_options( + project_type={"choices": tuple(PROJECT_TYPES)}, + board={"choices": "", "optional": ["flash", "open_transport"]}, + warning_as_error={"optional": ["build", "flash"]}, +) +""" PROJECT_OPTIONS = [ server.ProjectOption( "project_type", @@ -74,6 +82,7 @@ help="Type of project to generate.", ) ] +""" class Handler(server.ProjectAPIHandler): @@ -232,6 +241,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec project_dir.mkdir() source_dir = project_dir / "src" source_dir.mkdir() + extra_files_tar = options.get("extra_files_tar") # Copies files from the template folder to project_dir shutil.copy2(API_SERVER_DIR / "microtvm_api_server.py", project_dir) @@ -251,7 +261,19 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec metadata = self._disassemble_mlf(model_library_format_path, source_dir) shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) - self._copy_debug_data_files(project_dir) + # self._copy_debug_data_files(project_dir) + if extra_files_tar: + with tarfile.open(extra_files_tar, mode="r:*") as tf: + tf.extractall(project_dir) + for filename in project_dir.rglob(f"include/tvm/*.h"): + with filename.open("rb") as src_file: + lines = src_file.readlines() + new_lines = [] + for line in lines: + if "dlpack" not in str(line): + new_lines.append(line) + with filename.open("wb") as dst_file: + dst_file.writelines(new_lines) # Recursively change includes self._convert_includes(project_dir, source_dir) diff --git a/apps/microtvm/gemmini/template_project/src/Makefile b/apps/microtvm/gemmini/template_project/src/Makefile index c1badcf1816c..df459ba96121 100644 --- a/apps/microtvm/gemmini/template_project/src/Makefile +++ b/apps/microtvm/gemmini/template_project/src/Makefile @@ -31,6 +31,7 @@ RISCV_TESTS = ${TVM_HOME}/3rdparty/gemmini/software/gemmini-rocc-tests/riscv-tes BENCH_COMMON = ${RISCV_TESTS}/benchmarks/common GEMMINI_HEADERS = $(abs_top_srcdir)/include/gemmini.h $(abs_top_srcdir)/include/gemmini_params.h $(abs_top_srcdir)/include/gemmini_testutils.h STANDALONE_CRT = $(abs_top_srcdir)/standalone_crt +DEBUG_DATA_HEADERS = $(abs_top_srcdir)/../include/tvm CFLAGS := $(CFLAGS) \ -DPREALLOCATE=1 \ @@ -48,6 +49,7 @@ CFLAGS := $(CFLAGS) \ -I$(abs_top_srcdir) \ -I$(abs_top_srcdir)/include \ -I$(BENCH_COMMON) \ + -I$(DEBUG_DATA_HEADERS) \ -DID_STRING=$(ID_STRING) \ -DPRINT_TILE=0 \ diff --git a/apps/microtvm/gemmini/template_project/src/add.c b/apps/microtvm/gemmini/template_project/src/add_example/add.c similarity index 90% rename from apps/microtvm/gemmini/template_project/src/add.c rename to apps/microtvm/gemmini/template_project/src/add_example/add.c index 13aeb1a80e3f..960c6e0d4ab7 100644 --- a/apps/microtvm/gemmini/template_project/src/add.c +++ 
b/apps/microtvm/gemmini/template_project/src/add_example/add.c @@ -25,11 +25,12 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input_1.h" +#include "input_2.h" +#include "output.h" #include "model/tvmgen_default.h" -int8_t output_add[output_len]; +int8_t output_add[OUTPUT_LEN]; int main() { printf("Starting add test...\r\n"); @@ -50,7 +51,7 @@ int main() { tvmgen_default_run(&inputs, &outputs); // Look for errors! - for (int i = 0; i < output_len; i++) { + for (int i = 0; i < OUTPUT_LEN; i++) { if (output_add[i] != output[i]) { error_counter += 1; printf("ERROR IN ADD EXAMPLE! output_add[%d] (%d) != output[%d] (%d)\r\n", i, output_add[i], @@ -60,7 +61,7 @@ int main() { } // We allow for a very small percentage of errors, this could be related to rounding errors - float error_perc = ((float)(error_counter / output_len) * 100); + float error_perc = ((float)(error_counter / OUTPUT_LEN) * 100); if (error_perc < 1) printf("SUCCESS! (error_counter = %d)\r\n", error_counter); else diff --git a/apps/microtvm/gemmini/template_project/src/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c similarity index 91% rename from apps/microtvm/gemmini/template_project/src/conv2d.c rename to apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c index 22f1bcb1d281..5ec633081b5e 100644 --- a/apps/microtvm/gemmini/template_project/src/conv2d.c +++ b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c @@ -25,11 +25,11 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input.h" +#include "output.h" #include "model/tvmgen_default.h" -int8_t output_conv[output_len]; +int8_t output_conv[OUTPUT_LEN]; int main() { printf("Starting conv2d test...\r\n"); @@ -49,7 +49,7 @@ int main() { tvmgen_default_run(&inputs, &outputs); // Look for errors! - for (int i = 0; i < output_len; i++) { + for (int i = 0; i < OUTPUT_LEN; i++) { if (output_conv[i] != output[i]) { error_counter += 1; printf("ERROR IN CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, @@ -59,7 +59,7 @@ int main() { } // We allow for a very small percentage of errors, this could be related to rounding errors - if (((float)(error_counter / output_len) * 100) < 1) + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) printf("SUCCESS!\r\n"); else printf("FAIL!\r\n"); diff --git a/apps/microtvm/gemmini/template_project/src/dense.c b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c similarity index 90% rename from apps/microtvm/gemmini/template_project/src/dense.c rename to apps/microtvm/gemmini/template_project/src/dense_example/dense.c index 414eeac88020..cb6cd27c7e51 100644 --- a/apps/microtvm/gemmini/template_project/src/dense.c +++ b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c @@ -25,11 +25,11 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input.h" +#include "output.h" #include "model/tvmgen_default.h" -int8_t output_gemm[output_len]; +int8_t output_gemm[OUTPUT_LEN]; int main() { printf("Starting dense test...\r\n"); @@ -49,7 +49,7 @@ int main() { tvmgen_default_run(&inputs, &outputs); // Look for errors! - for (int i = 0; i < output_len; i++) { + for (int i = 0; i < OUTPUT_LEN; i++) { if (output_gemm[i] != output[i]) { error_counter += 1; printf("ERROR IN DENSE EXAMPLE! 
output_gemm[%d] (%d) != output[%d] (%d)\r\n", i, @@ -59,7 +59,7 @@ int main() { } // We allow for a very small percentage of errors, this could be related to rounding errors - if (((float)(error_counter / output_len) * 100) < 1) + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) printf("SUCCESS!\r\n"); else printf("FAIL!\r\n"); diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c similarity index 91% rename from apps/microtvm/gemmini/template_project/src/dwconv2d.c rename to apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c index ee125e2fdc25..772c343f5c1f 100644 --- a/apps/microtvm/gemmini/template_project/src/dwconv2d.c +++ b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c @@ -25,11 +25,11 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input.h" +#include "output.h" #include "model/tvmgen_default.h" -int8_t output_conv[output_len]; +int8_t output_conv[OUTPUT_LEN]; int main() { printf("Starting dw conv2d test...\r\n"); @@ -49,7 +49,7 @@ int main() { tvmgen_default_run(&inputs, &outputs); // Look for errors! - for (int i = 0; i < output_len; i++) { + for (int i = 0; i < OUTPUT_LEN; i++) { if (output_conv[i] != output[i]) { error_counter += 1; printf("ERROR IN DW CONV2D EXAMPLE! output_conv[%d] (%d) != output[%d] (%d)\r\n", i, @@ -59,7 +59,7 @@ int main() { } // We allow for a very small percentage of errors, this could be related to rounding errors - if (((float)(error_counter / output_len) * 100) < 1) + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) printf("SUCCESS!\r\n"); else printf("FAIL!\r\n"); diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c similarity index 90% rename from apps/microtvm/gemmini/template_project/src/maxpool2d.c rename to apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c index 8f508333c492..0843e67c628f 100644 --- a/apps/microtvm/gemmini/template_project/src/maxpool2d.c +++ b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c @@ -25,11 +25,11 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input.h" +#include "output.h" #include "model/tvmgen_default.h" -int8_t output_maxpool2d[output_len]; +int8_t output_maxpool2d[OUTPUT_LEN]; int main() { printf("Starting max pooling 2D test...\r\n"); @@ -49,7 +49,7 @@ int main() { tvmgen_default_run(&inputs, &outputs); // Look for errors! - for (int i = 0; i < output_len; i++) { + for (int i = 0; i < OUTPUT_LEN; i++) { if (output_maxpool2d[i] != output[i]) { error_counter += 1; printf("ERROR IN MAX POOL 2D EXAMPLE! 
output_maxpool2d[%d] (%d) != output[%d] (%d)\r\n", i, @@ -59,7 +59,7 @@ int main() { } // We allow for a very small percentage of errors, this could be related to rounding errors - if (((float)(error_counter / output_len) * 100) < 1) + if (((float)(error_counter / OUTPUT_LEN) * 100) < 1) printf("SUCCESS!\r\n"); else printf("FAIL!\r\n"); diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c similarity index 98% rename from apps/microtvm/gemmini/template_project/src/mobilenet.c rename to apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c index 45b606004653..a42fe6d32d05 100644 --- a/apps/microtvm/gemmini/template_project/src/mobilenet.c +++ b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c @@ -25,8 +25,8 @@ #ifndef BAREMETAL #include "sys/mman.h" #endif -#include "model/inputs.h" -#include "model/outputs.h" +#include "input.h" +#include "output.h" #include "model/tvmgen_default.h" uint8_t output_pred[1001]; diff --git a/cmake/modules/contrib/Gemmini.cmake b/cmake/modules/contrib/Gemmini.cmake index 2e5a76bcc06c..aaac04ec7ba1 100644 --- a/cmake/modules/contrib/Gemmini.cmake +++ b/cmake/modules/contrib/Gemmini.cmake @@ -23,10 +23,9 @@ if(USE_GEMMINI) APPEND GEMMINI_FILE_COPY_JOBS "apps/microtvm/gemmini/template_project microtvm_api_server.py -> gemmini" - "apps/microtvm/gemmini/template_project/crt_config *.h -> gemmini/crt_config" # Dense example project generation - "apps/microtvm/gemmini/template_project/src dense.c -> gemmini/src/dense_example" + "apps/microtvm/gemmini/template_project/src/dense_example dense.c -> gemmini/src/dense_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dense_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dense_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dense_example" @@ -36,7 +35,7 @@ if(USE_GEMMINI) "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dense_example/rocc-software/src" # CONV2D example project generation - "apps/microtvm/gemmini/template_project/src conv2d.c -> gemmini/src/conv2d_example" + "apps/microtvm/gemmini/template_project/src/conv2d_example conv2d.c -> gemmini/src/conv2d_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/conv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/conv2d_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/conv2d_example" @@ -46,7 +45,7 @@ if(USE_GEMMINI) "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/conv2d_example/rocc-software/src" # DW CONV2D example project generation - "apps/microtvm/gemmini/template_project/src dwconv2d.c -> gemmini/src/dwconv2d_example" + "apps/microtvm/gemmini/template_project/src/dwconv2d_example dwconv2d.c -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/dwconv2d_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/dwconv2d_example" @@ -56,7 +55,7 @@ if(USE_GEMMINI) "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/dwconv2d_example/rocc-software/src" # ADD example project generation - "apps/microtvm/gemmini/template_project/src add.c -> gemmini/src/add_example" + 
"apps/microtvm/gemmini/template_project/src/add_example add.c -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/add_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/add_example" @@ -66,7 +65,7 @@ if(USE_GEMMINI) "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/add_example/rocc-software/src" # Max pooling 2d example project generation - "apps/microtvm/gemmini/template_project/src maxpool2d.c -> gemmini/src/maxpool2d_example" + "apps/microtvm/gemmini/template_project/src/maxpool2d_example maxpool2d.c -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/maxpool2d_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/maxpool2d_example" @@ -76,7 +75,7 @@ if(USE_GEMMINI) "3rdparty/gemmini/software/gemmini-rocc-tests/rocc-software/src *.h -> gemmini/src/maxpool2d_example/rocc-software/src" # Mobilenet example project generation - "apps/microtvm/gemmini/template_project/src mobilenet.c -> gemmini/src/mobilenet_example" + "apps/microtvm/gemmini/template_project/src/mobilenet_example mobilenet.c -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefile.in -> gemmini/src/mobilenet_example" "apps/microtvm/gemmini/template_project/src Makefrag.mk -> gemmini/src/mobilenet_example" @@ -130,5 +129,6 @@ if(USE_GEMMINI) endfunction() microtvm_add_gemmini() + generate_crt_config(gemmini "${CMAKE_CURRENT_BINARY_DIR}/microtvm_template_projects/gemmini/crt_config/crt_config.h") endif(USE_MICRO) diff --git a/cmake/utils/CRTConfig.cmake b/cmake/utils/CRTConfig.cmake index 42c523b08786..1d767cb72c13 100644 --- a/cmake/utils/CRTConfig.cmake +++ b/cmake/utils/CRTConfig.cmake @@ -30,6 +30,8 @@ function(generate_crt_config platform output_path) set(TVM_CRT_MAX_PACKET_SIZE_BYTES 512) elseif("${platform}" STREQUAL "arduino") set(TVM_CRT_MAX_PACKET_SIZE_BYTES 8*1024) + elseif("${platform}" STREQUAL "gemmini") + set(TVM_CRT_MAX_PACKET_SIZE_BYTES 8*1024) endif() configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/crt/crt_config.h.template" "${output_path}") endfunction() diff --git a/gallery/tutorial/micro_gemmini_add.py b/gallery/tutorial/micro_gemmini_add.py index c90344aa75f0..2b1e85dbdeb8 100644 --- a/gallery/tutorial/micro_gemmini_add.py +++ b/gallery/tutorial/micro_gemmini_add.py @@ -27,11 +27,15 @@ import tensorflow as tf from tensorflow.keras import layers +import tarfile +import tempfile +import pathlib import numpy as np import os import tvm.contrib.gemmini as gemmini from tvm import relay import tvm +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -115,20 +119,18 @@ def representative_data_gen(): tflite_model = converter.convert() # Save the model. -with open("add.tflite", "wb") as f: +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "add.tflite" +with open(tflite_file, "wb") as f: f.write(tflite_model) # Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. 
-os.system("rm -rf model.tar dev/ include/ generated-project/") - -tflite_file = "./add.tflite" +os.system("rm -rf generated-project/") tflite_model_buf = open(tflite_file, "rb").read() input_tensor = "layer1_input" input_dtype = "uint8" -os.system("mkdir -p include") - try: import tflite @@ -139,7 +141,9 @@ def representative_data_gen(): tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) # Load the TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() @@ -158,11 +162,6 @@ def representative_data_gen(): interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. -gemmini.create_header_file("inputs", "data", "input_1", input_matrix_2, "./include") -gemmini.create_header_file("inputs", "data", "input_2", input_matrix_1, "./include") -gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -208,32 +207,27 @@ def representative_data_gen(): # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. -# The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project. - -import pathlib - -os.system("mkdir dev") -model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar") -tvm.micro.export_model_library_format(module, model_library_format_tar_path) - -import tarfile - -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: - print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) - -# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. - -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = {"project_type": "add_example"} - -generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") -generated_project = tvm.micro.generate_project( - template_project_path, module, generated_project_dir, project_options -) +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. 
+ create_header_file("input_1", input_matrix_1, "include/tvm", tar_file) + create_header_file("input_2", input_matrix_2, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "add_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) # We build the project. This will generate an executable we can run on the Spike simulator. generated_project.build() # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. -# generated_project.flash() +generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_conv2d.py b/gallery/tutorial/micro_gemmini_conv2d.py index 14ac6933be98..cc96a4a6a1bf 100644 --- a/gallery/tutorial/micro_gemmini_conv2d.py +++ b/gallery/tutorial/micro_gemmini_conv2d.py @@ -26,12 +26,16 @@ import tensorflow as tf from tensorflow import keras +import tarfile +import tempfile +import pathlib from tensorflow.keras import layers import numpy as np import os import tvm.contrib.gemmini as gemmini from tvm import relay import tvm +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -116,19 +120,20 @@ def representative_data_gen(): tflite_model = converter.convert() # Save the model. -with open("conv.tflite", "wb") as f: +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "conv.tflite" +with open(tflite_file, "wb") as f: f.write(tflite_model) # Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. -os.system("rm -rf model.tar dev/ include/ generated-project/") +os.system("rm -rf generated-project/") -tflite_file = "./conv.tflite" tflite_model_buf = open(tflite_file, "rb").read() input_tensor = "layer1_input" input_dtype = "uint8" -os.system("mkdir -p include") +# os.system("mkdir -p include") try: import tflite @@ -140,7 +145,7 @@ def representative_data_gen(): tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) # Load the TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path="./conv.tflite") +interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() @@ -151,10 +156,6 @@ def representative_data_gen(): interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. 
-gemmini.create_header_file("inputs", "data", "input", input_matrix, "./include") -gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -191,27 +192,22 @@ def representative_data_gen(): # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. -# The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project. -import pathlib - -os.system("mkdir dev") -model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar") -tvm.micro.export_model_library_format(module, model_library_format_tar_path) - -import tarfile - -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: - print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) - -# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. - -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = {"project_type": "conv2d_example"} - -generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") -generated_project = tvm.micro.generate_project( - template_project_path, module, generated_project_dir, project_options -) +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input_matrix, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "conv2d_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) # We build the project. This will generate an executable we can run on the Spike simulator. generated_project.build() diff --git a/gallery/tutorial/micro_gemmini_dense.py b/gallery/tutorial/micro_gemmini_dense.py index 22419ad22276..c4fb5c82d01b 100644 --- a/gallery/tutorial/micro_gemmini_dense.py +++ b/gallery/tutorial/micro_gemmini_dense.py @@ -26,10 +26,14 @@ import tensorflow as tf import numpy as np +import tarfile +import tempfile +import pathlib import os import tvm.contrib.gemmini as gemmini from tvm import relay import tvm +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -103,13 +107,14 @@ def representative_data_gen(): tflite_model = converter.convert() # Save the model. 
-with open("matmul.tflite", "wb") as f: +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "matmul.tflite" +with open(tflite_file, "wb") as f: f.write(tflite_model) # Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. -os.system("rm -rf model.tar dev/ include/ generated-project/") +os.system("rm -rf generated-project/") -tflite_file = "./matmul.tflite" tflite_model_buf = open(tflite_file, "rb").read() input_tensor = "layer1_input" input_dtype = "uint8" @@ -126,7 +131,9 @@ def representative_data_gen(): tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) # Load the TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() @@ -138,10 +145,6 @@ def representative_data_gen(): interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. -gemmini.create_header_file("inputs", "data", "input", input1, "./include") -gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -182,26 +185,22 @@ def representative_data_gen(): # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. -# The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project. -import pathlib - -os.system("mkdir dev") -model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar") -tvm.micro.export_model_library_format(module, model_library_format_tar_path) - -import tarfile - -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: - print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) - -# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = {"project_type": "dense_example"} - -generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") -generated_project = tvm.micro.generate_project( - template_project_path, module, generated_project_dir, project_options -) +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. 
+ create_header_file("input", input1, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "dense_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) # We build the project. This will generate an executable we can run on the Spike simulator. generated_project.build() diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py index 3fbd41a1c21b..fb57aacf862f 100644 --- a/gallery/tutorial/micro_gemmini_dwconv2d.py +++ b/gallery/tutorial/micro_gemmini_dwconv2d.py @@ -27,6 +27,9 @@ from pyrsistent import v import tensorflow as tf from tensorflow import keras +import tarfile +import tempfile +import pathlib from tensorflow.keras import layers import numpy as np import os @@ -35,6 +38,7 @@ import tvm.contrib.gemmini as gemmini from tvm import relay import tvm +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -106,13 +110,14 @@ def representative_data_gen(): tflite_model = converter.convert() # Save the model. -with open("dwconv.tflite", "wb") as f: +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "dwconv.tflite" +with open(tflite_file, "wb") as f: f.write(tflite_model) # Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. -os.system("rm -rf model.tar dev/ include/ generated-project/") +os.system("rm -rf generated-project/") -tflite_file = "./dwconv.tflite" tflite_model_buf = open(tflite_file, "rb").read() input_tensor = "layer1_input" input_dtype = "uint8" @@ -129,7 +134,7 @@ def representative_data_gen(): tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) # Load the TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path="./dwconv.tflite") +interpreter = tf.lite.Interpreter(model_path=str(tflite_file)) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() @@ -141,10 +146,6 @@ def representative_data_gen(): interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. -gemmini.create_header_file("inputs", "data", "input", input, "./include") -gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -181,31 +182,26 @@ def representative_data_gen(): # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. -# The builded model is exported to the model library format. 
This will be used in the next steps to generate the baremetal project. -import pathlib - -os.system("mkdir dev") -model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar") -tvm.micro.export_model_library_format(module, model_library_format_tar_path) +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) -import tarfile - -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: - print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. + template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "dwconv2d_example", "extra_files_tar": tar_temp_file.name} -# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = {"project_type": "dwconv2d_example"} - -generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") -generated_project = tvm.micro.generate_project( - template_project_path, module, generated_project_dir, project_options -) + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) # We build the project. This will generate an executable we can run on the Spike simulator. generated_project.build() # Finally, we execute the compiled baremetal project on the Spike simulator. # Note: if there are errors, these can be related to rounding errors. - generated_project.flash() diff --git a/gallery/tutorial/micro_gemmini_maxpool2d.py b/gallery/tutorial/micro_gemmini_maxpool2d.py index 39f84f88fba5..f4587d2d510d 100644 --- a/gallery/tutorial/micro_gemmini_maxpool2d.py +++ b/gallery/tutorial/micro_gemmini_maxpool2d.py @@ -25,11 +25,15 @@ import tensorflow as tf from tensorflow.keras import layers +import tarfile +import tempfile +import pathlib import numpy as np import os import tvm.contrib.gemmini as gemmini from tvm import relay import tvm +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -103,13 +107,14 @@ def representative_data_gen(): tflite_model = converter.convert() # Save the model. -with open("maxpool.tflite", "wb") as f: +tmpdir = tvm.contrib.utils.tempdir() +tflite_file = tmpdir / "maxpool.tflite" +with open(tflite_file, "wb") as f: f.write(tflite_model) # Now that we have created the model, we import the model and run it. We store the output, in order to compare it with the output that will be later obtained from the Gemmini accelerator. 
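The golden reference used by each of these checks comes from running the quantized model on the host with the TFLite interpreter, exactly as the comment above describes. A sketch of that step as the updated tutorials perform it, assuming tflite_file was written into the tempdir as shown earlier and the model takes a single uint8 input (the dense and maxpool2d variants also pass experimental_preserve_all_tensors):

    import numpy as np
    import tensorflow as tf

    interpreter = tf.lite.Interpreter(
        model_path=str(tflite_file), experimental_preserve_all_tensors=True
    )
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # Random uint8 stimulus shaped like the model's input (illustrative).
    input_matrix_1 = np.random.randint(0, 256, size=input_details[0]["shape"], dtype=np.uint8)
    interpreter.set_tensor(input_details[0]["index"], input_matrix_1)
    interpreter.invoke()
    expected_output = interpreter.get_tensor(output_details[0]["index"])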
-os.system("rm -rf model.tar dev/ include/ generated-project/") +os.system("rm -rf generated-project/") -tflite_file = "./maxpool.tflite" tflite_model_buf = open(tflite_file, "rb").read() input_tensor = "layer1_input" input_dtype = "uint8" @@ -126,7 +131,9 @@ def representative_data_gen(): tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) # Load the TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path=tflite_file, experimental_preserve_all_tensors=True) +interpreter = tf.lite.Interpreter( + model_path=str(tflite_file), experimental_preserve_all_tensors=True +) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() @@ -141,10 +148,6 @@ def representative_data_gen(): interpreter.invoke() expected_output = interpreter.get_tensor(output_details[0]["index"]) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. -gemmini.create_header_file("inputs", "data", "input", input_matrix_1, "./include") -gemmini.create_header_file("outputs", "data", "output", expected_output, "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -181,26 +184,22 @@ def representative_data_gen(): # # In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator. -# The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project. -import pathlib - -os.system("mkdir dev") -model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar") -tvm.micro.export_model_library_format(module, model_library_format_tar_path) - -import tarfile - -with tarfile.open(model_library_format_tar_path, "r:*") as tar_f: - print("\n".join(f" - {m.name}" for m in tar_f.getmembers())) - -# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) -project_options = {"project_type": "maxpool2d_example"} - -generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") -generated_project = tvm.micro.generate_project( - template_project_path, module, generated_project_dir, project_options -) +tmpdir = tvm.contrib.utils.tempdir() +model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar") +with tempfile.NamedTemporaryFile() as tar_temp_file: + with tarfile.open(tar_temp_file.name, "w:gz") as tar_file: + # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. + create_header_file("input", input_matrix_1, "include/tvm", tar_file) + create_header_file("output", expected_output, "include/tvm", tar_file) + + # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects. 
+ template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini")) + project_options = {"project_type": "maxpool2d_example", "extra_files_tar": tar_temp_file.name} + + generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project") + generated_project = tvm.micro.generate_project( + template_project_path, module, generated_project_dir, project_options + ) # We build the project. This will generate an executable we can run on the Spike simulator. generated_project.build() diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py index ca3690fbdb33..28e80c4fb563 100644 --- a/gallery/tutorial/micro_gemmini_mobilenet.py +++ b/gallery/tutorial/micro_gemmini_mobilenet.py @@ -27,10 +27,13 @@ import tensorflow as tf import os import tvm.contrib.gemmini as gemmini +import tarfile +import tempfile +import pathlib from tvm import relay import tvm -from mobilenet_utils import generate_mobilenet_tflite_model, get_real_image, run_tflite_model from tvm.contrib.download import download_testdata +from tvm.micro.testing.utils import create_header_file ################################## # Pre-requisites @@ -170,8 +173,7 @@ def generate_mobilenet_tflite_model(): # In this section, we will generate the baseline input and expected output, which we are going to use to compare with the actual obtained output after running on the Gemmini accelerator. # We clean and prepare the workspace -os.system("rm -rf model.tar dev/ include/ generated-project/") -os.system("mkdir -p include") +os.system("rm -rf generated-project/") # We will generate a prequantized TFLite model, because for now the Gemmini integration only supports models that were quantized with specific flags as input. tflite_model_dir = generate_mobilenet_tflite_model() @@ -196,10 +198,6 @@ def generate_mobilenet_tflite_model(): print("Expected argmax = %i" % (tflite_pred[0],)) print("Expected max labels = %s" % (tflite_pred,)) -# Here, we create C files and headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one. -gemmini.create_header_file("inputs", "data", "input", input_image, "./include") -gemmini.create_header_file("outputs", "data", "output", tflite_pred.astype(np.uint32), "./include") - ################################## # Compiling the model with TVM # -------------------------------- @@ -215,11 +213,9 @@ def generate_mobilenet_tflite_model(): mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict) mod = relay.transform.InferType()(mod) -mod["main"] # In order to be able to build a model for the Gemmini accelerator, we need to replace all supported layers by the Gemmini specific operators. This is done using the gemmini.preprocess pass. Notice the changes in the "main" function after running the preprocess pass. mod = gemmini.preprocess_pass(mod) -mod["main"] # Now, we build the Relay Graph. Notice that we are using the CRT runtime, the target is C because we want to generate C code (but the device is Gemmini), and we use the AOT executor and the USMP feature in order to get a complete bare metal C code, without calls to memory allocator APIs. # The gemmini.build_config function returns a PassContext object containing the specific parameters needed to correctly build the model for the Gemmini accelerator. 
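The two comments above describe the whole Gemmini compile recipe, which a condensed sketch makes concrete. The executor and runtime flags below are illustrative assumptions derived from those comments (CRT runtime, C target, AOT executor with USMP); the tutorials' own relay.build calls are authoritative, and gemmini.build_config is used only as the PassContext it is documented to return:

    import tvm.contrib.gemmini as gemmini
    from tvm import relay
    from tvm.relay.backend import Executor, Runtime

    # `mod` and `params` come from relay.frontend.from_tflite (see above).
    mod = gemmini.preprocess_pass(mod)  # rewrite supported layers to Gemmini ops

    with gemmini.build_config():  # PassContext with Gemmini-specific options (args assumed none)
        module = relay.build(
            mod,
            target="c",
            runtime=Runtime("crt"),
            executor=Executor("aot"),  # exact executor options are an assumption
            params=params,
        )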
@@ -234,28 +230,24 @@ def generate_mobilenet_tflite_model():
 # Exporting and testing the model using microTVM
 # -----------------------------------------------
 #
-# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
-
-# The builded model is exported to the model library format. This will be used in the next steps to generate the baremetal project.
-import pathlib
-
-os.system("mkdir dev")
-model_library_format_tar_path = pathlib.Path(pathlib.Path.cwd(), "dev/model.tar")
-tvm.micro.export_model_library_format(module, model_library_format_tar_path)
-
-import tarfile
-
-with tarfile.open(model_library_format_tar_path, "r:*") as tar_f:
-    print("\n".join(f" - {m.name}" for m in tar_f.getmembers()))
-
-# Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
-template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
-project_options = {"project_type": "mobilenet_example"}
-
-generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
-generated_project = tvm.micro.generate_project(
-    template_project_path, module, generated_project_dir, project_options
-)
+# In this section, we will export the model using one of the provided example microTVM projects, we will compile it using the Chipyard tool, and then test the generated baremetal code on the Spike simulator.
+
+tmpdir = tvm.contrib.utils.tempdir()
+model_library_format_tar_path = tvm.micro.export_model_library_format(module, tmpdir / "model.tar")
+with tempfile.NamedTemporaryFile() as tar_temp_file:
+    with tarfile.open(tar_temp_file.name, "w:gz") as tar_file:
+        # Here, we create headers with the inputs and expected output, so that we can then execute the same operation on the Gemmini accelerator, and compare the expected output with the actual predicted one.
+        create_header_file("input", input_image, "include/tvm", tar_file)
+        create_header_file("output", tflite_pred.astype(np.int32), "include/tvm", tar_file)
+
+    # Here, we create the test project, using the example project provided for this tutorial in the Gemmini microTVM template projects.
+    template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("gemmini"))
+    project_options = {"project_type": "mobilenet_example", "extra_files_tar": tar_temp_file.name}
+
+    generated_project_dir = pathlib.Path(pathlib.Path.cwd(), "generated-project")
+    generated_project = tvm.micro.generate_project(
+        template_project_path, module, generated_project_dir, project_options
+    )

 # We build the project. This will generate an executable we can run on the Spike simulator.
 generated_project.build()
diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py
index 02d10645e2a3..7fe67311b265 100644
--- a/python/tvm/contrib/gemmini/__init__.py
+++ b/python/tvm/contrib/gemmini/__init__.py
@@ -14,18 +14,10 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-""" -Gemmini package is a TVM backend extension to support the Gemmini hardware accelerator -===================== -**Author**: `Federico Peccia `_ -""" import tvm._ffi.base from tvm.relay.backend.contrib.gemmini import * from .environment import Environment from .build_module import build_config, lower, build, preprocess_pass -from .helpers import create_header_file from .utils import * - -__version__ = "0.1.0" diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index bf2ff9832309..22c5c1620d7a 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Helpers and functions related to the build process to generate code for the Gemmini accelerator -===================== -**Author**: `Federico Peccia `_ -""" import tvm from tvm import relay diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 565c1db13f11..56863eebf71e 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, exec-used -""" -Environment declaration. Contains Gemminis hardware parameters. -===================== -**Author**: `Federico Peccia `_ -""" from __future__ import absolute_import as _abs import re diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index 69dca3a6b0de..fb28d471980a 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Miscellaneous helpers -===================== -**Author**: `Federico Peccia `_ -""" import pathlib from typing import List @@ -30,112 +25,6 @@ ENV = Environment.instance() -def create_header_file( - name: str, - section: str, - tensor_name: str, - tensor_data: np.ndarray, - output_path: str, - debug: bool = False, - weights: bool = None, -): - """This function generates a header file containing the data from the numpy array provided. - - Args: - name (str): Header file name - section (str): section to assign the generated variable - tensor_name (str): name for the generated variable - tensor_data (np.ndarray): data to fill the variable with - output_path (str): output path where the header file will be generated - debug (bool, optional): enable debug. Defaults to False. - weights (bool, optional): For debug purposes. Defaults to None. - """ - if debug: - assert ( - weights is not None - ), "When passing the debug flag as True, the weights parameter must be given!" - - file_path = pathlib.Path(f"{output_path}/" + name).resolve() - # Create header file with npy_data as a C array - raw_header_path = file_path.with_suffix(".h").resolve() - raw_source_path = file_path.with_suffix(".c").resolve() - - if tensor_data.dtype == np.float32: - datatype = "float" - align = 32 - elif tensor_data.dtype == np.int8: - datatype = "int8_t" - align = 16 - elif tensor_data.dtype == np.uint8: - datatype = "uint8_t" - align = 16 - elif tensor_data.dtype == np.uint32: - datatype = "uint32_t" - align = 16 - else: - assert False, f"Type {tensor_data.dtype} is not supported!" 
- - with open(raw_header_path, "a+", encoding="utf8") as header_file: - header_file.write( - f"#define {tensor_name}_len {tensor_data.size}\n" - + f"extern {datatype} {tensor_name}[{tensor_name}_len];\n" - ) - - if not raw_source_path.is_file(): - with open(raw_source_path, "a+", encoding="utf8") as source_file: - source_file.write("#include \n") - with open(raw_source_path, "a+", encoding="utf8") as source_file: - - source_file.write( - f'{datatype} {tensor_name}[] __attribute__((section("{section}"), \ - aligned({align}))) = {{' - if section - else f"{datatype} {tensor_name}[] __attribute__((aligned({align}))) = {{" - ) - data_hexstr = tensor_data.tobytes().hex() - flatten = tensor_data.flatten() - - if tensor_data.dtype in (np.float32, np.uint32): - for element in flatten: - source_file.write(f"{element},") - source_file.write("};\n\n") - else: - for i in range(0, len(data_hexstr), 2): - if flatten[int(i / 2)] < 0: - # Special treatment to generate negative numbers correctly! - data_hexstr_2comp = ( - (~int(flatten[int(i / 2)]) + 1).to_bytes(length=1, byteorder="big").hex() - ) - source_file.write(f"-0x{data_hexstr_2comp}") - else: - source_file.write(f"+0x{data_hexstr[i:i+2]}") - if i != (len(flatten) - 1) * 2: - source_file.write(",") - source_file.write("};\n\n") - - if debug: - source_file.write("/*\n") - for n in range(tensor_data.shape[0]): - for i_ch in range(tensor_data.shape[3]): - source_file.write(f"Channel {i_ch}:\n") - for row in range(tensor_data.shape[1]): - for col in range(tensor_data.shape[2]): - source_file.write(f"{tensor_data[n][row][col][i_ch]}\t") - source_file.write("\n") - source_file.write("*/\n") - - source_file.write("/*\n") - for o_ch in range(weights.shape[3]): - source_file.write(f"Output channel {o_ch}:\n") - for i_ch in range(weights.shape[2]): - source_file.write(f"Input channel {i_ch}:\n") - for row in range(weights.shape[0]): - for col in range(weights.shape[1]): - source_file.write(f"{weights[row][col][i_ch][o_ch]}\t") - source_file.write("\n") - source_file.write("*/\n") - - def get_divisors(x: int) -> List[int]: """Gets all the numbers that perfectly divide x diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 6aa20c2c8198..5b20023c2169 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Gemmini related intrinsics -===================== -**Author**: `Federico Peccia `_ -""" from __future__ import absolute_import as _abs diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index 4d74707e5acf..cbef09c14db6 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -A set of passes to legalize the Gemmini operators -===================== -**Author**: `Federico Peccia `_ -""" from typing import Tuple import tvm # type: ignore diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index ddb4b69acef9..96e3ba38523c 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -14,11 +14,6 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -""" -Pattern table declaring the supported Gemmini operators -===================== -**Author**: `Federico Peccia `_ -""" from typing import Callable, List, Tuple @@ -191,7 +186,7 @@ def __init__(self, func_body: tvm.relay.Function): and self.data.op.name == "nn.pad" ): padding = self.data.attrs.pad_width - self.padding = [padding[1][0], padding[1][1], padding[2][0], padding[2][1]] + self.padding = [padding[1][0], padding[2][0], padding[1][1], padding[2][1]] self.has_external_pad = True self.weights = conv2d_op.args[1] self.weights_shape = _infer_shape(self.weights) diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 41455bb8d283..456576ffb260 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name -""" -Transformation passes for Gemmini -===================== -**Author**: `Federico Peccia `_ -""" import ast from typing import Dict diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py index e43d2a7a23f9..07a50f74877d 100644 --- a/python/tvm/contrib/gemmini/utils.py +++ b/python/tvm/contrib/gemmini/utils.py @@ -14,11 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -""" -Useful enumerations and others -===================== -**Author**: `Federico Peccia `_ -""" from enum import Enum diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 755a85839d02..88aa7ac2d423 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -168,7 +168,7 @@ def create_header_file( header_file.write("#include \n") header_file.write("#include \n") header_file.write("#include \n") - header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + header_file.write(f"#define {tensor_name.upper()}_LEN {npy_data.size}\n") header_file.write(f"{_npy_dtype_to_ctype(npy_data)} {tensor_name}[] =") header_file.write("{") diff --git a/python/tvm/relay/backend/contrib/gemmini/__init__.py b/python/tvm/relay/backend/contrib/gemmini/__init__.py index a4d93cc8bf49..a1c4a510cd0c 100644 --- a/python/tvm/relay/backend/contrib/gemmini/__init__.py +++ b/python/tvm/relay/backend/contrib/gemmini/__init__.py @@ -14,20 +14,5 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -<<<<<<<< HEAD:python/tvm/relay/backend/contrib/gemmini/__init__.py -""" -Gemmini operators compute and schedule declarations -===================== -**Author**: `Federico Peccia `_ -""" from . 
import op -======== - -if(USE_AMX) - file(GLOB AMX_RUNTIME_CONFIG src/runtime/contrib/amx/amx_config.cc) - list(APPEND COMPILER_SRCS ${AMX_RUNTIME_CONFIG}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") - message(STATUS "Build with Intel AMX support...") -endif() ->>>>>>>> upstream/main:cmake/modules/contrib/AMX.cmake diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index d019fe4cbc3e..6c22441f2182 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument -""" -Add operator declaration and schedule registration for Gemmini -===================== -**Author**: `Federico Peccia `_ -""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index f82bea64a51d..30e3775491cf 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument -""" -Conv2d operator declaration and schedule registration for Gemmini's CISC instructions -===================== -**Author**: `Federico Peccia `_ -""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index dbb7d12f7da5..22191771ce43 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument -""" -Dense (GEMM) operator declaration and schedule registration for Gemmini's intrinsic instructions -===================== -**Author**: `Federico Peccia `_ -""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 8fdc12e5d8d2..7143e1999cb0 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument -""" -Dense (GEMM) operator declaration and schedule registration for Gemmini's CISC instructions -===================== -**Author**: `Federico Peccia `_ -""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index d33749823268..361b5e2daca7 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=unused-argument -""" -Depthwise conv2d operator declaration and schedule registration for Gemmini's CISC instructions -===================== -**Author**: `Federico Peccia `_ -""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 2e7880bcbdfe..1db42b392811 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument -""" -MaxPool2D operator declaration and schedule registration for Gemmini's CISC instructions -===================== -**Author**: `Federico Peccia `_ -""" import tvm from tvm import te diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py index a37ef10428bf..32869cbb2acf 100644 --- a/python/tvm/relay/backend/contrib/gemmini/op.py +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument, ungrouped-imports -""" -Namespace for the supported Relay operators on Gemmini -===================== -**Author**: `Federico Peccia `_ -""" from __future__ import absolute_import as _abs From 6316f83d5fc40c0ec29143f70d4f87e91e59d8eb Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Mon, 27 Mar 2023 13:55:53 +0200 Subject: [PATCH 272/286] Fixed merge issues --- cmake/modules/contrib/AMX.cmake | 7 ------ python/tvm/contrib/cutlass/_ffi_api.py | 10 --------- src/driver/driver_api.cc | 7 ------ src/tir/schedule/concrete_schedule.cc | 5 ----- .../contrib/test_ethosu/test_codegen.py | 3 --- .../python/unittest/test_arith_detect_cse.py | 22 ------------------- 6 files changed, 54 deletions(-) diff --git a/cmake/modules/contrib/AMX.cmake b/cmake/modules/contrib/AMX.cmake index 4c29e43d767e..ac349c4336a2 100644 --- a/cmake/modules/contrib/AMX.cmake +++ b/cmake/modules/contrib/AMX.cmake @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -<<<<<<<< HEAD:cmake/modules/contrib/AMX.cmake if(USE_AMX) file(GLOB AMX_RUNTIME_CONFIG src/runtime/contrib/amx/amx_config.cc) @@ -22,9 +21,3 @@ if(USE_AMX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") message(STATUS "Build with Intel AMX support...") endif() -======== -"""FFI API for CUTLASS BYOC.""" -import tvm._ffi - -tvm._ffi._init_api("contrib.cutlass", __name__) ->>>>>>>> upstream/main:python/tvm/contrib/cutlass/_ffi_api.py diff --git a/python/tvm/contrib/cutlass/_ffi_api.py b/python/tvm/contrib/cutlass/_ffi_api.py index 4c29e43d767e..e71eb8c13f19 100644 --- a/python/tvm/contrib/cutlass/_ffi_api.py +++ b/python/tvm/contrib/cutlass/_ffi_api.py @@ -14,17 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-<<<<<<<< HEAD:cmake/modules/contrib/AMX.cmake - -if(USE_AMX) - file(GLOB AMX_RUNTIME_CONFIG src/runtime/contrib/amx/amx_config.cc) - list(APPEND COMPILER_SRCS ${AMX_RUNTIME_CONFIG}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=sapphirerapids") - message(STATUS "Build with Intel AMX support...") -endif() -======== """FFI API for CUTLASS BYOC.""" import tvm._ffi tvm._ffi._init_api("contrib.cutlass", __name__) ->>>>>>>> upstream/main:python/tvm/contrib/cutlass/_ffi_api.py diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 9820a2b2f014..3458376848f1 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -63,13 +63,6 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.ptx_ldg32", Bool); // the "experimental" notation for this feature. TVM_REGISTER_PASS_CONFIG_OPTION("tir.experimental_dma_bypass_cache", Bool); -// WARNING: May cause coherency issues resulting data miscompares -// Experimental feature that, when enabled by the runtime, bypasses the cache when using DMA. When -// bypassing the cache TVM must manage cache coherency in software. Software managed cache coherency -// can be tricky e.g. it is yet to be proven out in the Hexagon runtime. Hence the warning above and -// the "experimental" notation for this feature. -TVM_REGISTER_PASS_CONFIG_OPTION("tir.experimental_dma_bypass_cache", Bool); - using tvm::Array; using tvm::transform::Pass; diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index b3e8440a84f1..93ea38169d74 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -574,11 +574,6 @@ BlockRV ConcreteScheduleNode::ReindexCacheRead(const BlockRV& block_rv, int read const String& storage_scope, const IndexMap& index_map) { StmtSRef result{nullptr}; - // Create a new array of SRefs from the consumer block list. - Array consumer_block_refs = {}; - for (BlockRV block : consumer_blocks) { - consumer_block_refs.push_back(this->GetSRef(block)); - } TVM_TIR_SCHEDULE_BEGIN(); result = tir::ReindexCacheRead(state_, this->GetSRef(block_rv), read_buffer_index, storage_scope, index_map); diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 5f3a147d3058..6eb382d8f588 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -1343,8 +1343,6 @@ def fully_connected(x): ) -<<<<<<< HEAD -======= @pytest.mark.parametrize("accel_type", ["ethos-u55-256", "ethos-u65-256"]) def test_tflite_subtract_sigmoid(accel_type): np.random.seed(0) @@ -1364,6 +1362,5 @@ def subtract_sigmoid_function(lhs, rhs): ) ->>>>>>> upstream/main if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/unittest/test_arith_detect_cse.py b/tests/python/unittest/test_arith_detect_cse.py index 5d9bfba44d68..eba0920cb2da 100755 --- a/tests/python/unittest/test_arith_detect_cse.py +++ b/tests/python/unittest/test_arith_detect_cse.py @@ -1,4 +1,3 @@ -#!/bin/bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,26 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-<<<<<<<< HEAD:tests/python/unittest/test_arith_detect_cse.py import tvm import tvm.testing from tvm.script import tir as T -======== -function show_usage() { - cat < --read-fd --write-fd -EOF -} ->>>>>>>> upstream/main:apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh -if [ "$#" -lt 5 -o "$1" == "--help" ]; then - show_usage - exit -1 -fi - -<<<<<<<< HEAD:tests/python/unittest/test_arith_detect_cse.py def test_detect_cs(): x = T.Var("x", dtype="int32") y = T.Var("y", dtype="int32") @@ -47,9 +31,3 @@ def test_detect_cs(): if __name__ == "__main__": tvm.testing.main() -======== -PYTHON_CMD=$(sed 's/#!//; q' $(which west)) - -# Run server -$PYTHON_CMD $1 $2 $3 $4 $5 ->>>>>>>> upstream/main:apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh From 0315a7c761b68063c067fb12d932d8ec33c3ef1b Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Mon, 27 Mar 2023 14:04:03 +0200 Subject: [PATCH 273/286] Fix merge --- src/tir/transforms/lower_async_dma.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index f358dc28f09f..3406c2bdff5a 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -75,12 +75,6 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer { dst_extent * src->dtype.bytes(), dma_bypass_cache_})); } - // Create member statement to track a mapping from iter var to iter range - Stmt VisitStmt_(const ForNode* op) final { - input_iters.Set(op->loop_var, Range(op->min, op->extent)); - return StmtExprMutator::VisitStmt_(op); - } - Stmt VisitStmt_(const AttrStmtNode* op) final { // populate analyzer knowledge of loop iterators auto previsit = arith::IRMutatorWithAnalyzer::VisitStmt_(op); @@ -184,4 +178,4 @@ TVM_REGISTER_GLOBAL("tir.transform.LowerAsyncDMA").set_body_typed(LowerAsyncDMA) } // namespace transform } // namespace tir -} // namespace tvm +} // namespace tvm \ No newline at end of file From 0e84cfeb44211513a4871947b75c85039b0ab5c4 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Mon, 27 Mar 2023 14:17:06 +0200 Subject: [PATCH 274/286] Fixed lint problem --- src/tir/transforms/lower_async_dma.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc index 3406c2bdff5a..d899b6ec70ab 100644 --- a/src/tir/transforms/lower_async_dma.cc +++ b/src/tir/transforms/lower_async_dma.cc @@ -178,4 +178,4 @@ TVM_REGISTER_GLOBAL("tir.transform.LowerAsyncDMA").set_body_typed(LowerAsyncDMA) } // namespace transform } // namespace tir -} // namespace tvm \ No newline at end of file +} // namespace tvm From 2f0308f91328fb8fdfd1342ff82eb9108420b865 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Tue, 28 Mar 2023 08:41:06 +0200 Subject: [PATCH 275/286] .utils does not exist! 
--- python/tvm/topi/x86/batch_matmul.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index c9b1d78ce9f2..0a0545fa947b 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -28,7 +28,8 @@ from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline from .dense import dense_amx_int8_schedule, dense_int8_schedule from .injective import schedule_injective_from_existing -from .utils import target_has_avx512, target_has_amx + +# from .utils import target_has_avx512, target_has_amx @autotvm.register_topi_compute("batch_matmul_int8.x86") From 81c82df24e4caebff9b20558877cc48b640eea0c Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Tue, 28 Mar 2023 09:34:29 +0200 Subject: [PATCH 276/286] Added docstrings to all Python files --- .../gemmini/template_project/microtvm_api_server.py | 4 ++++ python/tvm/contrib/gemmini/__init__.py | 4 ++++ python/tvm/contrib/gemmini/build_module.py | 4 ++++ python/tvm/contrib/gemmini/environment.py | 4 ++++ python/tvm/contrib/gemmini/helpers.py | 6 ++++-- python/tvm/contrib/gemmini/intrin.py | 4 ++++ python/tvm/contrib/gemmini/legalize.py | 4 ++++ python/tvm/contrib/gemmini/pattern_table.py | 4 ++++ python/tvm/contrib/gemmini/transform.py | 4 ++++ python/tvm/contrib/gemmini/utils.py | 4 ++++ python/tvm/relay/backend/contrib/gemmini/__init__.py | 4 ++++ python/tvm/relay/backend/contrib/gemmini/gemmini_add.py | 4 ++++ .../relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py | 4 ++++ python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py | 4 ++++ .../tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py | 4 ++++ .../contrib/gemmini/gemmini_depthwise_conv2d_cisc.py | 4 ++++ .../tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py | 4 ++++ python/tvm/relay/backend/contrib/gemmini/op.py | 4 ++++ 18 files changed, 72 insertions(+), 2 deletions(-) diff --git a/apps/microtvm/gemmini/template_project/microtvm_api_server.py b/apps/microtvm/gemmini/template_project/microtvm_api_server.py index 036850afe8f2..86661cb7320f 100644 --- a/apps/microtvm/gemmini/template_project/microtvm_api_server.py +++ b/apps/microtvm/gemmini/template_project/microtvm_api_server.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +MicroTVM API Server for Gemmini baremetal tests on the Spike simulator +===================== +""" import atexit import collections diff --git a/python/tvm/contrib/gemmini/__init__.py b/python/tvm/contrib/gemmini/__init__.py index 7fe67311b265..34abef4b085a 100644 --- a/python/tvm/contrib/gemmini/__init__.py +++ b/python/tvm/contrib/gemmini/__init__.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +Gemmini package is a TVM backend extension to support the Gemmini hardware accelerator +===================== +""" import tvm._ffi.base diff --git a/python/tvm/contrib/gemmini/build_module.py b/python/tvm/contrib/gemmini/build_module.py index 22c5c1620d7a..fdabfd102bca 100644 --- a/python/tvm/contrib/gemmini/build_module.py +++ b/python/tvm/contrib/gemmini/build_module.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+""" +Helpers and functions related to the build process to generate code for the Gemmini accelerator +===================== +""" import tvm from tvm import relay diff --git a/python/tvm/contrib/gemmini/environment.py b/python/tvm/contrib/gemmini/environment.py index 56863eebf71e..3e579a4f4870 100644 --- a/python/tvm/contrib/gemmini/environment.py +++ b/python/tvm/contrib/gemmini/environment.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, exec-used +""" +Environment declaration. Contains Gemminis hardware parameters. +===================== +""" from __future__ import absolute_import as _abs import re diff --git a/python/tvm/contrib/gemmini/helpers.py b/python/tvm/contrib/gemmini/helpers.py index fb28d471980a..e0f99c3373e4 100644 --- a/python/tvm/contrib/gemmini/helpers.py +++ b/python/tvm/contrib/gemmini/helpers.py @@ -14,11 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +Miscellaneous helpers +===================== +""" -import pathlib from typing import List from six.moves import range -import numpy as np from .environment import Environment diff --git a/python/tvm/contrib/gemmini/intrin.py b/python/tvm/contrib/gemmini/intrin.py index 5b20023c2169..58f53b6b3e0a 100644 --- a/python/tvm/contrib/gemmini/intrin.py +++ b/python/tvm/contrib/gemmini/intrin.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +Gemmini related intrinsics +===================== +""" from __future__ import absolute_import as _abs diff --git a/python/tvm/contrib/gemmini/legalize.py b/python/tvm/contrib/gemmini/legalize.py index cbef09c14db6..c9a72eadbc07 100644 --- a/python/tvm/contrib/gemmini/legalize.py +++ b/python/tvm/contrib/gemmini/legalize.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +A set of passes to legalize the Gemmini operators +===================== +""" from typing import Tuple import tvm # type: ignore diff --git a/python/tvm/contrib/gemmini/pattern_table.py b/python/tvm/contrib/gemmini/pattern_table.py index 96e3ba38523c..8240640ac4e5 100644 --- a/python/tvm/contrib/gemmini/pattern_table.py +++ b/python/tvm/contrib/gemmini/pattern_table.py @@ -14,6 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +Pattern table declaring the supported Gemmini operators +===================== +""" from typing import Callable, List, Tuple diff --git a/python/tvm/contrib/gemmini/transform.py b/python/tvm/contrib/gemmini/transform.py index 456576ffb260..bd377771ce32 100644 --- a/python/tvm/contrib/gemmini/transform.py +++ b/python/tvm/contrib/gemmini/transform.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name +""" +Transformation passes for Gemmini +===================== +""" import ast from typing import Dict diff --git a/python/tvm/contrib/gemmini/utils.py b/python/tvm/contrib/gemmini/utils.py index 07a50f74877d..22428ee2d3c7 100644 --- a/python/tvm/contrib/gemmini/utils.py +++ b/python/tvm/contrib/gemmini/utils.py @@ -14,6 +14,10 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. +""" +Useful enumerations and others +===================== +""" from enum import Enum diff --git a/python/tvm/relay/backend/contrib/gemmini/__init__.py b/python/tvm/relay/backend/contrib/gemmini/__init__.py index a1c4a510cd0c..6cb685ffe3d1 100644 --- a/python/tvm/relay/backend/contrib/gemmini/__init__.py +++ b/python/tvm/relay/backend/contrib/gemmini/__init__.py @@ -14,5 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +Gemmini operators compute and schedule declarations +===================== +""" from . import op diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py index 6c22441f2182..90a8eb72088b 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_add.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument +""" +Add operator declaration and schedule registration for Gemmini +===================== +""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py index 30e3775491cf..44d10ca89306 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_conv2d_cisc.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument +""" +Conv2d operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py index 22191771ce43..e9da2903bc87 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's intrinsic instructions +===================== +""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py index 7143e1999cb0..872b017d1f4b 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_dense_cisc.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument +""" +Dense (GEMM) operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py index 361b5e2daca7..1fc35df9e182 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_depthwise_conv2d_cisc.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=unused-argument +""" +Depthwise conv2d operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" import numpy as np import tvm diff --git a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py index 1db42b392811..0cc7bde80812 100644 --- a/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py +++ b/python/tvm/relay/backend/contrib/gemmini/gemmini_max_pool2d.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument +""" +MaxPool2D operator declaration and schedule registration for Gemmini's CISC instructions +===================== +""" import tvm from tvm import te diff --git a/python/tvm/relay/backend/contrib/gemmini/op.py b/python/tvm/relay/backend/contrib/gemmini/op.py index 32869cbb2acf..990ae11f9808 100644 --- a/python/tvm/relay/backend/contrib/gemmini/op.py +++ b/python/tvm/relay/backend/contrib/gemmini/op.py @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-argument, ungrouped-imports +""" +Namespace for the supported Relay operators on Gemmini +===================== +""" from __future__ import absolute_import as _abs From 0cecb86672a40accce5d60c0d703a41b7bb73512 Mon Sep 17 00:00:00 2001 From: Federico Peccia Date: Tue, 28 Mar 2023 10:05:46 +0200 Subject: [PATCH 277/286] Lint fixes --- apps/microtvm/gemmini/template_project/src/add_example/add.c | 2 +- .../gemmini/template_project/src/conv2d_example/conv2d.c | 2 +- .../microtvm/gemmini/template_project/src/dense_example/dense.c | 2 +- .../gemmini/template_project/src/dwconv2d_example/dwconv2d.c | 2 +- .../gemmini/template_project/src/maxpool2d_example/maxpool2d.c | 2 +- .../gemmini/template_project/src/mobilenet_example/mobilenet.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/microtvm/gemmini/template_project/src/add_example/add.c b/apps/microtvm/gemmini/template_project/src/add_example/add.c index 960c6e0d4ab7..f0ca93422efe 100644 --- a/apps/microtvm/gemmini/template_project/src/add_example/add.c +++ b/apps/microtvm/gemmini/template_project/src/add_example/add.c @@ -27,8 +27,8 @@ #endif #include "input_1.h" #include "input_2.h" -#include "output.h" #include "model/tvmgen_default.h" +#include "output.h" int8_t output_add[OUTPUT_LEN]; diff --git a/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c index 5ec633081b5e..6b91db406eaf 100644 --- a/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c +++ b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c @@ -26,8 +26,8 @@ #include "sys/mman.h" #endif #include "input.h" -#include "output.h" #include "model/tvmgen_default.h" +#include "output.h" int8_t output_conv[OUTPUT_LEN]; diff --git a/apps/microtvm/gemmini/template_project/src/dense_example/dense.c b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c index cb6cd27c7e51..64ca5b821d22 100644 --- a/apps/microtvm/gemmini/template_project/src/dense_example/dense.c +++ b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c @@ -26,8 +26,8 @@ #include "sys/mman.h" #endif #include "input.h" -#include "output.h" #include "model/tvmgen_default.h" +#include "output.h" int8_t output_gemm[OUTPUT_LEN]; diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c 
From 0cecb86672a40accce5d60c0d703a41b7bb73512 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Tue, 28 Mar 2023 10:05:46 +0200
Subject: [PATCH 277/286] Lint fixes

---
 apps/microtvm/gemmini/template_project/src/add_example/add.c | 2 +-
 .../gemmini/template_project/src/conv2d_example/conv2d.c | 2 +-
 .../microtvm/gemmini/template_project/src/dense_example/dense.c | 2 +-
 .../gemmini/template_project/src/dwconv2d_example/dwconv2d.c | 2 +-
 .../gemmini/template_project/src/maxpool2d_example/maxpool2d.c | 2 +-
 .../gemmini/template_project/src/mobilenet_example/mobilenet.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/apps/microtvm/gemmini/template_project/src/add_example/add.c b/apps/microtvm/gemmini/template_project/src/add_example/add.c
index 960c6e0d4ab7..f0ca93422efe 100644
--- a/apps/microtvm/gemmini/template_project/src/add_example/add.c
+++ b/apps/microtvm/gemmini/template_project/src/add_example/add.c
@@ -27,8 +27,8 @@
 #endif
 #include "input_1.h"
 #include "input_2.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 int8_t output_add[OUTPUT_LEN];
diff --git a/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c
index 5ec633081b5e..6b91db406eaf 100644
--- a/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c
+++ b/apps/microtvm/gemmini/template_project/src/conv2d_example/conv2d.c
@@ -26,8 +26,8 @@
 #include "sys/mman.h"
 #endif
 #include "input.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 int8_t output_conv[OUTPUT_LEN];
diff --git a/apps/microtvm/gemmini/template_project/src/dense_example/dense.c b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c
index cb6cd27c7e51..64ca5b821d22 100644
--- a/apps/microtvm/gemmini/template_project/src/dense_example/dense.c
+++ b/apps/microtvm/gemmini/template_project/src/dense_example/dense.c
@@ -26,8 +26,8 @@
 #include "sys/mman.h"
 #endif
 #include "input.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 int8_t output_gemm[OUTPUT_LEN];
diff --git a/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c
index 772c343f5c1f..b352512e08a1 100644
--- a/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c
+++ b/apps/microtvm/gemmini/template_project/src/dwconv2d_example/dwconv2d.c
@@ -26,8 +26,8 @@
 #include "sys/mman.h"
 #endif
 #include "input.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 int8_t output_conv[OUTPUT_LEN];
diff --git a/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c
index 0843e67c628f..a81bc7d3c612 100644
--- a/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c
+++ b/apps/microtvm/gemmini/template_project/src/maxpool2d_example/maxpool2d.c
@@ -26,8 +26,8 @@
 #include "sys/mman.h"
 #endif
 #include "input.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 int8_t output_maxpool2d[OUTPUT_LEN];
diff --git a/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c
index a42fe6d32d05..70bd145da2a1 100644
--- a/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c
+++ b/apps/microtvm/gemmini/template_project/src/mobilenet_example/mobilenet.c
@@ -26,8 +26,8 @@
 #include "sys/mman.h"
 #endif
 #include "input.h"
-#include "output.h"
 #include "model/tvmgen_default.h"
+#include "output.h"
 
 uint8_t output_pred[1001];

From d53013a3d95afe61b45ddccb65598329f9489047 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Wed, 29 Mar 2023 14:38:04 +0200
Subject: [PATCH 278/286] Fixed merge error

---
 web/src/webgpu.ts | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index ef382bdebef3..eaba56f4eeb6 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -102,9 +102,6 @@ export class WebGPUContext {
       entries: layoutEntries
     });
-    const textDecoder = new TextDecoder('utf-8')
-    const codeString = textDecoder.decode(data.buffer)
-
     const pipeline = this.device.createComputePipeline({
       layout: this.device.createPipelineLayout({
         bindGroupLayouts: [ bindGroupLayout ]
       });
@@ -344,4 +341,4 @@
     return idx;
   }
 }
-}
+}
\ No newline at end of file

From 17369a4588758a23b77196c215749abf1450da07 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Wed, 29 Mar 2023 14:47:29 +0200
Subject: [PATCH 279/286] Small lint fix

---
 web/src/webgpu.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index eaba56f4eeb6..faf6fac990c8 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -341,4 +341,4 @@
     return idx;
   }
 }
-}
\ No newline at end of file
+}

From 777816d04d825c48130f64bbf6a464f87619f4ac Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Thu, 30 Mar 2023 08:27:20 +0200
Subject: [PATCH 280/286] Test fix

---
 tests/python/unittest/test_tvmscript_roundtrip.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index ff90edd65eb6..dd028201c48f 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3762,7 +3762,7 @@ def func(
 
 def test_roundtrip(ir_generator):
     original = ir_generator()
-    after_roundtrip = tvm.script.from_source(original.script())
+    after_roundtrip = tvm.script.from_source(original.script(show_meta=True))
     tvm.ir.assert_structural_equal(original, after_roundtrip, True)
 
 
@@ -3773,4 +3773,4 @@ def test_return_none_no_trailing_type():
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
+    tvm.testing.main()
\ No newline at end of file
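The show_meta=True change above makes the printed TVMScript carry its metadata section, so the re-parsed module can stay structurally equal to the original. The following is a self-contained sketch of the round trip the fixed test performs; the toy prim_func is invented for this note, while script(show_meta=True), tvm.script.from_source, and assert_structural_equal are the calls the patch itself touches.

    import tvm
    from tvm.script import tir as T

    @T.prim_func
    def add_one(a: T.handle) -> None:
        # Trivial TIR function, just to have something to print and re-parse.
        A = T.match_buffer(a, (8,), "float32")
        for i in T.serial(8):
            A[i] = A[i] + T.float32(1)

    # Print with metadata, re-parse, and check structural equality,
    # mirroring test_roundtrip after the fix.
    roundtripped = tvm.script.from_source(add_one.script(show_meta=True))
    tvm.ir.assert_structural_equal(add_one, roundtripped, True)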
From cb3fdaa0d1a49db43d59e1679f3971fc260fb022 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Thu, 30 Mar 2023 09:06:43 +0200
Subject: [PATCH 281/286] Lint fix

---
 tests/python/unittest/test_tvmscript_roundtrip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index dd028201c48f..cd7f1726c9d9 100644
--- a/tests/python/unittest/test_tvmscript_roundtrip.py
+++ b/tests/python/unittest/test_tvmscript_roundtrip.py
@@ -3773,4 +3773,4 @@ def test_return_none_no_trailing_type():
 
 
 if __name__ == "__main__":
-    tvm.testing.main()
\ No newline at end of file
+    tvm.testing.main()

From 0ecb826fa80fcefda054e1e00413588add348737 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Mon, 3 Apr 2023 13:22:57 +0200
Subject: [PATCH 282/286] Changed URL in tutorial to the standard one used in other tutorials

---
 gallery/tutorial/micro_gemmini_mobilenet.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py
index 28e80c4fb563..e37d25980124 100644
--- a/gallery/tutorial/micro_gemmini_mobilenet.py
+++ b/gallery/tutorial/micro_gemmini_mobilenet.py
@@ -100,8 +100,7 @@ def run_tflite_model(tflite_model_buf, input_data):
 def download_model():
     model_url = (
-        "https://storage.googleapis.com/download.tensorflow.org/models/"
-        "tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
+        "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
     )
 
     # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite
     model_path = download_testdata(

From d498f8842896afd8c18712309c5e9e3bfc3cbdbe Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Mon, 3 Apr 2023 13:41:04 +0200
Subject: [PATCH 283/286] Lint fix

---
 gallery/tutorial/micro_gemmini_mobilenet.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gallery/tutorial/micro_gemmini_mobilenet.py b/gallery/tutorial/micro_gemmini_mobilenet.py
index e37d25980124..b0df1573ffef 100644
--- a/gallery/tutorial/micro_gemmini_mobilenet.py
+++ b/gallery/tutorial/micro_gemmini_mobilenet.py
@@ -99,9 +99,7 @@ def run_tflite_model(tflite_model_buf, input_data):
 
 
 def download_model():
-    model_url = (
-        "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
-    )
+    model_url = "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz"
 
     # Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite
     model_path = download_testdata(
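Both URL patches above edit the tutorial's download_model helper, which follows TVM's usual download_testdata pattern. A rough sketch of what the surrounding helper plausibly looks like follows; the module argument, the extraction directory, and the returned file name are assumptions, since the rest of the tutorial is not shown in this series.

    import os
    import tarfile

    from tvm.contrib.download import download_testdata

    def download_model():
        model_url = "http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz"

        # Download the model tar file, cache it, and extract the .tflite model.
        model_path = download_testdata(model_url, "mobilenet_v2_1.0_224.tgz", module="model")
        model_dir = os.path.dirname(model_path)
        with tarfile.open(model_path) as tar:
            tar.extractall(model_dir)
        return os.path.join(model_dir, "mobilenet_v2_1.0_224.tflite")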
From aff88a1be0ac409a566a1069155ed551777b6a3d Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Wed, 5 Apr 2023 08:19:40 +0200
Subject: [PATCH 284/286] Merge fix

---
 .../launch_microtvm_api_server.sh | 23 +------------------
 gallery/tutorial/micro_gemmini_dwconv2d.py | 1 -
 2 files changed, 1 insertion(+), 23 deletions(-)

diff --git a/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh b/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
index 5d9bfba44d68..4243015cc590 100755
--- a/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
+++ b/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
@@ -15,11 +15,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-<<<<<<<< HEAD:tests/python/unittest/test_arith_detect_cse.py
-import tvm
-import tvm.testing
-from tvm.script import tir as T
-========
 function show_usage() {
     cat <<EOF
 Usage: launch_microtvm_api_server.sh <microtvm_api_server.py> --read-fd <fd> --write-fd <fd>
 EOF
 }
->>>>>>>> upstream/main:apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
 
 if [ "$#" -lt 5 -o "$1" == "--help" ]; then
     show_usage
     exit -1
 fi
 
-<<<<<<<< HEAD:tests/python/unittest/test_arith_detect_cse.py
-def test_detect_cs():
-    x = T.Var("x", dtype="int32")
-    y = T.Var("y", dtype="int32")
-    z = T.Var("z", dtype="int32")
-    c = T.floor(x + y + 0.5) + x + z * (T.floor(x + y + 0.5))
-    m = tvm.arith.detect_common_subexpr(c, 2)
-    assert c.a.a in m
-    assert m[c.a.a] == 2
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
-========
 PYTHON_CMD=$(sed 's/#!//; q' $(which west))
 
 # Run server
-$PYTHON_CMD $1 $2 $3 $4 $5
->>>>>>>> upstream/main:apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
+$PYTHON_CMD $1 $2 $3 $4 $5
\ No newline at end of file
diff --git a/gallery/tutorial/micro_gemmini_dwconv2d.py b/gallery/tutorial/micro_gemmini_dwconv2d.py
index fb57aacf862f..ebdf8c1be22c 100644
--- a/gallery/tutorial/micro_gemmini_dwconv2d.py
+++ b/gallery/tutorial/micro_gemmini_dwconv2d.py
@@ -24,7 +24,6 @@
 """
 
 import itertools
-from pyrsistent import v
 import tensorflow as tf
 from tensorflow import keras
 import tarfile

From fb6330f35a575ae4f7aff8bd2587dd535058d0b6 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Wed, 5 Apr 2023 08:40:03 +0200
Subject: [PATCH 285/286] Lint fix

---
 .../zephyr/template_project/launch_microtvm_api_server.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh b/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
index 4243015cc590..1531c453dc27 100755
--- a/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
+++ b/apps/microtvm/zephyr/template_project/launch_microtvm_api_server.sh
@@ -31,4 +31,4 @@ fi
 PYTHON_CMD=$(sed 's/#!//; q' $(which west))
 
 # Run server
-$PYTHON_CMD $1 $2 $3 $4 $5
\ No newline at end of file
+$PYTHON_CMD $1 $2 $3 $4 $5
From 7236a8bb30ee3c83f7053a7c991c1a8b536aa388 Mon Sep 17 00:00:00 2001
From: Federico Peccia
Date: Mon, 17 Apr 2023 13:03:11 +0200
Subject: [PATCH 286/286] Fix test

---
 tests/python/unittest/test_micro_ms_tuning.py | 32 ++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_micro_ms_tuning.py b/tests/python/unittest/test_micro_ms_tuning.py
index edb27396e324..58ffa7845470 100644
--- a/tests/python/unittest/test_micro_ms_tuning.py
+++ b/tests/python/unittest/test_micro_ms_tuning.py
@@ -27,9 +27,39 @@
 from tvm import meta_schedule as ms
 
 
+def create_relay_module():
+    data_shape = (1, 3, 16, 16)
+    weight_shape = (8, 3, 5, 5)
+    data = relay.var("data", relay.TensorType(data_shape, "float32"))
+    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
+    y = relay.nn.conv2d(
+        data,
+        weight,
+        padding=(2, 2),
+        kernel_size=(5, 5),
+        kernel_layout="OIHW",
+        out_dtype="float32",
+    )
+    f = relay.Function([data, weight], y)
+    mod = tvm.IRModule.from_expr(f)
+    mod = relay.transform.InferType()(mod)
+
+    weight_sample = np.random.rand(
+        weight_shape[0], weight_shape[1], weight_shape[2], weight_shape[3]
+    ).astype("float32")
+    params = {mod["main"].params[1].name_hint: weight_sample}
+
+    model_info = {
+        "in_tensor": "data",
+        "in_shape": data_shape,
+        "in_dtype": "float32",
+    }
+
+    return mod, params, model_info
+
+
 @tvm.testing.requires_micro
 def test_micro_tuning_with_meta_schedule():
-    from tests.micro.zephyr.test_ms_tuning import create_relay_module
     from tvm.contrib.micro.meta_schedule.local_builder_micro import get_local_builder_micro
     from tvm.contrib.micro.meta_schedule.rpc_runner_micro import get_rpc_runner_micro
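The create_relay_module helper inlined above returns a Relay module, a random weight dictionary, and input metadata. As a hedged sketch of how such a triple is typically consumed, the following compiles it with a plain llvm target and the graph executor rather than the microTVM meta-schedule flow the test itself drives; it assumes create_relay_module is importable in scope.

    import numpy as np
    import tvm
    from tvm import relay
    from tvm.contrib import graph_executor

    mod, params, model_info = create_relay_module()

    # Compile for a generic CPU target and run once with random input.
    lib = relay.build(mod, target="llvm", params=params)
    dev = tvm.cpu(0)
    runtime = graph_executor.GraphModule(lib["default"](dev))
    data = np.random.rand(*model_info["in_shape"]).astype(model_info["in_dtype"])
    runtime.set_input(model_info["in_tensor"], data)
    runtime.run()
    # 16x16 input, 5x5 kernel, padding 2, 8 output channels -> (1, 8, 16, 16).
    out = runtime.get_output(0).numpy()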