diff --git a/.asf.yaml b/.asf.yaml index 34e813f39639..7fd3f6930fb1 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -32,3 +32,17 @@ github: - vulkan - spirv - machine-learning + + # Triage perm for collaborators(test run) + # + # The perm is given based on needs and not based on + # evaluation of past contributions. The rationale + # is that people may need the permission to start + # contributing in this way. It serves to diversify + # the ways to contribute. + # + # There is a limited number of slots. To enable broad + # participation, permission is given on a three month + # cycle. PMC may review and recycle slots when necessary. + collaborators: + - denise-k diff --git a/CMakeLists.txt b/CMakeLists.txt index c40b0c878905..24f0653b3a78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,7 @@ if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache message(STATUS "Found the path to ccache, enabling ccache") set(PATH_TO_CCACHE ccache) else() - message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. USE_CCACHE=" "${USE_CCACHE") + message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. USE_CCACHE=" "${USE_CCACHE}") endif(CCACHE_FOUND) else() # /path/to/ccache set(PATH_TO_CCACHE USE_CCACHE) diff --git a/src/runtime/hexagon/launcher/README.md b/apps/hexagon_launcher/README.md similarity index 76% rename from src/runtime/hexagon/launcher/README.md rename to apps/hexagon_launcher/README.md index a8a570918514..85e6897b74a3 100644 --- a/src/runtime/hexagon/launcher/README.md +++ b/apps/hexagon_launcher/README.md @@ -19,9 +19,7 @@ ## Compilation The launcher consists of two parts: part running on Hexagon, and part running -on Android. They need to be compiled separately. Since some source files are -shared between these two parts, make sure to delete all object files between -compilations. Compile the Hexagon code first. +on Android. Each component must be compiled separately. 
The supported Snapdragon architectures are 855, 865, and 888. @@ -33,7 +31,46 @@ The supported Snapdragon architectures are 855, 865, and 888. Android NDK can be downloaded from https://developer.android.com/ndk. Hexagon SDK is available at //developer.qualcomm.com/software/hexagon-dsp-sdk. -### Compilation of the Hexagon part +### Compilation with TVM + +Building the Hexagon launcher application as a component of the main TVM build +used for Hexagon codegen can be achieved by setting `USE_HEXAGON_LAUNCHER=ON`. +This option will compile core tvm, the android launcher binary and its corresponding +tvm_runtime, as well as the Hexagon launcher shared library and its corresponding +tvm_runtime. As described in the [Manual compilation](#Manual compilation) section +each component requires Hexagon and android dependencies. When building the launcher +along with TVM these configurations must be provided when invoking cmake. A minimal +example invocation for compiling TVM along with the Hexagon launcher is included below, + +``` +cmake -DCMAKE_MAKE_PROGRAM=make \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS='-stdlib=libc++' \ + -DCMAKE_CXX_STANDARD=14 \ + -DUSE_LLVM=/path/to/hexagon/llvm/bin/llvm-config \ + -DUSE_HEXAGON_LAUNCHER=ON \ + -DUSE_ANDROID_TOOLCHAIN=/path/to/android-ndk/build/cmake/android.toolchain.cmake \ + -DANDROID_PLATFORM=android-28 \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_HEXAGON_ARCH=v68 \ + -DUSE_HEXAGON_SDK=/path/to/hexagon/SDK \ + -DUSE_HEXAGON_TOOLCHAIN=/path/to/hexagon/Toolchain/ .. +``` + +The Hexagon launcher application is an android binary and thus requires the use +of an android toolchain for compilation. Similarly, the Hexagon tvm runtime +requires the use of the Hexagon toolchain and depends on the Hexagon SDK. The +resulting hexagon launcher binaries can be found in the `launcher` subdirectory +of the cmake build directory. 
+ +### Manual compilation + +Since some source files are shared between the Hexagon and android builds, +make sure to delete all object files between compilations. Compile the Hexagon +code first. + +#### Compilation of the Hexagon part 1. Build the static version of TVM runtime for Hexagon. Use Hexagon clang from the Hexagon SDK. This step is the same as building the shared version, @@ -55,7 +92,7 @@ Hexagon SDK is available at //developer.qualcomm.com/software/hexagon-dsp-sdk. 3. Run `make`. This will create `liblauncher_rpc_skel.so`. -### Compilation of the Android part +#### Compilation of the Android part 1. Build TVM runtime for Android, using clang for AArch64 from the Android NDK. Unlike in the Hexagon case, this should be the dynamic library (which diff --git a/apps/hexagon_launcher/cmake/HexagonLauncher.cmake b/apps/hexagon_launcher/cmake/HexagonLauncher.cmake new file mode 100644 index 000000000000..4a7f803ce1ab --- /dev/null +++ b/apps/hexagon_launcher/cmake/HexagonLauncher.cmake @@ -0,0 +1,61 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +if(NOT "${FASTRPC_LIBS}" STREQUAL "SKEL" AND + NOT "${FASTRPC_LIBS}" STREQUAL "STUB") + message(SEND_ERROR "Please set FASTRPC_LIBS to either SKEL or STUB") +endif() + +if(NOT DEFINED USE_HEXAGON_SDK) + message(SEND_ERROR "Please set USE_HEXAGON_SDK to the location of Hexagon SDK") +endif() +if (NOT DEFINED USE_HEXAGON_ARCH) + message(SEND_ERROR "Please set USE_HEXAGON_ARCH to the Hexagon architecture version") +endif() + +set(TVM_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../") + +include(ExternalProject) +include("${TVM_SOURCE_DIR}/cmake/modules/HexagonSDK.cmake") + +find_hexagon_sdk_root("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}") + +include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_REMOTE_ROOT}) + +set(QAIC_EXE "${HEXAGON_QAIC_EXE}") +foreach(INCDIR IN LISTS HEXAGON_SDK_INCLUDES HEXAGON_REMOTE_ROOT) + list(APPEND QAIC_FLAGS "-I${INCDIR}") +endforeach() + +set(LAUNCHER_SRC "${CMAKE_CURRENT_SOURCE_DIR}/../../") +set(CMAKE_SKIP_RPATH TRUE) + +# Qaic for the domain header. +# +# Don't add paths to these filenames, or otherwise cmake may spontaneously +# add -o option to the qaic invocation (with an undesirable path). +set(LAUNCHER_RPC_IDL "launcher_rpc.idl") +set(LAUNCHER_RPC_H "launcher_rpc.h") +set(LAUNCHER_RPC_SKEL_C "launcher_rpc_skel.c") +set(LAUNCHER_RPC_STUB_C "launcher_rpc_stub.c") + +include_directories( + "${LAUNCHER_SRC}" + "${TVM_SOURCE_DIR}/include" + "${TVM_SOURCE_DIR}/3rdparty/dlpack/include" + "${TVM_SOURCE_DIR}/3rdparty/dmlc-core/include" +) diff --git a/apps/hexagon_launcher/cmake/android/CMakeLists.txt b/apps/hexagon_launcher/cmake/android/CMakeLists.txt new file mode 100644 index 000000000000..c000b0e97cad --- /dev/null +++ b/apps/hexagon_launcher/cmake/android/CMakeLists.txt @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.2) +project(HexagonAndroidLauncher C CXX) + +include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake") + +add_custom_command( + OUTPUT ${LAUNCHER_RPC_STUB_C} + COMMAND ${QAIC_EXE} ${QAIC_FLAGS} + "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" + COMMAND ${CMAKE_COMMAND} -E rename "${LAUNCHER_RPC_H}" + "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" + MAIN_DEPENDENCY "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" +) + +include_directories(SYSTEM + "${HEXAGON_SDK_INCLUDES}" + "${HEXAGON_RPCMEM_ROOT}/inc" +) + +link_directories(${HEXAGON_REMOTE_ROOT}) + +add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) + +set(STUB_SRCS + "${LAUNCHER_SRC}/launcher_android.cc" + "${LAUNCHER_SRC}/launcher_core.cc" + "${LAUNCHER_SRC}/launcher_main.cc" + "${LAUNCHER_SRC}/launcher_util.cc" +) + +add_executable(launcher_android + "${STUB_SRCS}" + "${LAUNCHER_RPC_STUB_C}" +) + +ExternalProject_Add(android_tvm_runtime + SOURCE_DIR "${TVM_SOURCE_DIR}" + BUILD_COMMAND $(MAKE) runtime + CMAKE_ARGS + "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" + "-DANDROID_PLATFORM=${ANDROID_PLATFORM}" + "-DANDROID_ABI=${ANDROID_ABI}" + "-DCMAKE_CXX_STANDARD=14" + "-DUSE_LIBBACKTRACE=OFF" + "-DUSE_LLVM=OFF" + "-DUSE_RPC=OFF" + INSTALL_COMMAND "" + BUILD_ALWAYS ON +) +ExternalProject_Get_Property(android_tvm_runtime BINARY_DIR) +ExternalProject_Add_Step(android_tvm_runtime copy_binaries + COMMAND ${CMAKE_COMMAND} -E copy_directory 
${BINARY_DIR} ${CMAKE_INSTALL_PREFIX} + DEPENDEES install +) + +add_dependencies(launcher_android android_tvm_runtime) +add_library(tvm_runtime SHARED IMPORTED) +set_target_properties(tvm_runtime PROPERTIES IMPORTED_LOCATION "${BINARY_DIR}/libtvm_runtime.so") + +target_link_libraries(launcher_android cdsprpc log tvm_runtime) diff --git a/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt new file mode 100644 index 000000000000..c76fcccc5a1a --- /dev/null +++ b/apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +cmake_minimum_required(VERSION 3.2) +project(HexagonLauncherRPCSkel C CXX) + +include("${CMAKE_CURRENT_SOURCE_DIR}/../HexagonLauncher.cmake") + +add_custom_command( + OUTPUT ${LAUNCHER_RPC_SKEL_C} ${LAUNCHER_RPC_H} + COMMAND ${QAIC_EXE} ${QAIC_FLAGS} + "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" + MAIN_DEPENDENCY "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" +) + +include_directories(SYSTEM ${HEXAGON_QURT_INCLUDES}) + +link_directories(${HEXAGON_QURT_LIBS}) + +add_definitions(-D_MACH_I32=int) +add_definitions(-DDMLC_CXX11_THREAD_LOCAL=0) +add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) + +# Extra compile flags (both C and C++). +set(EXTRA_COMP_FLAGS + "-O3" + "-m${USE_HEXAGON_ARCH}" +) +string(REGEX REPLACE ";" " " EXTRA_COMP_FLAGS_STR "${EXTRA_COMP_FLAGS}") +set(CMAKE_C_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_CXX_FLAGS}") + +set(SKEL_SRCS + "${LAUNCHER_SRC}/launcher_core.cc" + "${LAUNCHER_SRC}/launcher_hexagon.cc" +) +add_library(launcher_rpc_skel SHARED + "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" + "${LAUNCHER_RPC_SKEL_C}" + "${SKEL_SRCS}" +) + +ExternalProject_Add(static_hexagon_tvm_runtime + SOURCE_DIR "${TVM_SOURCE_DIR}" + BUILD_COMMAND $(MAKE) runtime + CMAKE_ARGS + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DUSE_HEXAGON_ARCH=${USE_HEXAGON_ARCH}" + "-DCMAKE_CXX_STANDARD=14" + "-DUSE_LIBBACKTRACE=OFF" + "-DUSE_LLVM=OFF" + "-DUSE_RPC=OFF" + "-DBUILD_STATIC_RUNTIME=ON" + "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}" + INSTALL_COMMAND "" + BUILD_ALWAYS ON +) +ExternalProject_Get_Property(static_hexagon_tvm_runtime BINARY_DIR) +ExternalProject_Add_Step(static_hexagon_tvm_runtime copy_binaries + COMMAND ${CMAKE_COMMAND} -E copy_directory ${BINARY_DIR} ${CMAKE_INSTALL_PREFIX} + DEPENDEES install +) + +add_dependencies(launcher_rpc_skel static_hexagon_tvm_runtime) +add_library(static_tvm_runtime STATIC IMPORTED) +set_target_properties(static_tvm_runtime PROPERTIES IMPORTED_LOCATION 
"${BINARY_DIR}/libtvm_runtime.a") + +target_link_libraries(launcher_rpc_skel -Wl,--whole-archive static_tvm_runtime -Wl,--no-whole-archive) + diff --git a/src/runtime/hexagon/launcher/launcher_android.cc b/apps/hexagon_launcher/launcher_android.cc similarity index 100% rename from src/runtime/hexagon/launcher/launcher_android.cc rename to apps/hexagon_launcher/launcher_android.cc diff --git a/src/runtime/hexagon/launcher/launcher_core.cc b/apps/hexagon_launcher/launcher_core.cc similarity index 100% rename from src/runtime/hexagon/launcher/launcher_core.cc rename to apps/hexagon_launcher/launcher_core.cc diff --git a/src/runtime/hexagon/launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h similarity index 100% rename from src/runtime/hexagon/launcher/launcher_core.h rename to apps/hexagon_launcher/launcher_core.h diff --git a/src/runtime/hexagon/launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc similarity index 100% rename from src/runtime/hexagon/launcher/launcher_hexagon.cc rename to apps/hexagon_launcher/launcher_hexagon.cc diff --git a/src/runtime/hexagon/launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc similarity index 100% rename from src/runtime/hexagon/launcher/launcher_main.cc rename to apps/hexagon_launcher/launcher_main.cc diff --git a/src/runtime/hexagon/launcher/launcher_rpc.idl b/apps/hexagon_launcher/launcher_rpc.idl similarity index 100% rename from src/runtime/hexagon/launcher/launcher_rpc.idl rename to apps/hexagon_launcher/launcher_rpc.idl diff --git a/src/runtime/hexagon/launcher/launcher_util.cc b/apps/hexagon_launcher/launcher_util.cc similarity index 100% rename from src/runtime/hexagon/launcher/launcher_util.cc rename to apps/hexagon_launcher/launcher_util.cc diff --git a/src/runtime/hexagon/launcher/launcher_util.h b/apps/hexagon_launcher/launcher_util.h similarity index 100% rename from src/runtime/hexagon/launcher/launcher_util.h rename to apps/hexagon_launcher/launcher_util.h diff 
--git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index 3a5fd18cede7..42b90c661704 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -388,13 +388,15 @@ def test_command(args): microtvm_test_config["microtvm_board"] = args.microtvm_board providers = args.provider - provider_passed = {p: False for p in providers} release_test_dir = os.path.join(THIS_DIR, f"release-test-{args.platform}") - if args.skip_build: - assert len(providers) == 1, "--skip-build was given, but >1 provider specified" + if args.skip_build or args.skip_destroy: + assert ( + len(providers) == 1 + ), "--skip-build and/or --skip-destroy was given, but >1 provider specified" + test_failed = False for provider_name in providers: try: if not args.skip_build: @@ -408,18 +410,27 @@ def test_command(args): microtvm_test_config, args.test_device_serial, ) - provider_passed[provider_name] = True + + except subprocess.CalledProcessError: + test_failed = True + sys.exit( + f"\n\nERROR: Provider '{provider_name}' failed the release test. " + "You can re-run it to reproduce the issue without building everything " + "again by passing the --skip-build and specifying only the provider that failed. " + "The VM is still running in case you want to connect it via SSH to " + "investigate further the issue, thus it's necessary to destroy it manually " + "to release the resources back to the host, like a USB device attached to the VM." + ) finally: - if not args.skip_build and len(providers) > 1: + # if we reached out here do_run_release_test() succeeded, hence we can + # destroy the VM and release the resources back to the host if user haven't + # requested to not destroy it. 
+ if not (args.skip_destroy or test_failed): subprocess.check_call(["vagrant", "destroy", "-f"], cwd=release_test_dir) shutil.rmtree(release_test_dir) - if not all(provider_passed[p] for p in provider_passed.keys()): - sys.exit( - "some providers failed release test: " - + ",".join(name for name, passed in provider_passed if not passed) - ) + print(f'\n\nThe release tests passed on all specified providers: {", ".join(providers)}.') def release_command(args): @@ -493,9 +504,20 @@ def parse_args(): "--skip-build", action="store_true", help=( - "If given, assume a box has already been built in " - "the release-test subdirectory. Attach a USB device to this box and execute the " - "release test script--do not delete it." + "If given, assume a box has already been built in the release-test subdirectory, " + "so use that box to execute the release test script. If the tests fail the VM used " + "for testing will be left running for further investigation and will need to be " + "destroyed manually. If all tests pass on all specified providers no VM is left running, " + "unless --skip-destroy is given too." + ), + ) + parser_test.add_argument( + "--skip-destroy", + action="store_true", + help=( + "Skip destroying the test VM even if all tests pass. Can only be used if a single " + "provider is specified. Default is to destroy the VM if all tests pass (and always " + "skip destroying it if a test fails)." ), ) parser_test.add_argument( diff --git a/cmake/config.cmake b/cmake/config.cmake index ade9d5c815c1..1fce11f90aed 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -279,6 +279,9 @@ set(USE_FALLBACK_STL_MAP OFF) set(USE_HEXAGON_DEVICE OFF) set(USE_HEXAGON_SDK /path/to/sdk) +# Whether to build the hexagon launcher +set(USE_HEXAGON_LAUNCHER OFF) + # Hexagon architecture to target when compiling TVM itself (not the target for # compiling _by_ TVM). 
This applies to components like the TVM runtime, but is # also used to select correct include/library paths from the Hexagon SDK when diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index eb3ad1f5ae4a..1491a4558611 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -53,23 +53,86 @@ if(BUILD_FOR_HEXAGON) include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_QURT_INCLUDES}) endif() -if(USE_HEXAGON_DEVICE STREQUAL "OFF") - list(APPEND COMPILER_SRCS src/target/opt/build_hexagon_off.cc) - return() -elseif(NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}" AND - NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") - set(ERROR_MSG +if(USE_HEXAGON_LAUNCHER STREQUAL "ON") + set(USE_HEXAGON_DEVICE "${PICK_SIM}") +else() + if(USE_HEXAGON_DEVICE STREQUAL "OFF") + list(APPEND COMPILER_SRCS src/target/opt/build_hexagon_off.cc) + return() + elseif(NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}" AND + NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") + set(ERROR_MSG "USE_HEXAGON_DEVICE must be one of [${PICK_NONE}|${PICK_SIM}|${PICK_HW}]") - message(SEND_ERROR "${ERROR_MSG}") - return() + message(SEND_ERROR "${ERROR_MSG}") + return() + endif() endif() -# If USE_HEXAGON_DEVICE is set to a valid value, make sure that USE_HEXAGON_SDK + +# If USE_HEXAGON_DEVICE/LAUNCHER is set to a valid value, make sure that USE_HEXAGON_SDK # is defined. 
if(NOT USE_HEXAGON_SDK) message(SEND_ERROR "Please set USE_HEXAGON_SDK to the Hexagon SDK root") return() endif() +if(USE_HEXAGON_LAUNCHER STREQUAL "ON") + + if(DEFINED USE_ANDROID_TOOLCHAIN) + if(NOT DEFINED ANDROID_PLATFORM) + message(SEND_ERROR "Please set ANDROID_PLATFORM " + "when providing an Android cmake toolchain.") + endif() + if(NOT DEFINED ANDROID_ABI) + message(SEND_ERROR "Please set ANDROID_ABI " + "when providing an Android cmake toolchain.") + endif() + else() + message(SEND_ERROR "Please set USE_ANDROID_TOOLCHAIN to build the android " + " launcher for hexagon.") + endif() + + set(LAUNCHER_BINARY_DIR "${CMAKE_BINARY_DIR}/launcher") + ExternalProject_Add(launcher_android + SOURCE_DIR "${CMAKE_SOURCE_DIR}/apps/hexagon_launcher/cmake/android" + INSTALL_DIR "${LAUNCHER_BINARY_DIR}" + BUILD_ALWAYS ON + CMAKE_ARGS + "-DCMAKE_TOOLCHAIN_FILE=${USE_ANDROID_TOOLCHAIN}" + "-DANDROID_PLATFORM=${ANDROID_PLATFORM}" + "-DANDROID_ABI=${ANDROID_ABI}" + "-DFASTRPC_LIBS=STUB" + "-DUSE_HEXAGON_ARCH=v68" + "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}" + "-DCMAKE_INSTALL_PREFIX:PATH=" + INSTALL_COMMAND "" + ) + ExternalProject_Get_Property(launcher_android BINARY_DIR) + ExternalProject_Add_Step(launcher_android copy_binaries + COMMAND ${CMAKE_COMMAND} -E copy_directory ${BINARY_DIR} ${LAUNCHER_BINARY_DIR} + DEPENDEES install + ) + ExternalProject_Add(launcher_hexagon + SOURCE_DIR "${CMAKE_SOURCE_DIR}/apps/hexagon_launcher/cmake/hexagon" + INSTALL_DIR "${LAUNCHER_BINARY_DIR}" + BUILD_ALWAYS ON + CMAKE_ARGS + "-DCMAKE_C_COMPILER=${USE_HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang" + "-DCMAKE_CXX_COMPILER=${USE_HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++" + "-DFASTRPC_LIBS=SKEL" + "-DUSE_HEXAGON_ARCH=v68" + "-DUSE_HEXAGON_SDK=${USE_HEXAGON_SDK}" + "-DCMAKE_INSTALL_PREFIX:PATH=" + INSTALL_COMMAND "" + ) + ExternalProject_Get_Property(launcher_hexagon BINARY_DIR) + ExternalProject_Add_Step(launcher_hexagon copy_binaries + COMMAND ${CMAKE_COMMAND} -E copy_directory ${BINARY_DIR} 
${LAUNCHER_BINARY_DIR} + DEPENDEES install + ) + + set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAUNCHER_BINARY_DIR}") +endif() + if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}") find_hexagon_toolchain() message(STATUS "Hexagon toolchain: ${HEXAGON_TOOLCHAIN}") diff --git a/cmake/modules/contrib/ExampleTargetHooks.cmake b/cmake/modules/contrib/ExampleTargetHooks.cmake index eb53dda133d2..e9003b02103e 100644 --- a/cmake/modules/contrib/ExampleTargetHooks.cmake +++ b/cmake/modules/contrib/ExampleTargetHooks.cmake @@ -15,5 +15,5 @@ # specific language governing permissions and limitations # under the License. -file(GLOB EXAMPLE_TARGET_HOOKS_SRC src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc) +file(GLOB EXAMPLE_TARGET_HOOKS_SRC src/relay/backend/contrib/example_target_hooks/*.cc) list(APPEND COMPILER_SRCS ${EXAMPLE_TARGET_HOOKS_SRC}) diff --git a/docker/build.sh b/docker/build.sh index 4e1a9b346895..3ac74835f0ab 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -85,7 +85,7 @@ if [[ "$1" == "--cache-from" ]]; then shift 1 cached_image="$1" DOCKER_NO_CACHE_ARG= - CI_DOCKER_BUILD_EXTRA_PARAMS+=("--cache-from tvm.$CONTAINER_TYPE") + CI_DOCKER_BUILD_EXTRA_PARAMS+=("--cache-from tvm.$CONTAINER_TYPE:$DOCKER_IMAGE_TAG") CI_DOCKER_BUILD_EXTRA_PARAMS+=("--cache-from $cached_image") shift 1 fi diff --git a/docker/install/ubuntu_install_ethosu_driver_stack.sh b/docker/install/ubuntu_install_ethosu_driver_stack.sh index 35b2b4c74b7b..db8b47399390 100755 --- a/docker/install/ubuntu_install_ethosu_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosu_driver_stack.sh @@ -24,7 +24,7 @@ fvp_dir="/opt/arm/FVP_Corstone_SSE-300_Ethos-U55" cmake_dir="/opt/arm/cmake" ethosu_dir="/opt/arm/ethosu" ethosu_driver_ver="21.05" -cmsis_ver="5.7.0" +cmsis_ver="5.8.0" mkdir -p /opt/arm @@ -92,3 +92,13 @@ cd "${ethosu_dir}" git clone "https://github.com/ARM-software/CMSIS_5.git" cmsis cd cmsis git checkout -f tags/${cmsis_ver} + +# Build Driver +mkdir 
${ethosu_dir}/core_driver/build && cd ${ethosu_dir}/core_driver/build +cmake -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DETHOSU_LOG_SEVERITY=debug -DTARGET_CPU=cortex-m55 .. +make + +# Build NN Library +mkdir ${ethosu_dir}/cmsis/CMSIS/NN/build/ && cd ${ethosu_dir}/cmsis/CMSIS/NN/build/ +cmake .. -DCMAKE_TOOLCHAIN_FILE=${ethosu_dir}/core_platform/cmake/toolchain/arm-none-eabi-gcc.cmake -DTARGET_CPU=cortex-m55 -DBUILD_CMSIS_NN_FUNCTIONS=YES +make diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index d1fa340ac37d..fb0f596d6552 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -36,6 +36,6 @@ pip3 install \ pytest-xdist \ requests \ scipy \ - synr==0.4.1 \ + synr==0.5.0 \ six \ tornado diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh index 66720d411832..12208bbe6643 100755 --- a/docker/install/ubuntu_install_sphinx.sh +++ b/docker/install/ubuntu_install_sphinx.sh @@ -29,5 +29,5 @@ pip3 install \ matplotlib \ sphinx \ sphinx_autodoc_annotation \ - sphinx-gallery==0.4.1 \ + sphinx-gallery==0.4.0 \ sphinx_rtd_theme diff --git a/docs/dev/benchmark.rst b/docs/arch/benchmark.rst similarity index 100% rename from docs/dev/benchmark.rst rename to docs/arch/benchmark.rst diff --git a/docs/dev/convert_layout.rst b/docs/arch/convert_layout.rst similarity index 100% rename from docs/dev/convert_layout.rst rename to docs/arch/convert_layout.rst diff --git a/docs/dev/debugger.rst b/docs/arch/debugger.rst similarity index 100% rename from docs/dev/debugger.rst rename to docs/arch/debugger.rst diff --git a/docs/dev/device_target_interactions.rst b/docs/arch/device_target_interactions.rst similarity index 100% rename from docs/dev/device_target_interactions.rst rename to docs/arch/device_target_interactions.rst diff --git a/docs/dev/frontend/tensorflow.rst 
b/docs/arch/frontend/tensorflow.rst similarity index 100% rename from docs/dev/frontend/tensorflow.rst rename to docs/arch/frontend/tensorflow.rst diff --git a/docs/dev/hybrid_script.rst b/docs/arch/hybrid_script.rst similarity index 100% rename from docs/dev/hybrid_script.rst rename to docs/arch/hybrid_script.rst diff --git a/docs/dev/index.rst b/docs/arch/index.rst similarity index 100% rename from docs/dev/index.rst rename to docs/arch/index.rst diff --git a/docs/dev/inferbound.rst b/docs/arch/inferbound.rst similarity index 100% rename from docs/dev/inferbound.rst rename to docs/arch/inferbound.rst diff --git a/docs/dev/introduction_to_module_serialization.rst b/docs/arch/introduction_to_module_serialization.rst similarity index 100% rename from docs/dev/introduction_to_module_serialization.rst rename to docs/arch/introduction_to_module_serialization.rst diff --git a/docs/dev/microtvm_design.rst b/docs/arch/microtvm_design.rst similarity index 99% rename from docs/dev/microtvm_design.rst rename to docs/arch/microtvm_design.rst index 885ef2c8fc0d..087b8166c226 100644 --- a/docs/dev/microtvm_design.rst +++ b/docs/arch/microtvm_design.rst @@ -13,6 +13,8 @@ specific language governing permissions and limitations under the License. +.. 
_microtvm-design: + ************************** microTVM Design Document ************************** diff --git a/docs/dev/model_library_format.rst b/docs/arch/model_library_format.rst similarity index 100% rename from docs/dev/model_library_format.rst rename to docs/arch/model_library_format.rst diff --git a/docs/dev/pass_infra.rst b/docs/arch/pass_infra.rst similarity index 100% rename from docs/dev/pass_infra.rst rename to docs/arch/pass_infra.rst diff --git a/docs/dev/relay_intro.rst b/docs/arch/relay_intro.rst similarity index 100% rename from docs/dev/relay_intro.rst rename to docs/arch/relay_intro.rst diff --git a/docs/dev/relay_op_strategy.rst b/docs/arch/relay_op_strategy.rst similarity index 100% rename from docs/dev/relay_op_strategy.rst rename to docs/arch/relay_op_strategy.rst diff --git a/docs/dev/runtime.rst b/docs/arch/runtime.rst similarity index 100% rename from docs/dev/runtime.rst rename to docs/arch/runtime.rst diff --git a/docs/dev/runtimes/vulkan.rst b/docs/arch/runtimes/vulkan.rst similarity index 100% rename from docs/dev/runtimes/vulkan.rst rename to docs/arch/runtimes/vulkan.rst diff --git a/docs/dev/security.rst b/docs/arch/security.rst similarity index 100% rename from docs/dev/security.rst rename to docs/arch/security.rst diff --git a/docs/dev/virtual_machine.rst b/docs/arch/virtual_machine.rst similarity index 100% rename from docs/dev/virtual_machine.rst rename to docs/arch/virtual_machine.rst diff --git a/docs/conf.py b/docs/conf.py index 03e0fae97090..766fda49997f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,9 +51,8 @@ else: tvm_path = Path(os.pardir) - -sys.path.insert(0, str(tvm_path / "python")) -sys.path.insert(0, str(tvm_path / "vta" / "python")) +sys.path.insert(0, str(tvm_path.resolve() / "python")) +sys.path.insert(0, str(tvm_path.resolve() / "vta" / "python")) # -- General configuration ------------------------------------------------ @@ -209,22 +208,37 @@ def git_describe_version(original_version): from 
sphinx_gallery.sorting import ExplicitOrder -examples_dirs = [tvm_path.joinpath("tutorials"), tvm_path.joinpath("vta", "tutorials")] -gallery_dirs = ["tutorials", "vta/tutorials"] +examples_dirs = [ + tvm_path.joinpath("gallery", "tutorial"), + tvm_path.joinpath("gallery", "how_to", "compile_models"), + tvm_path.joinpath("gallery", "how_to", "deploy_models"), + tvm_path.joinpath("gallery", "how_to", "work_with_relay"), + tvm_path.joinpath("gallery", "how_to", "work_with_schedules"), + tvm_path.joinpath("gallery", "how_to", "optimize_operators"), + tvm_path.joinpath("gallery", "how_to", "tune_with_autotvm"), + tvm_path.joinpath("gallery", "how_to", "tune_with_autoscheduler"), + tvm_path.joinpath("gallery", "how_to", "work_with_microtvm"), + tvm_path.joinpath("gallery", "how_to", "extend_tvm"), + tvm_path.joinpath("vta", "tutorials"), +] + +gallery_dirs = [ + "tutorial", + "how_to/compile_models", + "how_to/deploy_models", + "how_to/work_with_relay", + "how_to/work_with_schedules", + "how_to/optimize_operators", + "how_to/tune_with_autotvm", + "how_to/tune_with_autoscheduler", + "how_to/work_with_microtvm", + "how_to/extend_tvm", + "topic/vta/tutorials", +] subsection_order = ExplicitOrder( str(p) for p in [ - tvm_path / "tutorials" / "get_started", - tvm_path / "tutorials" / "frontend", - tvm_path / "tutorials" / "language", - tvm_path / "tutorials" / "optimize", - tvm_path / "tutorials" / "autotvm", - tvm_path / "tutorials" / "auto_scheduler", - tvm_path / "tutorials" / "dev", - tvm_path / "tutorials" / "topi", - tvm_path / "tutorials" / "deployment", - tvm_path / "tutorials" / "micro", tvm_path / "vta" / "tutorials" / "frontend", tvm_path / "vta" / "tutorials" / "optimize", tvm_path / "vta" / "tutorials" / "autotvm", @@ -236,7 +250,7 @@ def git_describe_version(original_version): # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. 
within_subsection_order = { - "get_started": [ + "tutorial": [ "introduction.py", "install.py", "tvmc_command_line_driver.py", @@ -244,10 +258,11 @@ def git_describe_version(original_version): "tensor_expr_get_started.py", "autotvm_matmul_x86.py", "auto_scheduler_matmul_x86.py", + "topi.pi", "cross_compilation_and_rpc.py", "relay_quick_start.py", ], - "frontend": [ + "compile_models": [ "from_pytorch.py", "from_tensorflow.py", "from_mxnet.py", @@ -259,7 +274,7 @@ def git_describe_version(original_version): "from_caffe2.py", "from_paddle.py", ], - "language": [ + "work_with_schedules": [ "schedule_primitives.py", "reduction.py", "intrin_math.py", @@ -269,26 +284,25 @@ def git_describe_version(original_version): "tuple_inputs.py", "tedd.py", ], - "optimize": [ + "optimize_operators": [ "opt_gemm.py", "opt_conv_cuda.py", "opt_conv_tensorcore.py", ], - "autotvm": [ - "tune_simple_template.py", + "tune_with_autotvm": [ "tune_conv2d_cuda.py", "tune_relay_cuda.py", "tune_relay_x86.py", "tune_relay_arm.py", "tune_relay_mobile_gpu.py", ], - "auto_scheduler": [ + "tune_with_autoscheduler": [ "tune_matmul_x86.py", "tune_conv2d_layer_cuda.py", "tune_network_x86.py", "tune_network_cuda.py", ], - "dev": [ + "extend_tvm": [ "low_level_custom_pass.py", "use_pass_infra.py", "use_pass_instrument.py", diff --git a/docs/contribute/document.rst b/docs/contribute/document.rst index 3652a2891b37..e3d12e83865d 100644 --- a/docs/contribute/document.rst +++ b/docs/contribute/document.rst @@ -17,22 +17,125 @@ .. _doc_guide: -Write Document and Tutorials -============================ +Write Documentation for TVM +=========================== + +TVM documentation loosely follows the `formal documentation style described by +Divio `_. This system has been chosen because +it is a "simple, comprehensive and nearly universally-applicable scheme. It is +proven in practice across a wide variety of fields and applications." 
+ +This document describes the organization of TVM documentation, and how to write +new documentation. + +The Four Document Types +*********************** + +Introductory Tutorials +---------------------- + +These are step by step guides to introduce new users to a project. An +introductory tutorial is designed to get a user engaged with the software +without necessarily explaining why the software works the way it does. Those +explanations can be saved for other document types. An introductory tutorial +focuses on a successful first experience. These are the most important docs to +turning newcomers into new users and developers. A fully end-to-end +tutorial— from installing TVM and supporting ML software, to creating and +training a model, to compiling to different architectures—will give a new +user the opportunity to use TVM in the most efficient way possible. A tutorial +teaches a beginner something they need to know. This is in contrast with a +how-to, which is meant to be an answer to a question that a user with some +experience would ask. + +Tutorials need to be repeatable and reliable, because the lack of success means +a user will look for other solutions. + +How-to Guides +------------- + +These are step by step guides on how to solve particular problems. The user can +ask meaningful questions, and the documents provide answers. An examples of +this type of document might be, "how do I compile an optimized model for ARM +architecture?" or "how do I compile and optimize a TensorFlow model?" These +documents should be open enough that a user could see how to apply it to a new +use case. Practical usability is more important than completeness. The title +should tell the user what problem the how-to is solving. + +How are tutorials different from how-tos? A tutorial is oriented towards the +new developer, and focuses on successfully introducing them to the software and +community. 
A how-to, in contrast, focuses on accomplishing a specific task +within the context of basic understanding. A tutorial helps to on-board and +assumes no prior knowledge. A how-to assumes minimum knowledge, and is meant to +guide someone to accomplish a specific task. + +Reference +--------- + +Reference documentation describes how the software is configured and operated. +APIs, key functions, commands, and interfaces are all candidates for reference +documentation. These are the technical manuals that let users build their own +interfaces and programs. They are information oriented, focused on lists and +descriptions. You can assume that the audience has a grasp on how the software +works and is looking for specific answers to specific questions. Ideally, the +reference documentation should have the same structure as the code base and be +generated automatically as much as possible. + +Architecture Guides +------------------- + +Architecture Guides are explanations and background material on a topic. These +documents help to illuminate and understand the application environment. Why +are things the way they are? What were the design decisions, what alternatives +were considered, what are the RFCs describing the existing system? This +includes academic papers and links to publications relevant to the software. +Within these documents you can explore contradictory and conflicting positions, +and help the reader make sense of how and why the software was built the way it +is. It's not the place for how-tos and descriptions on how to accomplish tasks. +They instead focus on higher level concepts that help with the understanding of +the project. Generally these are written by the architects and developers of +the project, but can be useful to help both users and developers to have a deeper +understanding of why the software works the way it does, and how to contribute +to it in ways that are consistent with the underlying design principles. 
+ +Special considerations for TVM +------------------------------ + +The TVM community has some special considerations that require deviation from +the simple docs style outlined by Divio. The first consideration is that there +is frequently overlap between the user and developer communities. Many projects +document the developer and user experience with separate systems, but it is +appropriate to consider both in this system, with differentiations where +appropriate. As a result the tutorials and how-tos will be divided between +"User Guides" that focus on the user experience, and "Developer Guides" that +focus on the developer experience. + +The next consideration is that there are special topics within the TVM +community that benefit from additional attention. These topics include, but are +not limited to, microTVM and VTA. Special "Topic Guides" can be created to +index existing material, and provide context on how to navigate that material +most effectively. + +To facilitate newcomers, a special "Getting Started" section with installation +instructions, an overview of why to use TVM, and other first-experience +documents will be produced. + + +Technical Details +***************** We use the `Sphinx `_ for the main documentation. -Sphinx support both the reStructuredText and markdown. -When possible, we encourage to use reStructuredText as it has richer features. -Note that the python doc-string and tutorials allow you to embed reStructuredText syntax. +Sphinx supports both the reStructuredText and markdown. When possible, we +encourage to use reStructuredText as it has richer features. Note that the +python doc-string and tutorials allow you to embed reStructuredText syntax. -Document Python ---------------- -We use `numpydoc `_ -format to document the function and classes. -The following snippet gives an example docstring. -We always document all the public functions, -when necessary, provide an usage example of the features we support(as shown below). 
+Python Reference Documentation +------------------------------ + +We use `numpydoc `_ format to +document the function and classes. The following snippet gives an example +docstring. We always document all the public functions, when necessary, +provide an usage example of the features we support(as shown below). .. code:: python @@ -64,18 +167,20 @@ when necessary, provide an usage example of the features we support(as shown bel """ return rv1 -Be careful to leave blank lines between sections of your documents. -In the above case, there has to be a blank line before `Parameters`, `Returns` and `Examples` -in order for the doc to be built correctly. To add a new function to the doc, -we need to add the `sphinx.autodoc `_ -rules to the `docs/api/python `_). -You can refer to the existing files under this folder on how to add the functions. +Be careful to leave blank lines between sections of your documents. In the +above case, there has to be a blank line before `Parameters`, `Returns` and +`Examples` in order for the doc to be built correctly. To add a new function to +the doc, we need to add the `sphinx.autodoc +`_ rules to the +`docs/api/python `_). +You can refer to the existing files under this folder on how to add the +functions. +C++ Reference Documentation +--------------------------- -Document C++ ------------- -We use the doxgen format to document c++ functions. -The following snippet shows an example of c++ docstring. +We use the doxgen format to document c++ functions. The following snippet +shows an example of c++ docstring. .. code:: c++ @@ -89,21 +194,27 @@ The following snippet shows an example of c++ docstring. // When necessary, also add comment to clarify internal logics } -Besides documenting function usages, we also highly recommend contributors -to add comments about code logics to improve readability. +Besides documenting function usages, we also highly recommend contributors to +add comments about code logics to improve readability. 
+Sphinx Gallery How-Tos +---------------------- -Write Tutorials ---------------- -We use the `sphinx-gallery `_ to build python tutorials. -You can find the source code under `tutorials `_ quite self explanatory. -One thing that worth noting is that the comment blocks are written in reStructuredText instead of markdown so be aware of the syntax. +We use the `sphinx-gallery `_ to build many +python how-tos. You can find the source code under `gallery +`_ quite self explanatory. +One thing that worth noting is that the comment blocks are written in +reStructuredText instead of markdown so be aware of the syntax. -The tutorial code will run on our build server to generate the document page. -So we may have a restriction like not being able to access a remote Raspberry Pi, -in such case add a flag variable to the tutorial (e.g. `use_rasp`) and allow users to easily switch to the real device by changing one flag. -Then use the existing environment to demonstrate the usage. +The how-to code will run on our build server to generate the document page. So +we may have a restriction like not being able to access a remote Raspberry Pi, +in such case add a flag variable to the tutorial (e.g. `use_rasp`) and allow +users to easily switch to the real device by changing one flag. Then use the +existing environment to demonstrate the usage. +If you add a new categorization of how-to, you will need to add references to +`conf.py `_ and the +`how-to index `_ Refer to Another Location in the Document ----------------------------------------- diff --git a/docs/contribute/index.rst b/docs/contribute/index.rst index e3e4119d803e..acacfdc8a6e2 100644 --- a/docs/contribute/index.rst +++ b/docs/contribute/index.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -Contribute to TVM +Contributor Guide ================= TVM has been developed by community members. 
diff --git a/docs/dev/how_to.rst b/docs/dev/how_to/how_to.rst similarity index 97% rename from docs/dev/how_to.rst rename to docs/dev/how_to/how_to.rst index dcf8df2331eb..844ae0ad527e 100644 --- a/docs/dev/how_to.rst +++ b/docs/dev/how_to/how_to.rst @@ -28,5 +28,4 @@ various areas of the TVM stack. relay_add_op relay_add_pass relay_bring_your_own_codegen - codebase_walkthrough pytest_target_parametrization diff --git a/docs/dev/pytest_target_parametrization.rst b/docs/dev/how_to/pytest_target_parametrization.rst similarity index 100% rename from docs/dev/pytest_target_parametrization.rst rename to docs/dev/how_to/pytest_target_parametrization.rst diff --git a/docs/dev/relay_add_op.rst b/docs/dev/how_to/relay_add_op.rst similarity index 96% rename from docs/dev/relay_add_op.rst rename to docs/dev/how_to/relay_add_op.rst index f9ade45f0800..2a8c771dc63d 100644 --- a/docs/dev/relay_add_op.rst +++ b/docs/dev/how_to/relay_add_op.rst @@ -190,18 +190,16 @@ useful for fusing operators. ``kOpaque`` tells TVM to not bother trying to fuse While we've now defined the interface for our operations we still need to define how to perform the actual calculations for cumulative sum and product. -Writing this code is outside the scope of the tutorial. For now, we assume -we have a well tested implementation for the operation's compute. For -more details on how to do this, we recommend looking up the tutorials -on `tensor expressions`_, `TVM's operator inventory (topi)`_ and looking at the -example cumulative sum and product implementations found in `python/tvm/topi/scan.py`_ -and the gpu versions in `python/tvm/topi/cuda/scan.py`_. In the case of our cumulative -sum and product operations we write things directly in `TIR`_ which is the +Writing this code is outside the scope of the tutorial. For now, we assume we +have a well tested implementation for the operation's compute. 
For more details +on how to do this, we recommend looking up the tutorials on :ref:`tensor +expressions `, :ref:`TVM's operator inventory +(topi) ` and looking at the example cumulative sum and product +implementations found in `python/tvm/topi/scan.py`_ and the gpu versions in +`python/tvm/topi/cuda/scan.py`_. In the case of our cumulative sum and product +operations we write things directly in :ref:`TIR ` which is the representation where tensor expressions and topi will lower into. -.. _tensor expressions: https://tvm.apache.org/docs/tutorials/get_started/tensor_expr_get_started.html -.. _TVM's operator inventory (topi): https://tvm.apache.org/docs/tutorials/topi/intro_topi.html -.. _TIR: https://tvm.apache.org/docs/dev/index.html?highlight=tir#tvm-tir .. _python/tvm/topi/scan.py: https://github.com/apache/tvm/blob/main/python/tvm/topi/scan.py .. _python/tvm/topi/cuda/scan.py: https://github.com/apache/tvm/blob/main/python/tvm/topi/cuda/scan.py diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/how_to/relay_add_pass.rst similarity index 100% rename from docs/dev/relay_add_pass.rst rename to docs/dev/how_to/relay_add_pass.rst diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/how_to/relay_bring_your_own_codegen.rst similarity index 100% rename from docs/dev/relay_bring_your_own_codegen.rst rename to docs/dev/how_to/relay_bring_your_own_codegen.rst diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/tutorial/codebase_walkthrough.rst similarity index 100% rename from docs/dev/codebase_walkthrough.rst rename to docs/dev/tutorial/codebase_walkthrough.rst diff --git a/docs/dev/tutorial/index.rst b/docs/dev/tutorial/index.rst new file mode 100644 index 000000000000..77fbe0f5405e --- /dev/null +++ b/docs/dev/tutorial/index.rst @@ -0,0 +1,29 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. 
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _dev-tutorial: + +Developer Tutorial +================== + +This section is a guide to the TVM codebase, and an introduction on how to +contribute to different parts of the platform. + +.. toctree:: + :maxdepth: 1 + + codebase_walkthrough diff --git a/docs/errors.rst b/docs/errors.rst index f911ab9c1bab..84ea5551601a 100644 --- a/docs/errors.rst +++ b/docs/errors.rst @@ -16,8 +16,8 @@ under the License. -What do to when encountering TVM Errors -======================================= +Handle TVM Errors +================= When running TVM, you may encounter an error message like: diff --git a/docs/deploy/android.rst b/docs/how_to/deploy/android.rst similarity index 100% rename from docs/deploy/android.rst rename to docs/how_to/deploy/android.rst diff --git a/docs/deploy/arm_compute_lib.rst b/docs/how_to/deploy/arm_compute_lib.rst similarity index 98% rename from docs/deploy/arm_compute_lib.rst rename to docs/how_to/deploy/arm_compute_lib.rst index 6fb531a0a8f6..831438273cca 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/how_to/deploy/arm_compute_lib.rst @@ -142,9 +142,9 @@ Export the module. lib.export_library(lib_path, cc=cross_compile) -Run Inference. This must be on an Arm device. If compiling on x86 device and running on AArch64, -consider using the RPC mechanism. 
Tutorials for using the RPC mechanism: -https://tvm.apache.org/docs/tutorials/get_started/cross_compilation_and_rpc.html +Run Inference. This must be on an Arm device. If compiling on x86 device and +running on AArch64, consider using the RPC mechanism. :ref:`Tutorials for using +the RPC mechanism ` .. code:: python diff --git a/docs/deploy/bnns.rst b/docs/how_to/deploy/bnns.rst similarity index 100% rename from docs/deploy/bnns.rst rename to docs/how_to/deploy/bnns.rst diff --git a/docs/deploy/cpp_deploy.rst b/docs/how_to/deploy/cpp_deploy.rst similarity index 100% rename from docs/deploy/cpp_deploy.rst rename to docs/how_to/deploy/cpp_deploy.rst diff --git a/docs/deploy/hls.rst b/docs/how_to/deploy/hls.rst similarity index 100% rename from docs/deploy/hls.rst rename to docs/how_to/deploy/hls.rst diff --git a/docs/deploy/index.rst b/docs/how_to/deploy/index.rst similarity index 93% rename from docs/deploy/index.rst rename to docs/how_to/deploy/index.rst index b127de982b61..73269e85db70 100644 --- a/docs/deploy/index.rst +++ b/docs/how_to/deploy/index.rst @@ -17,8 +17,8 @@ .. _deploy-and-integration: -Deploy and Integration -====================== +Deploy Models and Integrate TVM +=============================== This page contains guidelines on how to deploy TVM to various platforms as well as how to integrate it with your project. @@ -176,4 +176,14 @@ target device without relying on RPC. See the following resources on how to do s vitis_ai bnns +Additional Deployment How-Tos +----------------------------- + +We have also developed a number of how-tos targeting specific devices, with +working Python code that can be viewed in a Jupyter notebook. These how-tos +describe how to prepare and deploy models to many of the supported backends. + +.. 
toctree:: + :maxdepth: 1 + + ../deploy_models/index diff --git a/docs/deploy/integrate.rst b/docs/how_to/deploy/integrate.rst similarity index 100% rename from docs/deploy/integrate.rst rename to docs/how_to/deploy/integrate.rst diff --git a/docs/deploy/tensorrt.rst b/docs/how_to/deploy/tensorrt.rst similarity index 100% rename from docs/deploy/tensorrt.rst rename to docs/how_to/deploy/tensorrt.rst diff --git a/docs/deploy/vitis_ai.rst b/docs/how_to/deploy/vitis_ai.rst similarity index 100% rename from docs/deploy/vitis_ai.rst rename to docs/how_to/deploy/vitis_ai.rst diff --git a/docs/how_to/index.rst b/docs/how_to/index.rst new file mode 100644 index 000000000000..433d7acee95a --- /dev/null +++ b/docs/how_to/index.rst @@ -0,0 +1,39 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +How To Guides +============= + +These user-focused "how to" guides are designed to help you find answers to +specific questions, like "How do I compile a model?" or "How do I optimize a +schedule with tensor expressions?" + +.. 
toctree:: + :maxdepth: 1 + + compile_models/index + deploy/index + work_with_relay/index + work_with_schedules/index + optimize_operators/index + tune_with_autotvm/index + tune_with_autoscheduler/index + work_with_microtvm/index + extend_tvm/index + profile/index + ../errors + ../faq diff --git a/docs/profiling/index.rst b/docs/how_to/profile/index.rst similarity index 92% rename from docs/profiling/index.rst rename to docs/how_to/profile/index.rst index 9443fef25ea6..33642cf469e7 100644 --- a/docs/profiling/index.rst +++ b/docs/how_to/profile/index.rst @@ -15,8 +15,8 @@ specific language governing permissions and limitations under the License. -Profiling Deep Learning Models -==================================== +Profile Models +============== .. toctree:: :maxdepth: 1 diff --git a/docs/profiling/papi.rst b/docs/how_to/profile/papi.rst similarity index 100% rename from docs/profiling/papi.rst rename to docs/how_to/profile/papi.rst diff --git a/docs/index.rst b/docs/index.rst index 491c42712e9a..a264c9beb8f8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,76 +15,59 @@ specific language governing permissions and limitations under the License. -TVM Documentation -================= - -TVM is an open deep learning compiler stack for CPUs, GPUs, and specialized accelerators. It aims to close the gap between the productivity-focused deep learning frameworks, and the performance- or efficiency-oriented hardware backends. - - -Get Started ------------ - -- Follow the :doc:`instructions ` to install TVM. -- Checkout the :doc:`Getting Started with TVM Tutorial `. - -For Developers --------------- - -- :doc:`dev/index` is useful for developers who want to understand the - architecture of TVM and/or actively develop on the project. -- :doc:`dev/how_to` gives quick development tips on various topics. 
- +Apache TVM Documentation +======================== + +Welcome to the the documentation for Apache TVM, a deep learning compiler that +enables access to high-performance machine learning anywhere for everyone. +TVM's diverse community of hardware vendors, compiler engineers and ML +researchers work together to build a unified, programmable software stack, that +enriches the entire ML technology ecosystem and make it accessible to the wider +ML community. TVM empowers users to leverage community-driven ML-based +optimizations to push the limits and amplify the reach of their research and +development, which in turn raises the collective performance of all ML, while +driving its costs down. .. toctree:: :maxdepth: 1 - :caption: How to - :hidden: + :caption: Getting Started install/index contribute/index - deploy/index - dev/how_to - microtvm/index - errors - faq .. toctree:: :maxdepth: 1 - :caption: Tutorials - :hidden: - - tutorials/index + :caption: User Guide + tutorial/index + how_to/index .. toctree:: - :maxdepth: 2 - :caption: References - :hidden: + :maxdepth: 1 + :caption: Developer Guide - langref/index - api/python/index - api/links + dev/tutorial/index + dev/how_to/how_to.rst .. toctree:: - :maxdepth: 2 - :hidden: - :caption: Deep Dive + :maxdepth: 1 + :caption: Architecture Guide - dev/index + arch/index .. toctree:: - :maxdepth: 2 - :hidden: - :caption: MISC - - vta/index - profiling/index - + :maxdepth: 1 + :caption: Topic Guides -Index ------ + topic/microtvm/index + topic/vta/index .. 
toctree:: :maxdepth: 1 + :caption: Reference Guide + reference/langref/index + reference/api/python/index + reference/api/links + reference/publications genindex diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 5c3a2544c578..23be3198bf7c 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -52,8 +52,8 @@ Build the Shared Library Our goal is to build the shared libraries: - On Linux the target library are `libtvm.so` and `libtvm_runtime.so` - - On macOS the target library are `libtvm.dylib` and `libtvm_runtime.dylib` - - On Windows the target library are `libtvm.dll` and `libtvm_runtime.dll` + - On macOS the target library are `libtvm.dylib` and `libtvm_runtime.dylib` + - On Windows the target library are `libtvm.dll` and `libtvm_runtime.dll` It is also possible to :ref:`build the runtime ` library only. @@ -74,14 +74,14 @@ linux operating systems, execute (in a terminal): sudo apt-get update sudo apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev -Use Homebrew to install the required dependencies for macOS running either the Intel or M1 processors. You must follow the post-installation steps specified by +Use Homebrew to install the required dependencies for macOS running either the Intel or M1 processors. You must follow the post-installation steps specified by Homebrew to ensure the dependencies are correctly installed and configured: -.. code:: bash +.. code:: bash - brew install gcc git cmake - brew install llvm - brew install python@3.8 + brew install gcc git cmake + brew install llvm + brew install python@3.8 We use cmake to build the library. @@ -107,7 +107,7 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas .. code:: bash - export TVM_LOG_DEBUG=1 + export TVM_LOG_DEBUG="ir/transform.cc=1;relay/ir/transform.cc=1" - TVM requires LLVM for for CPU codegen. 
We highly recommend you to build with the LLVM support on. @@ -303,20 +303,20 @@ like ``virtualenv``. pip3 install --user tornado psutil xgboost cloudpickle -Note on M1 macs, you may have trouble installing xgboost / scipy. scipy and xgboost requires some additional dependencies to be installed, -including openblas and its dependencies. Use the following commands to install scipy and xgboost with the required dependencies and +Note on M1 macs, you may have trouble installing xgboost / scipy. scipy and xgboost requires some additional dependencies to be installed, +including openblas and its dependencies. Use the following commands to install scipy and xgboost with the required dependencies and configuration. A workaround for this is to do the following commands: - .. code:: bash + .. code:: bash brew install openblas gfortran - pip install pybind11 cython pythran   - - export OPENBLAS=/opt/homebrew/opt/openblas/lib/ - + pip install pybind11 cython pythran   + + export OPENBLAS=/opt/homebrew/opt/openblas/lib/ + pip install scipy --no-use-pep517 - + pip install xgboost Install Contrib Libraries diff --git a/docs/install/index.rst b/docs/install/index.rst index 5f739418add3..0082fa338665 100644 --- a/docs/install/index.rst +++ b/docs/install/index.rst @@ -17,16 +17,22 @@ .. _installation: -Installation -============ -To install TVM, please read :ref:`install-from-source`. -If you are interested in deploying to mobile/embedded devices, -you do not need to install the entire TVM stack on your device, -instead, you only need the runtime, please read :ref:`deploy-and-integration`. -If you would like to quickly try out TVM or do demo/tutorials, checkout :ref:`docker-images` +Installing TVM +============== + +Checkout to :ref:`install TVM from source `. Installing +from source gives you the maximum flexibility to configure the build effectively from the official source releases. 
+If you are interested in deploying to mobile or embedded devices, you do not need to +install the entire TVM stack on your device. Instead, you only need the runtime and can install using the +:ref:`deployment and integration guide `. + +If you would like to quickly try out TVM or run some demo and tutorials, you +can checkout :ref:`install from Docker `. You can also +tryout pre-built from third-party :doc:`TLCPack Python and Conda packages `. .. toctree:: :maxdepth: 2 from_source docker + tlcpack diff --git a/docs/install/tlcpack.rst b/docs/install/tlcpack.rst new file mode 100644 index 000000000000..c903e30b68bf --- /dev/null +++ b/docs/install/tlcpack.rst @@ -0,0 +1,24 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _tlcpack: + +TLCPack +======= + +TLCPack is a third-party conda and pip packaging based on TVM. +Checkout more information on `TLCPack site `_. diff --git a/docs/api/links.rst b/docs/reference/api/links.rst similarity index 86% rename from docs/api/links.rst rename to docs/reference/api/links.rst index 4b8ab4dd43f6..a2a43bf7d0f2 100644 --- a/docs/api/links.rst +++ b/docs/reference/api/links.rst @@ -15,10 +15,11 @@ specific language governing permissions and limitations under the License. 
-Links to Other API References -============================= +Other APIs +========== -This page contains links to API references that are build with different doc build system. +This page contains links to API references that are built with different doc +build system. * `C++ doyxgen API `_ * `Typescript typedoc API `_ diff --git a/docs/api/python/auto_scheduler.rst b/docs/reference/api/python/auto_scheduler.rst similarity index 100% rename from docs/api/python/auto_scheduler.rst rename to docs/reference/api/python/auto_scheduler.rst diff --git a/docs/api/python/autotvm.rst b/docs/reference/api/python/autotvm.rst similarity index 100% rename from docs/api/python/autotvm.rst rename to docs/reference/api/python/autotvm.rst diff --git a/docs/api/python/contrib.rst b/docs/reference/api/python/contrib.rst similarity index 100% rename from docs/api/python/contrib.rst rename to docs/reference/api/python/contrib.rst diff --git a/docs/api/python/driver.rst b/docs/reference/api/python/driver.rst similarity index 100% rename from docs/api/python/driver.rst rename to docs/reference/api/python/driver.rst diff --git a/docs/api/python/error.rst b/docs/reference/api/python/error.rst similarity index 100% rename from docs/api/python/error.rst rename to docs/reference/api/python/error.rst diff --git a/docs/api/python/graph_executor.rst b/docs/reference/api/python/graph_executor.rst similarity index 100% rename from docs/api/python/graph_executor.rst rename to docs/reference/api/python/graph_executor.rst diff --git a/docs/api/python/index.rst b/docs/reference/api/python/index.rst similarity index 100% rename from docs/api/python/index.rst rename to docs/reference/api/python/index.rst diff --git a/docs/api/python/ir.rst b/docs/reference/api/python/ir.rst similarity index 100% rename from docs/api/python/ir.rst rename to docs/reference/api/python/ir.rst diff --git a/docs/api/python/micro.rst b/docs/reference/api/python/micro.rst similarity index 100% rename from 
docs/api/python/micro.rst rename to docs/reference/api/python/micro.rst diff --git a/docs/api/python/ndarray.rst b/docs/reference/api/python/ndarray.rst similarity index 100% rename from docs/api/python/ndarray.rst rename to docs/reference/api/python/ndarray.rst diff --git a/docs/api/python/relay/analysis.rst b/docs/reference/api/python/relay/analysis.rst similarity index 100% rename from docs/api/python/relay/analysis.rst rename to docs/reference/api/python/relay/analysis.rst diff --git a/docs/api/python/relay/backend.rst b/docs/reference/api/python/relay/backend.rst similarity index 100% rename from docs/api/python/relay/backend.rst rename to docs/reference/api/python/relay/backend.rst diff --git a/docs/api/python/relay/dataflow_pattern.rst b/docs/reference/api/python/relay/dataflow_pattern.rst similarity index 100% rename from docs/api/python/relay/dataflow_pattern.rst rename to docs/reference/api/python/relay/dataflow_pattern.rst diff --git a/docs/api/python/relay/frontend.rst b/docs/reference/api/python/relay/frontend.rst similarity index 100% rename from docs/api/python/relay/frontend.rst rename to docs/reference/api/python/relay/frontend.rst diff --git a/docs/api/python/relay/image.rst b/docs/reference/api/python/relay/image.rst similarity index 100% rename from docs/api/python/relay/image.rst rename to docs/reference/api/python/relay/image.rst diff --git a/docs/api/python/relay/index.rst b/docs/reference/api/python/relay/index.rst similarity index 100% rename from docs/api/python/relay/index.rst rename to docs/reference/api/python/relay/index.rst diff --git a/docs/api/python/relay/nn.rst b/docs/reference/api/python/relay/nn.rst similarity index 100% rename from docs/api/python/relay/nn.rst rename to docs/reference/api/python/relay/nn.rst diff --git a/docs/api/python/relay/testing.rst b/docs/reference/api/python/relay/testing.rst similarity index 100% rename from docs/api/python/relay/testing.rst rename to docs/reference/api/python/relay/testing.rst diff 
--git a/docs/api/python/relay/transform.rst b/docs/reference/api/python/relay/transform.rst similarity index 100% rename from docs/api/python/relay/transform.rst rename to docs/reference/api/python/relay/transform.rst diff --git a/docs/api/python/relay/vision.rst b/docs/reference/api/python/relay/vision.rst similarity index 100% rename from docs/api/python/relay/vision.rst rename to docs/reference/api/python/relay/vision.rst diff --git a/docs/api/python/rpc.rst b/docs/reference/api/python/rpc.rst similarity index 100% rename from docs/api/python/rpc.rst rename to docs/reference/api/python/rpc.rst diff --git a/docs/api/python/runtime.rst b/docs/reference/api/python/runtime.rst similarity index 100% rename from docs/api/python/runtime.rst rename to docs/reference/api/python/runtime.rst diff --git a/docs/api/python/target.rst b/docs/reference/api/python/target.rst similarity index 100% rename from docs/api/python/target.rst rename to docs/reference/api/python/target.rst diff --git a/docs/api/python/te.rst b/docs/reference/api/python/te.rst similarity index 100% rename from docs/api/python/te.rst rename to docs/reference/api/python/te.rst diff --git a/docs/api/python/tir.rst b/docs/reference/api/python/tir.rst similarity index 98% rename from docs/api/python/tir.rst rename to docs/reference/api/python/tir.rst index b0b8f1cff5fb..2152be69ea6f 100644 --- a/docs/api/python/tir.rst +++ b/docs/reference/api/python/tir.rst @@ -15,6 +15,8 @@ specific language governing permissions and limitations under the License. +.. _api-python-tir: + tvm.tir ------- .. 
automodule:: tvm.tir diff --git a/docs/api/python/topi.rst b/docs/reference/api/python/topi.rst similarity index 100% rename from docs/api/python/topi.rst rename to docs/reference/api/python/topi.rst diff --git a/docs/api/python/vta/index.rst b/docs/reference/api/python/vta/index.rst similarity index 100% rename from docs/api/python/vta/index.rst rename to docs/reference/api/python/vta/index.rst diff --git a/docs/langref/hybrid_script.rst b/docs/reference/langref/hybrid_script.rst similarity index 100% rename from docs/langref/hybrid_script.rst rename to docs/reference/langref/hybrid_script.rst diff --git a/docs/langref/index.rst b/docs/reference/langref/index.rst similarity index 100% rename from docs/langref/index.rst rename to docs/reference/langref/index.rst diff --git a/docs/langref/relay_adt.rst b/docs/reference/langref/relay_adt.rst similarity index 100% rename from docs/langref/relay_adt.rst rename to docs/reference/langref/relay_adt.rst diff --git a/docs/langref/relay_expr.rst b/docs/reference/langref/relay_expr.rst similarity index 100% rename from docs/langref/relay_expr.rst rename to docs/reference/langref/relay_expr.rst diff --git a/docs/langref/relay_op.rst b/docs/reference/langref/relay_op.rst similarity index 100% rename from docs/langref/relay_op.rst rename to docs/reference/langref/relay_op.rst diff --git a/docs/langref/relay_pattern.rst b/docs/reference/langref/relay_pattern.rst similarity index 100% rename from docs/langref/relay_pattern.rst rename to docs/reference/langref/relay_pattern.rst diff --git a/docs/langref/relay_type.rst b/docs/reference/langref/relay_type.rst similarity index 100% rename from docs/langref/relay_type.rst rename to docs/reference/langref/relay_type.rst diff --git a/docs/reference/publications.rst b/docs/reference/publications.rst new file mode 100644 index 000000000000..3a90a3ad3c25 --- /dev/null +++ b/docs/reference/publications.rst @@ -0,0 +1,31 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Publications +============ + +TVM is developed as part of peer-reviewed research in machine learning compiler +framework for CPUs, GPUs, and machine learning accelerators. + +This document includes references to publications describing the research, +results, and design underlying TVM. + +* `TVM: An Automated End-to-End Optimizing Compiler for Deep Learning `_ +* `Learning to Optimize Tensor Programs `_ +* `Ansor: Generating High-Performance Tensor Programs for Deep Learning `_ +* `Nimble: Efficiently Compiling Dynamic Neural Networks for Model Inference + `_ diff --git a/docs/microtvm/index.rst b/docs/topic/microtvm/index.rst similarity index 93% rename from docs/microtvm/index.rst rename to docs/topic/microtvm/index.rst index a67b1547d229..ebcadb34427c 100644 --- a/docs/microtvm/index.rst +++ b/docs/topic/microtvm/index.rst @@ -55,7 +55,7 @@ tutorials to get started with microTVM: 1. :ref:`Start the microTVM Reference VM `. The microTVM tutorials depend on Zephyr and on a compiler toolchain for your hardware. The reference VM is a convenient way to install those dependencies. -2. Try the :doc:`microTVM with TFLite Tutorial `. +2. Try the :ref:`microTVM with TFLite Tutorial `. 3. 
Try running a more complex `CIFAR10-CNN model `_. @@ -63,7 +63,7 @@ How microTVM Works ~~~~~~~~~~~~~~~~~~ -You can read more about the design of these pieces at the :doc:`microTVM Design Document `. +You can read more about the design of these pieces at the :ref:`microTVM Design Document `. Help and Discussion diff --git a/docs/vta/.gitignore b/docs/topic/vta/.gitignore similarity index 100% rename from docs/vta/.gitignore rename to docs/topic/vta/.gitignore diff --git a/docs/vta/dev/config.rst b/docs/topic/vta/dev/config.rst similarity index 100% rename from docs/vta/dev/config.rst rename to docs/topic/vta/dev/config.rst diff --git a/docs/vta/dev/hardware.rst b/docs/topic/vta/dev/hardware.rst similarity index 100% rename from docs/vta/dev/hardware.rst rename to docs/topic/vta/dev/hardware.rst diff --git a/docs/vta/dev/index.rst b/docs/topic/vta/dev/index.rst similarity index 100% rename from docs/vta/dev/index.rst rename to docs/topic/vta/dev/index.rst diff --git a/docs/vta/index.rst b/docs/topic/vta/index.rst similarity index 96% rename from docs/vta/index.rst rename to docs/topic/vta/index.rst index d97705ef140f..d09b31bb360b 100644 --- a/docs/vta/index.rst +++ b/docs/topic/vta/index.rst @@ -17,8 +17,8 @@ .. _vta-index: -VTA: Deep Learning Accelerator Stack -==================================== +VTA: Versatile Tensor Accelerator +================================= The Versatile Tensor Accelerator (VTA) is an open, generic, and customizable deep learning accelerator with a complete TVM-based compiler stack. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators. Together TVM and VTA form an end-to-end hardware-software deep learning system stack that includes hardware design, drivers, a JIT runtime, and an optimizing compiler stack based on TVM. 
diff --git a/docs/vta/install.rst b/docs/topic/vta/install.rst similarity index 98% rename from docs/vta/install.rst rename to docs/topic/vta/install.rst index 2248975b61b1..e4b309ea9b61 100644 --- a/docs/vta/install.rst +++ b/docs/topic/vta/install.rst @@ -30,8 +30,8 @@ We present three installation guides, each extending on the previous one: VTA Simulator Installation -------------------------- -You need `TVM installed `_ on your machine. -For a quick and easy start, checkout the `Docker Guide `_. +You need :ref:`TVM installed ` on your machine. For a quick and +easy start, checkout the :ref:`Docker Guide `. You'll need to set the following paths to use VTA: @@ -65,7 +65,7 @@ To ensure that you've properly installed the VTA python package, run the followi python /vta/tests/python/integration/test_benchmark_topi_conv2d.py -You are invited to try out our `VTA programming tutorials `_. +You are invited to try out our :ref:`VTA programming tutorials `. **Note**: You'll notice that for every convolution layer, the throughput gets reported in GOPS. These numbers are actually the computational throughput that the simulator achieves, by evaluating the convolutions in software. @@ -222,9 +222,7 @@ The performance metrics measured on the Pynq board will be reported for each con **Tip**: You can track progress of the FPGA programming and the runtime rebuilding steps by looking at the RPC server's logging messages in your Pynq ``ssh`` session. -You can also try out our `VTA programming tutorials `_. - - +You can also try out our :ref:`VTA programming tutorials `. Intel DE10 FPGA Setup --------------------- diff --git a/gallery/how_to/compile_models/README.txt b/gallery/how_to/compile_models/README.txt new file mode 100644 index 000000000000..61cc5be2cb79 --- /dev/null +++ b/gallery/how_to/compile_models/README.txt @@ -0,0 +1,7 @@ +.. 
_tutorial-frontend: + +Compile Deep Learning Models +============================ + +TVM includes a variety of front-ends that can import models in different +formats. These how-tos demonstrate how to import models using the Python API. diff --git a/tutorials/frontend/from_caffe2.py b/gallery/how_to/compile_models/from_caffe2.py similarity index 100% rename from tutorials/frontend/from_caffe2.py rename to gallery/how_to/compile_models/from_caffe2.py diff --git a/tutorials/frontend/from_coreml.py b/gallery/how_to/compile_models/from_coreml.py similarity index 100% rename from tutorials/frontend/from_coreml.py rename to gallery/how_to/compile_models/from_coreml.py diff --git a/tutorials/frontend/from_darknet.py b/gallery/how_to/compile_models/from_darknet.py similarity index 100% rename from tutorials/frontend/from_darknet.py rename to gallery/how_to/compile_models/from_darknet.py diff --git a/tutorials/frontend/from_keras.py b/gallery/how_to/compile_models/from_keras.py similarity index 100% rename from tutorials/frontend/from_keras.py rename to gallery/how_to/compile_models/from_keras.py diff --git a/tutorials/frontend/from_mxnet.py b/gallery/how_to/compile_models/from_mxnet.py similarity index 100% rename from tutorials/frontend/from_mxnet.py rename to gallery/how_to/compile_models/from_mxnet.py diff --git a/tutorials/frontend/from_onnx.py b/gallery/how_to/compile_models/from_onnx.py similarity index 100% rename from tutorials/frontend/from_onnx.py rename to gallery/how_to/compile_models/from_onnx.py diff --git a/tutorials/frontend/from_paddle.py b/gallery/how_to/compile_models/from_paddle.py similarity index 100% rename from tutorials/frontend/from_paddle.py rename to gallery/how_to/compile_models/from_paddle.py diff --git a/tutorials/frontend/from_pytorch.py b/gallery/how_to/compile_models/from_pytorch.py similarity index 100% rename from tutorials/frontend/from_pytorch.py rename to gallery/how_to/compile_models/from_pytorch.py diff --git 
a/tutorials/frontend/from_tensorflow.py b/gallery/how_to/compile_models/from_tensorflow.py similarity index 100% rename from tutorials/frontend/from_tensorflow.py rename to gallery/how_to/compile_models/from_tensorflow.py diff --git a/tutorials/frontend/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py similarity index 100% rename from tutorials/frontend/from_tflite.py rename to gallery/how_to/compile_models/from_tflite.py diff --git a/gallery/how_to/deploy_models/README.txt b/gallery/how_to/deploy_models/README.txt new file mode 100644 index 000000000000..22323c4e521b --- /dev/null +++ b/gallery/how_to/deploy_models/README.txt @@ -0,0 +1,6 @@ +Deploy Deep Learning Models +--------------------------- + +TVM is capable of deploying models to a variety of different platforms. These +how-tos describe how to prepare and deploy models to many of the supported +backends. diff --git a/tutorials/frontend/deploy_model_on_android.py b/gallery/how_to/deploy_models/deploy_model_on_android.py similarity index 100% rename from tutorials/frontend/deploy_model_on_android.py rename to gallery/how_to/deploy_models/deploy_model_on_android.py diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/gallery/how_to/deploy_models/deploy_model_on_rasp.py similarity index 100% rename from tutorials/frontend/deploy_model_on_rasp.py rename to gallery/how_to/deploy_models/deploy_model_on_rasp.py diff --git a/tutorials/frontend/deploy_object_detection_pytorch.py b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py similarity index 100% rename from tutorials/frontend/deploy_object_detection_pytorch.py rename to gallery/how_to/deploy_models/deploy_object_detection_pytorch.py diff --git a/tutorials/frontend/deploy_prequantized.py b/gallery/how_to/deploy_models/deploy_prequantized.py similarity index 100% rename from tutorials/frontend/deploy_prequantized.py rename to gallery/how_to/deploy_models/deploy_prequantized.py diff --git 
a/tutorials/frontend/deploy_prequantized_tflite.py b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py similarity index 96% rename from tutorials/frontend/deploy_prequantized_tflite.py rename to gallery/how_to/deploy_models/deploy_prequantized_tflite.py index 7bbb06bdf801..830e2ab07466 100644 --- a/tutorials/frontend/deploy_prequantized_tflite.py +++ b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py @@ -255,8 +255,8 @@ def run_tvm(lib): # * Set the environment variable TVM_NUM_THREADS to the number of physical cores # * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or # "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future) -# * Perform autotuning - `Auto-tuning a convolution network for x86 CPU -# `_. -# * To get best inference performance on ARM CPU, change target argument according to your -# device and follow `Auto-tuning a convolution network for ARM CPU -# `_. +# * Perform autotuning - :ref:`Auto-tuning a convolution network for x86 CPU +# `. +# * To get best inference performance on ARM CPU, change target argument +# according to your device and follow :ref:`Auto-tuning a convolution +# network for ARM CPU `. 
diff --git a/tutorials/frontend/deploy_quantized.py b/gallery/how_to/deploy_models/deploy_quantized.py similarity index 100% rename from tutorials/frontend/deploy_quantized.py rename to gallery/how_to/deploy_models/deploy_quantized.py diff --git a/tutorials/frontend/deploy_sparse.py b/gallery/how_to/deploy_models/deploy_sparse.py similarity index 100% rename from tutorials/frontend/deploy_sparse.py rename to gallery/how_to/deploy_models/deploy_sparse.py diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/gallery/how_to/deploy_models/deploy_ssd_gluoncv.py similarity index 100% rename from tutorials/frontend/deploy_ssd_gluoncv.py rename to gallery/how_to/deploy_models/deploy_ssd_gluoncv.py diff --git a/gallery/how_to/extend_tvm/README.txt b/gallery/how_to/extend_tvm/README.txt new file mode 100644 index 000000000000..09cd220c80b3 --- /dev/null +++ b/gallery/how_to/extend_tvm/README.txt @@ -0,0 +1,7 @@ +Extend TVM +---------- + +TVM is an extensible development platform, with many points of entry to work +with, including options for bringing new datatypes and adding lower level +custom optimization passes. These how-tos describe some ways that TVM can be +extended. 
diff --git a/tutorials/dev/bring_your_own_datatypes.py b/gallery/how_to/extend_tvm/bring_your_own_datatypes.py similarity index 100% rename from tutorials/dev/bring_your_own_datatypes.py rename to gallery/how_to/extend_tvm/bring_your_own_datatypes.py diff --git a/tutorials/dev/low_level_custom_pass.py b/gallery/how_to/extend_tvm/low_level_custom_pass.py similarity index 100% rename from tutorials/dev/low_level_custom_pass.py rename to gallery/how_to/extend_tvm/low_level_custom_pass.py diff --git a/tutorials/dev/use_pass_infra.py b/gallery/how_to/extend_tvm/use_pass_infra.py similarity index 100% rename from tutorials/dev/use_pass_infra.py rename to gallery/how_to/extend_tvm/use_pass_infra.py diff --git a/tutorials/dev/use_pass_instrument.py b/gallery/how_to/extend_tvm/use_pass_instrument.py similarity index 100% rename from tutorials/dev/use_pass_instrument.py rename to gallery/how_to/extend_tvm/use_pass_instrument.py diff --git a/gallery/how_to/optimize_operators/README.txt b/gallery/how_to/optimize_operators/README.txt new file mode 100644 index 000000000000..889974c7873c --- /dev/null +++ b/gallery/how_to/optimize_operators/README.txt @@ -0,0 +1,5 @@ +Optimize Tensor Operators +------------------------- + +These how-tos demonstrate how to optimize a variety of tensor operations for a +variety of targets. 
diff --git a/tutorials/optimize/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py similarity index 100% rename from tutorials/optimize/opt_conv_cuda.py rename to gallery/how_to/optimize_operators/opt_conv_cuda.py diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py similarity index 100% rename from tutorials/optimize/opt_conv_tensorcore.py rename to gallery/how_to/optimize_operators/opt_conv_tensorcore.py diff --git a/tutorials/optimize/opt_gemm.py b/gallery/how_to/optimize_operators/opt_gemm.py similarity index 100% rename from tutorials/optimize/opt_gemm.py rename to gallery/how_to/optimize_operators/opt_gemm.py diff --git a/gallery/how_to/tune_with_autoscheduler/README.txt b/gallery/how_to/tune_with_autoscheduler/README.txt new file mode 100644 index 000000000000..636a636d81c0 --- /dev/null +++ b/gallery/how_to/tune_with_autoscheduler/README.txt @@ -0,0 +1,6 @@ +Use AutoScheduler for Template-Free Scheduling +---------------------------------------------- + +The TVM AutoScheduler offers a template-free way to tune models. These how-tos +demonstrate how to tune a variety of different models to target a number of +common platforms. 
diff --git a/tutorials/auto_scheduler/ci_logs/conv2d.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/conv2d.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/conv2d.json rename to gallery/how_to/tune_with_autoscheduler/ci_logs/conv2d.json diff --git a/tutorials/auto_scheduler/ci_logs/matmul.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/matmul.json rename to gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json rename to gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json rename to gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json diff --git a/tutorials/auto_scheduler/ci_logs/sparse_dense.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json similarity index 100% rename from tutorials/auto_scheduler/ci_logs/sparse_dense.json rename to gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json diff --git a/tutorials/auto_scheduler/tune_conv2d_layer_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_conv2d_layer_cuda.py rename to gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_arm.py rename to 
gallery/how_to/tune_with_autoscheduler/tune_network_arm.py diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_cuda.py rename to gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_mali.py rename to gallery/how_to/tune_with_autoscheduler/tune_network_mali.py diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_network_x86.py rename to gallery/how_to/tune_with_autoscheduler/tune_network_x86.py diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py similarity index 100% rename from tutorials/auto_scheduler/tune_sparse_x86.py rename to gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py diff --git a/gallery/how_to/tune_with_autotvm/README.txt b/gallery/how_to/tune_with_autotvm/README.txt new file mode 100644 index 000000000000..2b32c085c2fd --- /dev/null +++ b/gallery/how_to/tune_with_autotvm/README.txt @@ -0,0 +1,9 @@ +.. _tutorials-autotvm-sec: + +Auto-Tune with Templates and AutoTVM +------------------------------------ + +AutoTVM offers a way to tune models and operators by providing a template +schedule, and searching the parameter space defined by the template. These +how-tos demonstrate how to write template schedules and optimize them for a +variety of different hardware platforms. 
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py similarity index 100% rename from tutorials/autotvm/tune_conv2d_cuda.py rename to gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py diff --git a/tutorials/autotvm/tune_relay_arm.py b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py similarity index 100% rename from tutorials/autotvm/tune_relay_arm.py rename to gallery/how_to/tune_with_autotvm/tune_relay_arm.py diff --git a/tutorials/autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py similarity index 100% rename from tutorials/autotvm/tune_relay_cuda.py rename to gallery/how_to/tune_with_autotvm/tune_relay_cuda.py diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py similarity index 100% rename from tutorials/autotvm/tune_relay_mobile_gpu.py rename to gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py diff --git a/tutorials/autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py similarity index 100% rename from tutorials/autotvm/tune_relay_x86.py rename to gallery/how_to/tune_with_autotvm/tune_relay_x86.py diff --git a/gallery/how_to/work_with_microtvm/README.txt b/gallery/how_to/work_with_microtvm/README.txt new file mode 100644 index 000000000000..a3cc5a0c6c43 --- /dev/null +++ b/gallery/how_to/work_with_microtvm/README.txt @@ -0,0 +1,7 @@ +.. _tutorial-micro: + +Work With microTVM +------------------ +microTVM enables inference on bare-metal platforms, for example, those without +a traditional Operating System such as Linux, OS X, or Windows. These how-tos +demonstrate how to tune and deploy models with microTVM. 
diff --git a/tutorials/micro/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py similarity index 100% rename from tutorials/micro/micro_autotune.py rename to gallery/how_to/work_with_microtvm/micro_autotune.py diff --git a/tutorials/micro/micro_reference_vm.py b/gallery/how_to/work_with_microtvm/micro_reference_vm.py similarity index 100% rename from tutorials/micro/micro_reference_vm.py rename to gallery/how_to/work_with_microtvm/micro_reference_vm.py diff --git a/tutorials/micro/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py similarity index 99% rename from tutorials/micro/micro_tflite.py rename to gallery/how_to/work_with_microtvm/micro_tflite.py index d1d4477f63e1..cab105cb450f 100644 --- a/tutorials/micro/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. _microTVM-with-TFLite: + microTVM with TFLite Models =========================== **Author**: `Tom Gall `_ diff --git a/gallery/how_to/work_with_relay/README.txt b/gallery/how_to/work_with_relay/README.txt new file mode 100644 index 000000000000..e25e1782c03f --- /dev/null +++ b/gallery/how_to/work_with_relay/README.txt @@ -0,0 +1,2 @@ +Work With Relay +--------------- diff --git a/tutorials/frontend/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py similarity index 100% rename from tutorials/frontend/build_gcn.py rename to gallery/how_to/work_with_relay/build_gcn.py diff --git a/tutorials/frontend/using_external_lib.py b/gallery/how_to/work_with_relay/using_external_lib.py similarity index 100% rename from tutorials/frontend/using_external_lib.py rename to gallery/how_to/work_with_relay/using_external_lib.py diff --git a/gallery/how_to/work_with_schedules/README.txt b/gallery/how_to/work_with_schedules/README.txt new file mode 100644 index 000000000000..6516af3b3d2f --- /dev/null +++ b/gallery/how_to/work_with_schedules/README.txt @@ 
-0,0 +1,2 @@ +Work With Tensor Expression and Schedules +----------------------------------------- diff --git a/tutorials/language/extern_op.py b/gallery/how_to/work_with_schedules/extern_op.py similarity index 100% rename from tutorials/language/extern_op.py rename to gallery/how_to/work_with_schedules/extern_op.py diff --git a/tutorials/language/intrin_math.py b/gallery/how_to/work_with_schedules/intrin_math.py similarity index 100% rename from tutorials/language/intrin_math.py rename to gallery/how_to/work_with_schedules/intrin_math.py diff --git a/tutorials/language/reduction.py b/gallery/how_to/work_with_schedules/reduction.py similarity index 100% rename from tutorials/language/reduction.py rename to gallery/how_to/work_with_schedules/reduction.py diff --git a/tutorials/language/scan.py b/gallery/how_to/work_with_schedules/scan.py similarity index 100% rename from tutorials/language/scan.py rename to gallery/how_to/work_with_schedules/scan.py diff --git a/tutorials/language/schedule_primitives.py b/gallery/how_to/work_with_schedules/schedule_primitives.py similarity index 99% rename from tutorials/language/schedule_primitives.py rename to gallery/how_to/work_with_schedules/schedule_primitives.py index ade79f69707f..65fdeda57c3b 100644 --- a/tutorials/language/schedule_primitives.py +++ b/gallery/how_to/work_with_schedules/schedule_primitives.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. 
_schedule_primitives: + Schedule Primitives in TVM ========================== **Author**: `Ziheng Jiang `_ diff --git a/tutorials/language/tedd.py b/gallery/how_to/work_with_schedules/tedd.py similarity index 100% rename from tutorials/language/tedd.py rename to gallery/how_to/work_with_schedules/tedd.py diff --git a/tutorials/language/tensorize.py b/gallery/how_to/work_with_schedules/tensorize.py similarity index 100% rename from tutorials/language/tensorize.py rename to gallery/how_to/work_with_schedules/tensorize.py diff --git a/tutorials/language/tuple_inputs.py b/gallery/how_to/work_with_schedules/tuple_inputs.py similarity index 100% rename from tutorials/language/tuple_inputs.py rename to gallery/how_to/work_with_schedules/tuple_inputs.py diff --git a/gallery/tutorial/README.txt b/gallery/tutorial/README.txt new file mode 100644 index 000000000000..ceac645d7dc1 --- /dev/null +++ b/gallery/tutorial/README.txt @@ -0,0 +1,10 @@ +User Tutorial +------------- + +This tutorial provides an introduction to TVM, meant to address users who are new +to the TVM project. It begins with some basic information on how TVM works, +then works through installing TVM, compiling and optimizing models, then +digging deeper into the Tensor Expression language and the tuning and +optimization tools that are built on top of it. After completing the tutorial, +a new user should be familiar enough with TVM to optimize models, and will be +prepared to dig into TVM more deeply. 
diff --git a/tutorials/get_started/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py similarity index 100% rename from tutorials/get_started/auto_scheduler_matmul_x86.py rename to gallery/tutorial/auto_scheduler_matmul_x86.py diff --git a/tutorials/get_started/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py similarity index 100% rename from tutorials/get_started/autotvm_matmul_x86.py rename to gallery/tutorial/autotvm_matmul_x86.py diff --git a/tutorials/get_started/autotvm_relay_x86.py b/gallery/tutorial/autotvm_relay_x86.py similarity index 98% rename from tutorials/get_started/autotvm_relay_x86.py rename to gallery/tutorial/autotvm_relay_x86.py index 67faec4505a6..8b9c45c2a859 100644 --- a/tutorials/get_started/autotvm_relay_x86.py +++ b/gallery/tutorial/autotvm_relay_x86.py @@ -81,10 +81,9 @@ # # .. note:: Working with Other Model Formats # -# TVM supports many popular model formats. A list can be found in the `Compile -# Deep Learning Models -# `_ -# section of the TVM Documentation. +# TVM supports many popular model formats. A list can be found in the +# :ref:`Compile Deep Learning Models ` section of the TVM +# Documentation. model_url = "".join( [ @@ -150,9 +149,8 @@ # # Specifying the correct target can have a huge impact on the performance of # the compiled module, as it can take advantage of hardware features -# available on the target. For more information, please refer to `Auto-tuning -# a convolutional network for x86 CPU -# `_. +# available on the target. For more information, please refer to +# :ref:`Auto-tuning a convolutional network for x86 CPU `. # We recommend identifying which CPU you are running, along with optional # features, and set the target appropriately. 
For example, for some # processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm diff --git a/tutorials/get_started/cross_compilation_and_rpc.py b/gallery/tutorial/cross_compilation_and_rpc.py similarity index 100% rename from tutorials/get_started/cross_compilation_and_rpc.py rename to gallery/tutorial/cross_compilation_and_rpc.py diff --git a/tutorials/get_started/install.py b/gallery/tutorial/install.py similarity index 94% rename from tutorials/get_started/install.py rename to gallery/tutorial/install.py index b69b8b493a4f..67ce093b9d7f 100644 --- a/tutorials/get_started/install.py +++ b/gallery/tutorial/install.py @@ -35,8 +35,8 @@ # allow you to enable specific features such as GPU support, microcontroller # support (microTVM), and a debugging runtime, and other features. You will also # want to install from source if you want to actively contribute to the TVM -# project. The full instructions are on the `Install TVM From Source -# `_ page. +# project. The full instructions are on the :ref:`Install TVM From Source +# ` page. ################################################################################ # Installing From Binary Packages diff --git a/tutorials/topi/intro_topi.py b/gallery/tutorial/intro_topi.py similarity index 99% rename from tutorials/topi/intro_topi.py rename to gallery/tutorial/intro_topi.py index 8138e4718cd9..dad8c53bf4ae 100644 --- a/tutorials/topi/intro_topi.py +++ b/gallery/tutorial/intro_topi.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. _tutorial-topi: + Introduction to TOPI ==================== **Author**: `Ehsan M. 
Kermani `_ diff --git a/tutorials/get_started/introduction.py b/gallery/tutorial/introduction.py similarity index 100% rename from tutorials/get_started/introduction.py rename to gallery/tutorial/introduction.py diff --git a/tutorials/get_started/relay_quick_start.py b/gallery/tutorial/relay_quick_start.py similarity index 100% rename from tutorials/get_started/relay_quick_start.py rename to gallery/tutorial/relay_quick_start.py diff --git a/tutorials/get_started/tensor_expr_get_started.py b/gallery/tutorial/tensor_expr_get_started.py similarity index 99% rename from tutorials/get_started/tensor_expr_get_started.py rename to gallery/tutorial/tensor_expr_get_started.py index 310d6bdbfee4..fda332cb63ba 100644 --- a/tutorials/get_started/tensor_expr_get_started.py +++ b/gallery/tutorial/tensor_expr_get_started.py @@ -512,7 +512,7 @@ def evaluate_addition(func, target, optimization, log): # before it moves on to the next stage. # # A complete description of these primitives can be found in the -# [Schedule Primitives](https://tvm.apache.org/docs/tutorials/language/schedule_primitives.html) docs page. +# :ref:`Schedule Primitives ` docs page. ################################################################################ # Example 2: Manually Optimizing Matrix Multiplication with TE diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py similarity index 98% rename from tutorials/get_started/tvmc_command_line_driver.py rename to gallery/tutorial/tvmc_command_line_driver.py index ea3254054ecf..7a0b97895e4f 100644 --- a/tutorials/get_started/tvmc_command_line_driver.py +++ b/gallery/tutorial/tvmc_command_line_driver.py @@ -154,11 +154,9 @@ # Specifying the correct target (option ``--target``) can have a huge # impact on the performance of the compiled module, as it can take # advantage of hardware features available on the target. 
For more -# information, please refer to :ref:`Auto-tuning a convolutional network for +# x86 CPU `. We recommend identifying which CPU you are +# running, along with optional features, and set the target appropriately. ################################################################################ # Running the Model from The Compiled Module with TVMC diff --git a/include/tvm/driver/driver_api.h b/include/tvm/driver/driver_api.h index 418d532fdd5f..45a938247cc8 100644 --- a/include/tvm/driver/driver_api.h +++ b/include/tvm/driver/driver_api.h @@ -30,6 +30,7 @@ #define TVM_DRIVER_DRIVER_API_H_ #include +#include #include #include #include @@ -43,6 +44,34 @@ #include namespace tvm { +using tvm::transform::Pass; + +/*! + * \brief Configures and returns the composite Pass for the fused module (pre split) that contains + * device and host code. + * \param mixed_mod The original mixed module. + * \param target The device Target. + * \return The composite Pass for the fused module. + */ +TVM_DLL transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target); + +/*! + * \brief Configures and returns the composite Pass for the device Target after device/host from + * mixed module. + * \param mixed_mod The optimized mixed module. + * \param target The device Target. + * \return The composite Pass for the device module. + */ +TVM_DLL transform::Sequential DeviceModulePassManager(IRModule mixed_mod, Target target); + +/*! + * \brief Configures and returns the composite Pass for the host Target after device/host from mixed + * module. + * \param mixed_mod The optimized mixed module. + * \param target_host The host Target. + * \return The composite Pass for the host module. 
+ */ +TVM_DLL transform::Sequential HostModulePassManager(IRModule mixed_mod, Target target_host); /*! * \brief Lower an IRModule (optimize with it with the pass list defined in CreatePassList) @@ -136,6 +165,7 @@ TVM_DLL runtime::Module build(const Map& input, const Target& * \return The built module that contains code for different processors. */ TVM_DLL runtime::Module build(const Map& input, const Target& target_host); + } // namespace tvm #endif // TVM_DRIVER_DRIVER_API_H_ diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h index de60deb9cccb..26d2c72c824d 100644 --- a/include/tvm/relay/attrs/nn.h +++ b/include/tvm/relay/attrs/nn.h @@ -686,6 +686,7 @@ struct MaxPool2DAttrs : public tvm::AttrsNode { Array padding; Array dilation; tvm::String layout; + tvm::String out_layout; bool ceil_mode; TVM_DECLARE_ATTRS(MaxPool2DAttrs, "relay.attrs.MaxPool2DAttrs") { @@ -709,6 +710,13 @@ struct MaxPool2DAttrs : public tvm::AttrsNode { "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" "dimensions respectively. Pooling is applied on the 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Pooling is applied on the 'H' and" + "'W' dimensions."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); } @@ -721,6 +729,7 @@ struct AvgPool2DAttrs : public tvm::AttrsNode { Array padding; Array dilation; tvm::String layout; + tvm::String out_layout; bool ceil_mode; bool count_include_pad; @@ -745,6 +754,13 @@ struct AvgPool2DAttrs : public tvm::AttrsNode { "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" "dimensions respectively. 
Pooling is applied on the 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Pooling is applied on the 'H' and" + "'W' dimensions."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); TVM_ATTR_FIELD(count_include_pad) @@ -756,6 +772,7 @@ struct AvgPool2DAttrs : public tvm::AttrsNode { /*! \brief Attributes for global pool operator */ struct GlobalPool2DAttrs : public tvm::AttrsNode { tvm::String layout; + tvm::String out_layout; TVM_DECLARE_ATTRS(GlobalPool2DAttrs, "relay.attrs.GlobalPool2DAttrs") { TVM_ATTR_FIELD(layout).set_default("NCHW").describe( @@ -763,6 +780,13 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode { "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" "dimensions respectively. Pooling is applied on the 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Pooling is applied on the 'H' and" + "'W' dimensions."); } }; @@ -770,6 +794,7 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode { struct AdaptivePool1DAttrs : public tvm::AttrsNode { Array output_size; std::string layout; + tvm::String out_layout; TVM_DECLARE_ATTRS(AdaptivePool1DAttrs, "relay.attrs.AdaptivePool1DAttrs") { TVM_ATTR_FIELD(output_size).set_default(Array({})).describe("Output width."); @@ -778,6 +803,13 @@ struct AdaptivePool1DAttrs : public tvm::AttrsNode { "'N', 'C', 'W' stands for batch, channel, and width" "dimensions respectively. Pooling is applied on the" "'W' dimension."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. 
Can be 'NCW', 'NWC', etc." + "'N', 'C', 'W' stands for batch, channel, and width" + "dimensions respectively. Pooling is applied on the" + "'W' dimension."); } }; @@ -785,6 +817,7 @@ struct AdaptivePool1DAttrs : public tvm::AttrsNode { struct AdaptivePool2DAttrs : public tvm::AttrsNode { Array output_size; std::string layout; + tvm::String out_layout; TVM_DECLARE_ATTRS(AdaptivePool2DAttrs, "relay.attrs.AdaptivePool2DAttrs") { TVM_ATTR_FIELD(output_size) @@ -795,6 +828,13 @@ struct AdaptivePool2DAttrs : public tvm::AttrsNode { "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" "dimensions respectively. Pooling is applied on the 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCHW', 'NHWC', etc." + "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" + "dimensions respectively. Pooling is applied on the 'H' and" + "'W' dimensions."); } }; @@ -802,6 +842,7 @@ struct AdaptivePool2DAttrs : public tvm::AttrsNode { struct AdaptivePool3DAttrs : public tvm::AttrsNode { Array output_size; std::string layout; + tvm::String out_layout; TVM_DECLARE_ATTRS(AdaptivePool3DAttrs, "relay.attrs.AdaptivePool3DAttrs") { TVM_ATTR_FIELD(output_size) @@ -812,6 +853,13 @@ struct AdaptivePool3DAttrs : public tvm::AttrsNode { "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" "dimensions respectively. Pooling is applied on 'D', 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCDHW', 'NDHWC', etc." + "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" + "dimensions respectively. 
Pooling is applied on 'D', 'H' and" + "'W' dimensions."); } }; @@ -822,6 +870,7 @@ struct MaxPool1DAttrs : public tvm::AttrsNode { Array dilation; Array padding; std::string layout; + tvm::String out_layout; bool ceil_mode; TVM_DECLARE_ATTRS(MaxPool1DAttrs, "relay.attrs.MaxPool1DAttrs") { @@ -844,6 +893,12 @@ struct MaxPool1DAttrs : public tvm::AttrsNode { "Dimension ordering of input data. Can be 'NCW', 'NWC', etc." "'N', 'C', 'W' stands for batch, channel, and width" "dimensions respectively. Pooling is applied on the 'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCW', 'NWC', etc." + "'N', 'C', 'W' stands for batch, channel, and width" + "dimensions respectively. Pooling is applied on the 'W' dimensions."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); } @@ -856,6 +911,7 @@ struct AvgPool1DAttrs : public tvm::AttrsNode { Array dilation; Array padding; std::string layout; + tvm::String out_layout; bool ceil_mode; bool count_include_pad; @@ -879,6 +935,12 @@ struct AvgPool1DAttrs : public tvm::AttrsNode { "Dimension ordering of input data. Can be 'NCW', 'NHC', etc." "'N', 'C', 'W' stands for batch, channel, and width" "dimensions respectively. Pooling is applied on the 'W' dimension."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCW', 'NHC', etc." + "'N', 'C', 'W' stands for batch, channel, and width" + "dimensions respectively. 
Pooling is applied on the 'W' dimension."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); TVM_ATTR_FIELD(count_include_pad) @@ -894,6 +956,7 @@ struct MaxPool3DAttrs : public tvm::AttrsNode { Array dilation; Array padding; std::string layout; + tvm::String out_layout; bool ceil_mode; TVM_DECLARE_ATTRS(MaxPool3DAttrs, "relay.attrs.MaxPool3DAttrs") { @@ -917,6 +980,13 @@ struct MaxPool3DAttrs : public tvm::AttrsNode { "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" "dimensions respectively. Pooling is applied on the 'D', 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCDHW', 'NDHWC', etc." + "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" + "dimensions respectively. Pooling is applied on the 'D', 'H' and" + "'W' dimensions."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); } @@ -929,6 +999,7 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { Array dilation; Array padding; std::string layout; + tvm::String out_layout; bool ceil_mode; bool count_include_pad; @@ -953,6 +1024,13 @@ struct AvgPool3DAttrs : public tvm::AttrsNode { "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" "dimensions respectively. Pooling is applied on the 'D', 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(out_layout) + .set_default("") + .describe( + "Dimension ordering of output data. Can be 'NCDHW', 'NDHWC', etc." + "'N', 'C', 'D', 'H', 'W' stands for batch, channel, depth, height, and width" + "dimensions respectively. 
Pooling is applied on the 'D', 'H' and" + "'W' dimensions."); TVM_ATTR_FIELD(ceil_mode).set_default(false).describe( "When true, will use ceil instead of floor to compute the output shape."); TVM_ATTR_FIELD(count_include_pad) diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h index 7ee140622bfc..7b9a68063f16 100644 --- a/include/tvm/runtime/profiling.h +++ b/include/tvm/runtime/profiling.h @@ -452,11 +452,23 @@ class CountNode : public Object { TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object); }; -/*! \brief String representation of an array or NDArray shapes +/*! \brief String representation of an array of NDArray shapes * \param shapes Array of NDArrays to get the shapes of. * \return A textual representation of the shapes. For example: `float32[2], int64[1, 2]`. */ String ShapeString(const std::vector& shapes); +/*! \brief String representation of shape encoded as an NDArray + * \param shape NDArray containing the shape. + * \param dtype The dtype of the shape. + * \return A textual representation of the shape. For example: `float32[2]`. + */ +String ShapeString(NDArray shape, DLDataType dtype); +/*! \brief String representation of a shape encoded as a vector + * \param shape Shape as a vector of integers. + * \param dtype The dtype of the shape. + * \return A textual representation of the shape. For example: `float32[2]`. + */ +String ShapeString(const std::vector& shape, DLDataType dtype); } // namespace profiling } // namespace runtime diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index 831336b9dbfe..039b1894d7c4 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -198,14 +198,14 @@ class VirtualMachine : public runtime::ModuleNode { * \param reg The register to read from. * \return The read object. */ - inline ObjectRef ReadRegister(RegName reg) const; + ObjectRef ReadRegister(RegName reg) const; /*! 
* \brief Read a VM register and cast it to int32_t * \param reg The register to read from. * \return The read scalar. */ - inline int64_t LoadScalarInt(RegName reg) const; + int64_t LoadScalarInt(RegName reg) const; /*! * \brief Invoke a VM function. @@ -268,6 +268,22 @@ class VirtualMachine : public runtime::ModuleNode { */ void SetInput(std::string name, TVMArgs args, int offset); + /*! + * \brief Internal hook for profiling the start of an op. + * + * This hook is only called on certain ops that are likely to take a + * significant amount of runtime (normally because they alloc or transfer to + * device). + * + * \param instr Instruction that will be executed after this hook fires + */ + virtual void OpStartHook(Instruction instr); + + /*! + * \brief Internal hook for profiling the end of an op. + */ + virtual void OpStopHook(); + protected: /*! \brief The virtual machine's packed function table. */ std::vector packed_funcs_; diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h index 8a2bbcbd0121..e802a3088d2d 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -24,6 +24,7 @@ #ifndef TVM_TARGET_TARGET_KIND_H_ #define TVM_TARGET_TARGET_KIND_H_ +#include #include #include @@ -33,6 +34,33 @@ #include namespace tvm { + +class Target; + +/*! + * \brief RelayToTIR tvm::transform::Pass specific to a TargetKind + * + * Called before the default lowering passes. + * + * \param mod The module that an optimization pass runs on. + * \param pass_ctx The pass context that can provide information for the optimization. + * + * \return The transformed module. + */ +using FTVMRelayToTIR = transform::Pass; + +/*! 
+ * \brief TIRToRuntime conversion specific to a TargetKind + * + * This function is responsible for scanning an IRModule for appropriate Target-specific functions + and generating a Runtime module representing the compiled output + * + * \param ir_module Unified IRModule + * \param target Target to filter on or retrieve arguments from + * \return Runtime Module containing compiled functions + */ +using FTVMTIRToRuntime = runtime::TypedPackedFunc; + namespace detail { template struct ValueTypeInfoMaker; @@ -201,6 +229,12 @@ class TargetKindRegEntry { * \return The entry names. */ TVM_DLL static Array ListTargetKinds(); + /*! + * \brief Get all supported option names and types for a given Target kind. + * \return Map of option name to type + */ + TVM_DLL static Map ListTargetKindOptions(const TargetKind& kind); + /*! * \brief Register or get a new entry. * \param target_kind_name The name of the TargetKind. diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index 23057f7140e4..e4a3d3d1e21b 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -199,9 +199,10 @@ class LinkedParam : public ObjectRef { * def mem_copy(a: T.handle, b: T.handle, m: T.int32, n: T.int32) -> None: * A = T.match_buffer(a, (m, n), "float32") * B = T.match_buffer(b, (m, n), "float32") - * - * with T.block([m, n], "") as [vi, vj]: - * B[vi, vj] = A[vi, vj] + * for i, j in T.grid(m, n): + * with T.block(): + * vi, vj = T.axis.remap("SS", [i, j]) + * B[vi, vj] = A[vi, vj] * \endcode * * Then we can make it specialized with given shapes or buffers. 
@@ -218,9 +219,10 @@ class LinkedParam : public ObjectRef { * def mem_copy_16_16(a: T.handle, b: T.handle) -> None: * A = T.match_buffer(a, (16, 16), "float32") * B = T.match_buffer(b, (16, 16), "float32") - * - * with T.block([16, 16], "") as [vi, vj]: - * B[vi, vj] = A[vi, vj] + * for i, j in T.grid(16, 16): + * with T.block(): + * vi, vj = T.axis.remap("SS", [i, j]) + * B[vi, vj] = A[vi, vj] * \endcode */ PrimFunc Specialize(PrimFunc func, const Map& param_map); diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 5cd860b8e929..4f5772822d9e 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -1078,9 +1078,9 @@ class MatchBufferRegion : public ObjectRef { * \note Block's body is parameterized by iter vars. * \code * - * with T.block([extent0, extent1, ...], name) as [v0, v1, ...]: - * T.bind(v0, value0) - * T.bind(v1, value1) + * with T.block(name): + * v0 = T.axis.S(domain, value0) + * v1 = T.axis.R(domain, value1) * ... * T.reads([buffer0[start:end, ...], ...]) * T.writes([buffer1[start:end, ...], ...]) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index e94b966bc0fc..e6b0af9773d9 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -388,7 +388,7 @@ TVM_DLL Pass ConvertBlocksToOpaque(); * \code * * for i in range(0, 16): - * with T.block([]): + * with T.block(): * B = T.alloc_buffer(16, 16) * for j in range(0, 16): * B[i, j] = A[i, j] + 1 @@ -404,7 +404,7 @@ TVM_DLL Pass ConvertBlocksToOpaque(); * \code * * for i in range(0, 16): - * with T.block([]): + * with T.block(): * B = T.alloc_buffer(1, 16) * for j in range(0, 16): * B[0, j] = A[i, j] + 1 @@ -463,6 +463,15 @@ TVM_DLL Pass UnifyThreadBinding(); */ TVM_DLL Pass MergeDynamicSharedMemoryAllocations(); +/*! + * \brief This pass is post-scheduling pass to convert all + * Parallel For loops to Serial ones. This is run + * to attain lesser memory and/or executor/backend + * does not support parallel launch of For loops. 
+ * \return The pass. + */ +TVM_DLL Pass ConvertForLoopsToSerial(); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 8d1a49a4cc5f..3df9caf55d5c 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -1233,8 +1233,10 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices, } ICHECK_GE(axis, 0); ICHECK_LT(axis, ndim_d); - size_t indices_dim_i = static_cast(GetConstInt(indices->shape[axis])); - ICHECK_GE(indices_dim_i, 1); + if (indices->shape[axis].as()) { + size_t indices_dim_i = static_cast(GetConstInt(indices->shape[axis])); + ICHECK_GE(indices_dim_i, 1); + } ICHECK(indices->dtype.is_int()); Array out_shape; diff --git a/python/gen_requirements.py b/python/gen_requirements.py index fa94d6a64130..e9f3772ee733 100755 --- a/python/gen_requirements.py +++ b/python/gen_requirements.py @@ -198,6 +198,7 @@ "sphinx_autodoc_annotation", "sphinx_gallery", "sphinx_rtd_theme", + "types-psutil", ], ), ), @@ -250,7 +251,7 @@ ("sphinx_autodoc_annotation", None), ("sphinx_gallery", None), ("sphinx_rtd_theme", None), - ("synr", "==0.4.1"), + ("synr", "==0.5.0"), ("tensorflow", None), ("tensorflow-estimator", None), ("tflite", None), diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index 1aa6964d952f..fc8768448dde 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -53,9 +53,9 @@ def _dlpack_deleter(pycapsule): if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) # enforce type to make sure it works for all ctypes - ptr = ctypes.cast(ctypes.c_void_p, ptr) + ptr = ctypes.cast(ptr, ctypes.c_void_p) _LIB.TVMDLManagedTensorCallDeleter(ptr) - ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None) 
_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py index 8d2591dce50b..f73c65fbd1d8 100644 --- a/python/tvm/autotvm/feature.py +++ b/python/tvm/autotvm/feature.py @@ -31,7 +31,6 @@ import tvm._ffi from tvm.target import Target -from tvm.te import schedule from tvm.driver import build_module @@ -39,13 +38,12 @@ def ana_lower(sch, args, binds=None, simple_mode=True): """Do lower while keeping all axes in IR i.e. Do not eliminate loop with extent of 1, do not vectorize, unroll or inject virtual threads """ - binds, _ = build_module.get_binds(args, compact=False, binds=binds) sch = sch.normalize() # Phase 0 - bounds = schedule.InferBound(sch) - stmt = schedule.ScheduleOps(sch, bounds, True) - func = schedule.SchedulePostProcToPrimFunc(args, stmt, None) - mod = tvm.IRModule.from_expr(func._move()) + context = tvm.transform.PassContext(config={"tir.debug_keep_trivial_loop": True}) + with context: + mod = build_module.schedule_to_module(sch, args, binds=binds) + mod = tvm.tir.transform.StorageFlatten(64)(mod._move()) mod = tvm.tir.transform.Simplify()(mod._move()) assert simple_mode diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index beb1aa03090d..25d56cf8cf02 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -166,7 +166,7 @@ def __init__( if isinstance(graph, relay.function.Function): node_dict = {} graph = bind_inputs(graph, input_shapes, dtype) - expr2graph(graph, self._target_ops, node_dict, self._node_list) + expr2graph(graph, self._target_ops, node_dict, self._node_list, target) else: raise RuntimeError("Unsupported graph type: %s" % str(type(graph))) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index f61d34284e01..723e7fa77006 100644 --- 
a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -17,6 +17,7 @@ # pylint: disable=too-many-locals,too-many-statements,too-many-branches,protected-access """API for graph traversing.""" import threading +import re import tvm from tvm import relay, autotvm @@ -30,7 +31,7 @@ from .._base import OPT_OUT_OP -def expr2graph(expr, target_ops, node_dict, node_list): +def expr2graph(expr, target_ops, node_dict, node_list, tvm_target): """Convert relay expr to graph data structure and fetch workloads of target operators. @@ -50,6 +51,9 @@ def expr2graph(expr, target_ops, node_dict, node_list): Each node will be stored as a dictionary in the format of {"op": str, "node": tvm.relay.expr, "inputs": [int], "types": [tvm.relay.Type], "name": str, "workloads": [tuple], "topi_op": [function]} + + tvm_target : tvm.target + The TVM target object. """ # TODO(@kevinthesun, @icemelon9): Currently graph tuning pass relies on the fact # that # autotvm tasks == # ops. But this won't be true after having relay op @@ -58,12 +62,12 @@ def expr2graph(expr, target_ops, node_dict, node_list): env.reset(target_ops) # pylint: disable=not-context-manager with env: - _expr2graph_impl(expr, target_ops, node_dict, node_list) + _expr2graph_impl(expr, target_ops, node_dict, node_list, tvm_target) task_pos = 0 for node_entry in node_list: if node_entry["op"] in target_ops: task_name, args = env.task_collection[task_pos] - task = autotvm.task.create(task_name, args, target="llvm") + task = autotvm.task.create(task_name, args, target=tvm_target) node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 @@ -77,7 +81,18 @@ def _infer_type(node): return entry if isinstance(node, relay.Function) else entry.body -def _expr2graph_impl(expr, target_ops, node_dict, node_list): +def _replace_device_with_tracing(target): + """This is to replace -device=XXX with -device=tracing in the tvm_target string. 
+ It is a stand-alone function for testability. + We need to have device=tracing in order to fetch the workloads, it is not used + for anything beyond that so it is safe to override the device here only.""" + target = str(target) + if "-device" in target: + return re.sub("-device=[^\\-$]+", "-device=tracing ", target).strip(" ") + return target + " -device=tracing" + + +def _expr2graph_impl(expr, target_ops, node_dict, node_list, tvm_target): """Implementation to convert relay expr to graph data structure""" def _traverse_expr(node): @@ -128,8 +143,9 @@ def _traverse_expr(node): call = relay.Call(node.op, params, node.attrs) mod = tvm.IRModule.from_expr(relay.Function(params, call)) relay.backend.compile_engine.get().clear() + tracing_target = _replace_device_with_tracing(tvm_target) build_thread = threading.Thread( - target=relay.build, args=(mod, "llvm -device=tracing", None, None) + target=relay.build, args=(mod, tracing_target, None, None) ) build_thread.start() build_thread.join() @@ -139,7 +155,7 @@ def _traverse_expr(node): elif isinstance(node, Function): # Ignore root node since it equals to input function expression if node != expr: - _expr2graph_impl(node, target_ops, node_dict, node_list) + _expr2graph_impl(node, target_ops, node_dict, node_list, tvm_target) return elif isinstance(node, TupleGetItem): in_node_idx = node_dict[node.tuple_value] diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py index 36c03891d210..37b9fed8eb91 100644 --- a/python/tvm/contrib/pipeline_executor.py +++ b/python/tvm/contrib/pipeline_executor.py @@ -16,6 +16,7 @@ # under the License. """Pipeline executor that executes a series of modules in a pipeline fashion.""" import json +import os import tvm._ffi from tvm import relay from tvm.relay.transform import InferType @@ -47,13 +48,13 @@ def build(pipe_configs): ret: PipelineExecutorFactoryModule Common interface for pipeline executor factory modules. 
""" - mods = {} + libs = {} mod_n_configs = pipe_configs.get_config() config_len = len(mod_n_configs) string_config = [{} for _ in range(config_len)] for ir_mod, mod_config in mod_n_configs.items(): mconf = mod_config["pipeline"].copy() - mod_idx = mconf["mod_idx"] - 1 + mod_idx = mconf["mod_idx"] dev = mod_config["dev"] target = mod_config["target"] build_func = relay.build @@ -61,7 +62,7 @@ def build(pipe_configs): if "build" in mod_config and mod_config["build"]: build_func = mod_config["build"] - mod = build_func( + lib = build_func( ir_mod, target, params=mod_config["params"], @@ -72,9 +73,9 @@ def build(pipe_configs): mconf["dev"] = "{},{}".format(dev.device_type, dev.device_id) # Create a pipeline configuration. string_config[mod_idx] = mconf - mods[mod] = {"dev": dev} + libs[mod_idx] = {"lib": lib, "dev": dev} - return PipelineExecutorFactoryModule(mods, string_config) + return PipelineExecutorFactoryModule(libs, string_config) class PipelineModule(object): @@ -82,12 +83,59 @@ class PipelineModule(object): Parameters ---------- - module : PipelineExecutorFactoryModule - Common interface for pipeline executor factory modules. + module : Union[PipelineExecutorFactoryModule, Module] + Common interface for pipeline executor factory modules or Module. """ def __init__(self, module): - self.module = module.module + if isinstance(module, PipelineExecutorFactoryModule): + self.module = module.module + else: + self.module = module + # Get the packed functions from the pipeline executor. + self._get_num_outputs = self.module["get_num_outputs"] + + @property + def num_outputs(self): + """Get the number of outputs. + Returns + ------- + count : int + The number of outputs. + """ + return self._get_num_outputs() + + @staticmethod + def load_library(config_file_name): + """Import files to create a pipeline executor. 
+ + Parameters + ---------- + config_file_name : str + Path and name of the configuration file, the configuration file contains the + disk path of the parameter file, library file, and JSON file. + """ + with open(config_file_name, "r") as file_handle: + config = file_handle.read() + config = json.loads(config) + if "load_config" not in config or "pipeline_config" not in config: + raise RuntimeError( + '"load_config" or "pipeline_config" is missing in %s' % config_file_name + ) + + # The config file used to load library, parameters, and JSON files. + with open(config["load_config"], "r") as file_handle: + load_config = file_handle.read() + + # The config file used to load pipeline compute config. + with open(config["pipeline_config"], "r") as file_handle: + pipeline_config = file_handle.read() + + # Load a PipelineExecutor from the disk files. + load_library = tvm._ffi.get_global_func("tvm.pipeline_executor.load", allow_missing=False) + module = load_library(load_config, pipeline_config) + + return PipelineModule(module) class PipelineConfig(object): @@ -139,13 +187,14 @@ def get_owner_idx(self): if isinstance(self.io_owner, PipelineConfig.ModuleWrapper): return self.io_owner.idx - return 0 + return -1 - def is_global_interface(self): - """The global interface is the interface visible to the caller which use a pipeline - executor, the global input interface is responsible for passing parameters to the - internal module interface, and the global output interface is responsible for - outputting the results computed by the pipeline executor to a caller. 
""" return not isinstance(self.io_owner, PipelineConfig.ModuleWrapper) @@ -182,9 +231,9 @@ def check_dag_acyclic(self, start, inputs): def connect(self, binding): """Connect the current interface to the destination interface. - Correct connections are as follows: 1. global input connected to module input, - 2. module output connected to global output, 3. module output connected to - module input. + Correct connections are as follows: 1. the pipeline input connected to a module input, + 2. the module output connected to a pipeline output, 3. the module output connected to + a module input. Parameters ---------- @@ -196,31 +245,31 @@ def connect(self, binding): if self.io_owner == binding.io_owner: raise RuntimeError(f"Can not bind itself.") - if not self.is_global_interface() and self.io_type == "input": + if not self.is_pipeline_executor_interface() and self.io_type == "input": raise RuntimeError(f"Module can only bind from output interface!") if ( - not self.is_global_interface() - and not binding.is_global_interface() + not self.is_pipeline_executor_interface() + and not binding.is_pipeline_executor_interface() and binding.io_type == "output" ): raise RuntimeError(f"Can not bind module output with another module output!") if ( - not self.is_global_interface() - and binding.is_global_interface() + not self.is_pipeline_executor_interface() + and binding.is_pipeline_executor_interface() and binding.io_type == "input" ): - raise RuntimeError(f"Can not bind module output with global input!") + raise RuntimeError(f"Can not bind module output with pipeline input!") - if self.is_global_interface() and self.io_type == "output": + if self.is_pipeline_executor_interface() and self.io_type == "output": raise RuntimeError(f"Global output can not be used as binding start point.") - if self.is_global_interface() and binding.io_type != "input": + if self.is_pipeline_executor_interface() and binding.io_type != "input": raise RuntimeError(f"Global input can only bind with module 
input.") self.bindings.append(binding) - if not self.is_global_interface(): + if not self.is_pipeline_executor_interface(): # Check whether the data types of the source and destination are the same. if ( isinstance(binding.io_owner, PipelineConfig.ModuleWrapper) @@ -431,13 +480,16 @@ def get_config(self): for dep in binding.bindings: dep_item = {} _, dname = dep.get_name() - dep_item["mod_idx"] = dep.get_owner_idx() - dep_item["input_name"] = dname + if dep.is_pipeline_executor_interface(): + dep_item["global_output_index"] = int(dname) + else: + dep_item["mod_idx"] = dep.get_owner_idx() + dep_item["input_name"] = dname dep_conf.append(dep_item) # The value of ouput_idx start from 0. output["output_idx"] = int(binding.name) - output["dependent"] = dep_conf + output["dependencies"] = dep_conf output_conf.append(output) mconf["mod_idx"] = module.idx @@ -472,7 +524,7 @@ def dag_topology_sort(self): mlist += temp_list for mod, i in zip(mlist, range(len(mlist))): - self.mod_wrapper[mod].set_idx_name(i + 1) + self.mod_wrapper[mod].set_idx_name(i) def get_mod_idx(self, mod): # Return the module index. @@ -502,16 +554,13 @@ class PipelineExecutorFactoryModule(object): """ def __init__(self, pipeline_mods, mods_config): - mods, config = self.graph_executor_create(pipeline_mods, mods_config) - assert ( - pipeline_executor_enabled() - ), "Pipeline executor is not enabled. 
Please \ - re-build TVM with USE_PIPELINE_EXECUTOR=ON" - pipeline_create = tvm._ffi.get_global_func( + self.pipeline_mods = pipeline_mods + self.mods_config = mods_config + graph_executors, config = self.graph_executor_create(pipeline_mods, mods_config) + self.pipeline_create = tvm._ffi.get_global_func( "tvm.pipeline_executor.create", allow_missing=False ) - assert pipeline_create - self.module = pipeline_create(mods, config) + self.module = self.pipeline_create(graph_executors, config) def graph_executor_create(self, pipeline_mods, mod_config): """Create graph_executor list and return configuration as a json string. @@ -532,12 +581,70 @@ def graph_executor_create(self, pipeline_mods, mod_config): mod_config : str The Modudle configuration. """ + # Should store modules in the list named 'mods' in index order. + mods = [None for _ in range(len(pipeline_mods))] + for lib_index in pipeline_mods: + pipeline_lib = pipeline_mods[lib_index]["lib"] + dev = pipeline_mods[lib_index]["dev"] + lib = graph_executor.GraphModule(pipeline_lib["default"](dev)) + # Return a module list sorted by lib_index. + mods[lib_index] = lib.module + + return mods, json.dumps(mod_config) + + def export_library(self, directory_path): + """Export the pipeline executor into disk files. - mods = [] - for pipeline_mod in pipeline_mods: - mod = graph_executor.GraphModule( - pipeline_mod["default"](pipeline_mods[pipeline_mod]["dev"]) + Parameters + ---------- + directory_path : str + Export the files to this directory. + """ + if not self.pipeline_mods: + raise RuntimeError(f"The pipeline executor has not been initialized.") + + # Check if the directory_path exists. + if not os.path.exists(directory_path): + raise RuntimeError(f"The directory {directory_path} does not exist.") + # Create an load configuration. 
+ load_config_file_name = "{}/load_config".format(directory_path) + pipeline_config_file_name = "{}/pipeline_config".format(directory_path) + config = {} + config["load_config"] = load_config_file_name + config["pipeline_config"] = pipeline_config_file_name + load_config = [] + # Export the library, JSON, and parameter into files, then export these files path + # into a configuration file. + for lib_index in self.pipeline_mods: + mconfig = {} + mconfig["mod_idx"] = lib_index + mconfig["lib_name"] = "{}/lib{}.so".format(directory_path, lib_index) + mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index) + mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index) + mconfig["dev"] = "{},{}".format( + self.pipeline_mods[lib_index]["dev"].device_type, + self.pipeline_mods[lib_index]["dev"].device_id, ) - mods.append(mod.module) - return mods, json.dumps(mod_config) + # Get the graph, lib, and parameters from GraphExecutorFactoryModule. + graph, lib, params = self.pipeline_mods[lib_index]["lib"] + # Export the lib, graph, and parameters to disk. 
+ lib.export_library(mconfig["lib_name"]) + with open(mconfig["json_name"], "w") as file_handle: + file_handle.write(graph) + with open(mconfig["params_name"], "wb") as file_handle: + file_handle.write(relay.save_param_dict(params)) + + load_config.append(mconfig) + + with open(load_config_file_name, "w") as file_handle: + json.dump(load_config, file_handle) + + with open(pipeline_config_file_name, "w") as file_handle: + json.dump(self.mods_config, file_handle) + + config_file_name = "{}/config".format(directory_path) + with open(config_file_name, "w") as file_handle: + json.dump(config, file_handle) + + return config_file_name diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py index c44a2fe4a136..6d5e10f611db 100644 --- a/python/tvm/contrib/xcode.py +++ b/python/tvm/contrib/xcode.py @@ -45,7 +45,23 @@ def xcrun(cmd): return out.strip() -def create_dylib(output, objects, arch, sdk="macosx"): +def __get_min_os_version(sdk): + if sdk in ("macosx", "iphonesimulator"): + return None + if sdk == "iphoneos": + return "13.0" + raise RuntimeError("Unsupported sdk: %s" % sdk) + + +def __get_min_os_version_cmd(sdk, min_os_version): + if min_os_version is None: + min_os_version = __get_min_os_version(sdk) + if min_os_version is not None: + return "-mios-version-min=" + min_os_version + return "" + + +def create_dylib(output, objects, arch, sdk="macosx", min_os_version=None): """Create dynamic library. 
Parameters @@ -71,6 +87,7 @@ def create_dylib(output, objects, arch, sdk="macosx"): cmd += ["-dynamiclib"] cmd += ["-arch", arch] cmd += ["-isysroot", sdk_path] + cmd += [__get_min_os_version_cmd(sdk, min_os_version)] cmd += ["-o", output] if isinstance(objects, str): cmd += [objects] @@ -90,7 +107,7 @@ def create_dylib(output, objects, arch, sdk="macosx"): create_dylib.output_format = "dylib" -def compile_metal(code, path_target=None, sdk="macosx"): +def compile_metal(code, path_target=None, sdk="macosx", min_os_version=None): """Compile metal with CLI tool from env. Parameters @@ -123,7 +140,14 @@ def compile_metal(code, path_target=None, sdk="macosx"): # # xcrun -sdk macosx metal -c MyLibrary.metal -o MyLibrary.air # xcrun -sdk macosx metallib MyLibrary.air -o MyLibrary.metallib - cmd1 = ["xcrun", "-sdk", sdk, "metal", "-O3"] + min_target = __get_min_os_version_cmd(sdk, min_os_version) + if sdk == "macosx": + language_version = "-std=macos-metal2.3" + elif sdk in ("iphoneos", "iphonesimulator"): + language_version = "-std=ios-metal2.3" + else: + raise RuntimeError("Unsupported sdk: %s" % sdk) + cmd1 = ["xcrun", "-sdk", sdk, "metal", language_version, min_target, "-O3"] cmd1 += ["-c", temp_code, "-o", temp_ir] cmd2 = ["xcrun", "-sdk", sdk, "metallib"] cmd2 += [temp_ir, "-o", file_target] diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index a7ebc00c315f..5ec44c6d6ed1 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -16,27 +16,23 @@ # under the License. # pylint: disable=invalid-name -"""The build utils in python. 
-""" +"""The build utils in python.""" from typing import Union, Optional, List, Mapping -import warnings import tvm.tir from tvm.runtime import Module from tvm.runtime import ndarray from tvm.ir import container -from tvm.ir import CallingConv from tvm.tir import PrimFunc from tvm.ir.module import IRModule -from tvm.ir.transform import PassContext -from tvm.target import codegen from tvm.te import tensor from tvm.te import schedule from tvm.target import Target from tvm.tir.buffer import Buffer from tvm.tir.expr import Var +from tvm.driver import _ffi_api as _driver_ffi from . import _ffi_api as ffi @@ -71,6 +67,11 @@ def schedule_to_module( binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, ) -> IRModule: """According to the given schedule, form a function. + + This is a low-level function intended for testing purposes, and + does not apply any optimization passes. In general, `tvm.lower` + and `tvm.build` should be used instead. + Parameters ---------- sch : tvm.te.schedule.Schedule @@ -104,8 +105,8 @@ def lower( args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]] The argument lists to the function for TE schedule. - It should be None if we want to lower TensorIR. + It should be None if we want to lower TensorIR. name : str The name of the result function. @@ -132,98 +133,6 @@ def lower( raise ValueError("Expected input to be an IRModule, PrimFunc or Schedule, but got, ", type(inp)) -def _build_for_device(input_mod, target, target_host): - """Build the lowered functions for a device with the given compilation - target. - - Parameters - ---------- - input_mod : IRModule - The schedule to be built. - - target : str or :any:`tvm.target.Target` - The target and option of the compilation. - - target_host : str or :any:`tvm.target.Target` - The host compilation target. - - Returns - ------- - fhost : IRModule - The host IRModule. - - mdev : tvm.module - A module that contains device code. 
- """ - target, target_host = Target.check_and_update_host_consist(target, target_host) - device_type = ndarray.device(target.kind.name, 0).device_type - - mod_mixed = input_mod - mod_mixed = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod_mixed) - - opt_mixed = [ - tvm.tir.transform.VerifyMemory(), - tvm.tir.transform.MergeDynamicSharedMemoryAllocations(), - ] - if len(mod_mixed.functions) == 1: - opt_mixed += [tvm.tir.transform.Apply(lambda f: f.with_attr("tir.is_entry_func", True))] - - if PassContext.current().config.get("tir.detect_global_barrier", False): - opt_mixed += [tvm.tir.transform.ThreadSync("global")] - opt_mixed += [ - tvm.tir.transform.ThreadSync("shared"), - tvm.tir.transform.ThreadSync("warp"), - tvm.tir.transform.InferFragment(), - tvm.tir.transform.LowerThreadAllreduce(), - tvm.tir.transform.MakePackedAPI(), - tvm.tir.transform.SplitHostDevice(), - ] - mod_mixed = tvm.transform.Sequential(opt_mixed)(mod_mixed) - - # device optimizations - opt_device = tvm.transform.Sequential( - [ - tvm.tir.transform.Filter( - lambda f: "calling_conv" in f.attrs - and f.attrs["calling_conv"].value == CallingConv.DEVICE_KERNEL_LAUNCH - ), - tvm.tir.transform.LowerWarpMemory(), - tvm.tir.transform.Simplify(), - tvm.tir.transform.LowerDeviceStorageAccessInfo(), - tvm.tir.transform.LowerCustomDatatypes(), - tvm.tir.transform.LowerIntrin(), - ] - ) - mod_dev = opt_device(mod_mixed) - - # host optimizations - opt_host = tvm.transform.Sequential( - [ - tvm.tir.transform.Filter( - lambda f: "calling_conv" not in f.attrs - or f.attrs["calling_conv"].value != CallingConv.DEVICE_KERNEL_LAUNCH - ), - tvm.tir.transform.Apply(lambda f: f.with_attr("target", target_host)), - tvm.tir.transform.LowerTVMBuiltin(), - tvm.tir.transform.LowerDeviceStorageAccessInfo(), - tvm.tir.transform.LowerCustomDatatypes(), - tvm.tir.transform.LowerIntrin(), - tvm.tir.transform.CombineContextCall(), - ] - ) - mod_host = opt_host(mod_mixed) - - if device_type == 
ndarray.cpu(0).device_type and target_host == target: - assert len(mod_dev.functions) == 0 - if "gpu" in target.keys and len(mod_dev.functions) == 0: - warnings.warn( - "Specified target %s, but cannot find device code, did you do " "bind?" % target - ) - - rt_mod_dev = codegen.build_module(mod_dev, target) if len(mod_dev.functions) != 0 else None - return mod_host, rt_mod_dev - - def build( inputs: Union[schedule.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]], args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, @@ -237,7 +146,8 @@ def build( Parameters ---------- - inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]] + inputs : Union[tvm.te.schedule.Schedule, + tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]] The input to be built args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, Var]]] @@ -253,7 +163,7 @@ def build( setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, - otherwise a stackvm intepreter is used. + otherwise a stackvm interpreter is used. name : Optional[str] The name of result function. @@ -350,21 +260,11 @@ def build( target_input_mod, target_host ) - mod_host_all = tvm.IRModule({}) - - device_modules = [] - for tar, input_mod in target_input_mod.items(): - mod_host, mdev = _build_for_device(input_mod, tar, target_host) - mod_host_all.update(mod_host) - device_modules.append(mdev) - - # Generate a unified host module. - rt_mod_host = codegen.build_module(mod_host_all, target_host) + rt_mod_host = _driver_ffi.preprocess_module(target_input_mod, target_host) - # Import all modules. 
- for mdev in device_modules: - if mdev: - rt_mod_host.import_module(mdev) + target_input_mod, target_host = Target.check_and_update_host_consist( + target_input_mod, target_host + ) if not isinstance(target_host, Target): target_host = Target(target_host) diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index dab855abfb11..92d13a99acd5 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -21,7 +21,7 @@ import logging import time from copy import deepcopy -from typing import Optional, Dict, List, Union +from typing import Any, Optional, Dict, List, Union from urllib.parse import urlparse @@ -38,6 +38,7 @@ from .common import TVMCException from .main import register_parser from .model import TVMCModel +from .target import generate_target_args, reconstruct_target_args # pylint: disable=invalid-name @@ -106,16 +107,14 @@ def add_tune_parser(subparsers): help="hostname (required) and port (optional, defaults to 9090) of the RPC tracker, " "e.g. 
'192.168.0.100:9999'", ) - parser.add_argument( - "--target", - help="compilation target as plain string, inline JSON or path to a JSON file", - required=True, - ) + + generate_target_args(parser) parser.add_argument( "--target-host", help="the host compilation target, defaults to 'llvm'", default="llvm", ) + parser.add_argument("--timeout", type=int, default=10, help="compilation timeout, in seconds") parser.add_argument( "--trials", @@ -286,6 +285,7 @@ def drive_tune(args): hardware_params=hardware_params, include_simple_tasks=args.include_simple_tasks, log_estimated_latency=args.log_estimated_latency, + additional_target_options=reconstruct_target_args(args), ) @@ -311,6 +311,7 @@ def tune_model( hardware_params: Optional[HardwareParams] = None, include_simple_tasks: bool = False, log_estimated_latency: bool = False, + additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None, ): """Use tuning to automatically optimize the functions in a model. @@ -367,13 +368,15 @@ def tune_model( the autoscheduler. log_estimated_latency : bool, optional If using the autoscheduler, write the estimated latency at each step of tuning to file. + additional_target_options: Optional[Dict[str, Dict[str, Any]]] + Additional target options in a dictionary to combine with initial Target arguments Returns ------- tuning_records : str The path to the produced tuning log file. """ - target, extra_targets = common.target_from_cli(target) + target, extra_targets = common.target_from_cli(target, additional_target_options) target, target_host = Target.check_and_update_host_consist(target, target_host) # TODO(jwfromm) Remove this deepcopy once AlterOpLayout bug that mutates source # model is fixed. For now, creating a clone avoids the issue. 
diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index 9ef2f6f1fbfa..1ee24cf69d44 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -80,7 +80,7 @@ def convert_graph_layout(mod, desired_layout): ) -def validate_targets(parse_targets): +def validate_targets(parse_targets, additional_target_options=None): """ Apply a series of validations in the targets provided via CLI. """ @@ -104,6 +104,15 @@ def validate_targets(parse_targets): f"Found: {verbose_tvm_targets}." ) + if additional_target_options is not None: + for target_name in additional_target_options: + if not any([target for target in parse_targets if target["name"] == target_name]): + first_option = list(additional_target_options[target_name].keys())[0] + raise TVMCException( + f"Passed --target-{target_name}-{first_option}" + f" but did not specify {target_name} target" + ) + def tokenize_target(target): """ @@ -261,7 +270,21 @@ def is_inline_json(target): return False -def target_from_cli(target): +def _combine_target_options(target, additional_target_options=None): + if additional_target_options is None: + return target + if target["name"] in additional_target_options: + target["opts"].update(additional_target_options[target["name"]]) + return target + + +def _recombobulate_target(target): + name = target["name"] + opts = " ".join([f"-{key}={value}" for key, value in target["opts"].items()]) + return f"{name} {opts}" + + +def target_from_cli(target, additional_target_options=None): """ Create a tvm.target.Target instance from a command line interface (CLI) string. 
@@ -272,6 +295,10 @@ def target_from_cli(target): compilation target as plain string, inline JSON or path to a JSON file + additional_target_options: Optional[Dict[str, Dict[str,str]]] + dictionary of additional target options to be + combined with parsed targets + Returns ------- tvm.target.Target @@ -298,18 +325,22 @@ def target_from_cli(target): except ValueError as ex: raise TVMCException(f"Error parsing target string '{target}'.\nThe error was: {ex}") - validate_targets(parsed_targets) - tvm_targets = [t for t in parsed_targets if t["is_tvm_target"]] + validate_targets(parsed_targets, additional_target_options) + tvm_targets = [ + _combine_target_options(t, additional_target_options) + for t in parsed_targets + if t["is_tvm_target"] + ] # Validated target strings have 1 or 2 tvm targets, otherwise # `validate_targets` above will fail. if len(tvm_targets) == 1: - target = tvm_targets[0]["raw"] + target = _recombobulate_target(tvm_targets[0]) target_host = None else: assert len(tvm_targets) == 2 - target = tvm_targets[0]["raw"] - target_host = tvm_targets[1]["raw"] + target = _recombobulate_target(tvm_targets[0]) + target_host = _recombobulate_target(tvm_targets[1]) extra_targets = [t for t in parsed_targets if not t["is_tvm_target"]] @@ -387,7 +418,7 @@ def parse_shape_string(inputs_string): ---------- inputs_string: str A string of the form "input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]" that - indicates the desired shape for specific model inputs. Colons and forward slashes + indicates the desired shape for specific model inputs. Colons, forward slashes and dots within input_names are supported. Spaces are supported inside of dimension arrays. 
Returns @@ -401,7 +432,8 @@ def parse_shape_string(inputs_string): # * Spaces inside arrays # * forward slashes inside names (but not at the beginning or end) # * colons inside names (but not at the beginning or end) - pattern = r"(?:\w+\/)?[:\w]+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]" + # * dots inside names + pattern = r"(?:\w+\/)?[:\w.]+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]" input_mappings = re.findall(pattern, inputs_string) if not input_mappings: raise argparse.ArgumentTypeError( diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 9eb85a4934cb..7623a141c27a 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -19,7 +19,7 @@ """ import logging import os.path -from typing import Optional, Dict, List, Union, Callable +from typing import Any, Optional, Dict, List, Union, Callable from pathlib import Path import tvm @@ -30,6 +30,7 @@ from . import common, composite_target, frontends from .model import TVMCModel, TVMCPackage from .main import register_parser +from .target import generate_target_args, reconstruct_target_args # pylint: disable=invalid-name @@ -91,11 +92,7 @@ def add_compile_parser(subparsers): "times, each one to set one configuration value, " "e.g. 
'--pass-config relay.backend.use_auto_scheduler=0'.", ) - parser.add_argument( - "--target", - help="compilation targets as comma separated string, inline JSON or path to a JSON file.", - required=True, - ) + generate_target_args(parser) parser.add_argument( "--tuning-records", metavar="PATH", @@ -154,6 +151,7 @@ def drive_compile(args): desired_layout=args.desired_layout, disabled_pass=args.disabled_pass, pass_context_configs=args.pass_config, + additional_target_options=reconstruct_target_args(args), ) return 0 @@ -172,6 +170,7 @@ def compile_model( desired_layout: Optional[str] = None, disabled_pass: Optional[str] = None, pass_context_configs: Optional[List[str]] = None, + additional_target_options: Optional[Dict[str, Dict[str, Any]]] = None, ): """Compile a model from a supported framework into a TVM module. @@ -215,6 +214,8 @@ def compile_model( pass_context_configs: list[str], optional List of strings containing a set of configurations to be passed to the PassContext. + additional_target_options: Optional[Dict[str, Dict[str, Any]]] + Additional target options in a dictionary to combine with initial Target arguments Returns @@ -230,7 +231,7 @@ def compile_model( if desired_layout: mod = common.convert_graph_layout(mod, desired_layout) - tvm_target, extra_targets = common.target_from_cli(target) + tvm_target, extra_targets = common.target_from_cli(target, additional_target_options) tvm_target, target_host = Target.check_and_update_host_consist(tvm_target, target_host) for codegen_from_cli in extra_targets: diff --git a/python/tvm/driver/tvmc/target.py b/python/tvm/driver/tvmc/target.py new file mode 100644 index 000000000000..7a078b8be087 --- /dev/null +++ b/python/tvm/driver/tvmc/target.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This file contains functions for processing target inputs for the TVMC CLI +""" + +from tvm.target import Target + +# We can't tell the type inside an Array but all current options are strings so +# it can default to that. Bool is used alongside Integer but aren't distinguished +# between as both are represented by IntImm +INTERNAL_TO_NATIVE_TYPE = {"runtime.String": str, "IntImm": int, "Array": str} +INTERNAL_TO_HELP = {"runtime.String": " string", "IntImm": "", "Array": " options"} + + +def _generate_target_kind_args(parser, kind): + target_group = parser.add_argument_group(f"target {kind.name}") + for target_option, target_type in kind.options.items(): + if target_type in INTERNAL_TO_NATIVE_TYPE: + target_group.add_argument( + f"--target-{kind.name}-{target_option}", + type=INTERNAL_TO_NATIVE_TYPE[target_type], + help=f"target {kind.name} {target_option}{INTERNAL_TO_HELP[target_type]}", + ) + + +def generate_target_args(parser): + """Walks through the TargetKind registry and generates arguments for each Target's options""" + parser.add_argument( + "--target", + help="compilation target as plain string, inline JSON or path to a JSON file", + required=True, + ) + target_kinds = Target.list_kinds() + for target_kind in target_kinds: + target = Target(target_kind) + _generate_target_kind_args(parser, target.kind) + + +def _reconstruct_target_kind_args(args, kind): + kind_options = {} + for 
target_option, target_type in kind.options.items(): + if target_type in INTERNAL_TO_NATIVE_TYPE: + var_name = f"target_{kind.name}_{target_option.replace('-', '_')}" + option_value = getattr(args, var_name) + if option_value is not None: + kind_options[target_option] = getattr(args, var_name) + return kind_options + + +def reconstruct_target_args(args): + """Reconstructs the target options from the arguments""" + target_kinds = Target.list_kinds() + reconstructed = {} + for target_kind in target_kinds: + target = Target(target_kind) + kind_options = _reconstruct_target_kind_args(args, target.kind) + if kind_options: + reconstructed[target.kind.name] = kind_options + return reconstructed diff --git a/python/tvm/meta_schedule/runner/local_runner.py b/python/tvm/meta_schedule/runner/local_runner.py index e3c78d741b20..caa266f97eb3 100644 --- a/python/tvm/meta_schedule/runner/local_runner.py +++ b/python/tvm/meta_schedule/runner/local_runner.py @@ -239,13 +239,11 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: result: List[float] = future.result() error_message: str = None except TimeoutError as exception: - result: List[float] = None - error_message: str = ( - f"LocalRunner: Timeout, killed after {self.timeout_sec} seconds\n" - ) + result = None + error_message = f"LocalRunner: Timeout, killed after {self.timeout_sec} seconds\n" except Exception as exception: # pylint: disable=broad-except - result: List[float] = None - error_message: str = "LocalRunner: An exception occurred\n" + str(exception) + result = None + error_message = "LocalRunner: An exception occurred\n" + str(exception) local_future = LocalRunnerFuture(res=result, error_message=error_message) results.append(local_future) return results diff --git a/python/tvm/meta_schedule/search_strategy/replay_trace.py b/python/tvm/meta_schedule/search_strategy/replay_trace.py index 3afdff6de77e..15f8295f2524 100644 --- a/python/tvm/meta_schedule/search_strategy/replay_trace.py +++ 
b/python/tvm/meta_schedule/search_strategy/replay_trace.py @@ -41,7 +41,7 @@ class ReplayTrace(SearchStrategy): def __init__(self, num_trials_per_iter: int, num_trials_total: int): """Constructor""" self.__init_handle_by_constructor__( - _ffi_api.ReplayTrace, # pylint: disable=no-member + _ffi_api.ReplayTrace, # type: ignore # pylint: disable=no-member num_trials_per_iter, num_trials_total, ) diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py index 72713155c41d..d270ea61f6dc 100644 --- a/python/tvm/meta_schedule/search_strategy/search_strategy.py +++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py @@ -56,7 +56,7 @@ def __init__(self, sch: Schedule, args_info: List[ArgInfo]) -> None: The argument information. """ self.__init_handle_by_constructor__( - _ffi_api.MeasureCandidate, # pylint: disable=no-member + _ffi_api.MeasureCandidate, # type: ignore # pylint: disable=no-member sch, args_info, ) @@ -80,7 +80,7 @@ def initialize_with_tune_context( tune_context : TuneContext The tuning context for initialization. """ - _ffi_api.SearchStrategyInitializeWithTuneContext( # pylint: disable=no-member + _ffi_api.SearchStrategyInitializeWithTuneContext( # type: ignore # pylint: disable=no-member self, tune_context ) @@ -92,11 +92,11 @@ def pre_tuning(self, design_spaces: List[Schedule]) -> None: design_spaces : List[Schedule] The design spaces for pre-tuning. 
""" - _ffi_api.SearchStrategyPreTuning(self, design_spaces) # pylint: disable=no-member + _ffi_api.SearchStrategyPreTuning(self, design_spaces) # type: ignore # pylint: disable=no-member def post_tuning(self) -> None: """Post-tuning for the search strategy.""" - _ffi_api.SearchStrategyPostTuning(self) # pylint: disable=no-member + _ffi_api.SearchStrategyPostTuning(self) # type: ignore # pylint: disable=no-member def generate_measure_candidates(self) -> Optional[List[MeasureCandidate]]: """Generate measure candidates from design spaces for measurement. @@ -106,7 +106,7 @@ def generate_measure_candidates(self) -> Optional[List[MeasureCandidate]]: measure_candidates : Optional[List[IRModule]] The measure candidates generated, None if finished. """ - return _ffi_api.SearchStrategyGenerateMeasureCandidates(self) # pylint: disable=no-member + return _ffi_api.SearchStrategyGenerateMeasureCandidates(self) # type: ignore # pylint: disable=no-member def notify_runner_results(self, results: List[RunnerResult]) -> None: """Update the search strategy with profiling results. @@ -116,7 +116,7 @@ def notify_runner_results(self, results: List[RunnerResult]) -> None: results : List[RunnerResult] The profiling results from the runner. 
""" - _ffi_api.SearchStrategyNotifyRunnerResults(self, results) # pylint: disable=no-member + _ffi_api.SearchStrategyNotifyRunnerResults(self, results) # type: ignore # pylint: disable=no-member @register_object("meta_schedule.PySearchStrategy") @@ -142,7 +142,7 @@ def f_notify_runner_results(results: List["RunnerResult"]) -> None: self.notify_runner_results(results) self.__init_handle_by_constructor__( - _ffi_api.SearchStrategyPySearchStrategy, # pylint: disable=no-member + _ffi_api.SearchStrategyPySearchStrategy, # type: ignore # pylint: disable=no-member f_initialize_with_tune_context, f_pre_tuning, f_post_tuning, diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py index b8dcfd9e7a2d..f1e21ad3ddfe 100644 --- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py +++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py @@ -27,7 +27,7 @@ class TaskScheduler(Object): def tune(self) -> None: """Auto-tuning.""" - _ffi_api.TaskSchedulerTune(self) # pylint: disable=no-member + _ffi_api.TaskSchedulerTune(self) # type: ignore # pylint: disable=no-member def _set_task_stopped(self, task_id: int) -> None: """Set specific task to be stopped. @@ -37,7 +37,7 @@ def _set_task_stopped(self, task_id: int) -> None: task_id : int The task id to be stopped. """ - _ffi_api.TaskSchedulerSetTaskStopped(self, task_id) # pylint: disable=no-member + _ffi_api.TaskSchedulerSetTaskStopped(self, task_id) # type: ignore # pylint: disable=no-member def _is_task_running(self, task_id: int) -> bool: """Check whether the task is running. @@ -52,7 +52,7 @@ def _is_task_running(self, task_id: int) -> bool: bool Whether the task is running. 
""" - return _ffi_api.TaskSchedulerIsTaskRunning(self, task_id) # pylint: disable=no-member + return _ffi_api.TaskSchedulerIsTaskRunning(self, task_id) # type: ignore # pylint: disable=no-member def _join_running_task(self, task_id: int) -> None: """Wait until the task is finished. @@ -62,7 +62,7 @@ def _join_running_task(self, task_id: int) -> None: task_id : int The task id to be joined. """ - _ffi_api.TaskSchedulerJoinRunningTask(self, task_id) # pylint: disable=no-member + _ffi_api.TaskSchedulerJoinRunningTask(self, task_id) # type: ignore # pylint: disable=no-member def _next_task_id(self) -> int: """Fetch the next task id. @@ -72,7 +72,7 @@ def _next_task_id(self) -> int: int The next task id. """ - return _ffi_api.TaskSchedulerNextTaskId(self) # pylint: disable=no-member + return _ffi_api.TaskSchedulerNextTaskId(self) # type: ignore # pylint: disable=no-member @register_object("meta_schedule.PyTaskScheduler") @@ -98,7 +98,7 @@ def f_next_task_id() -> int: return self._next_task_id() self.__init_handle_by_constructor__( - _ffi_api.TaskSchedulerPyTaskScheduler, # pylint: disable=no-member + _ffi_api.TaskSchedulerPyTaskScheduler, # type: ignore # pylint: disable=no-member f_tune, f_set_task_stopped, f_is_task_running, @@ -110,13 +110,13 @@ def tune(self) -> None: raise NotImplementedError() def _set_task_stopped(self, task_id: int) -> None: - _ffi_api.TaskSchedulerSetTaskStopped(self, task_id) # pylint: disable=no-member + _ffi_api.TaskSchedulerSetTaskStopped(self, task_id) # type: ignore # pylint: disable=no-member def _is_task_running(self, task_id: int) -> bool: - return _ffi_api.TaskSchedulerIsTaskRunning(self, task_id) # pylint: disable=no-member + return _ffi_api.TaskSchedulerIsTaskRunning(self, task_id) # type: ignore # pylint: disable=no-member def _join_running_task(self, task_id: int) -> None: - _ffi_api.TaskSchedulerJoinRunningTask(self, task_id) # pylint: disable=no-member + _ffi_api.TaskSchedulerJoinRunningTask(self, task_id) # type: ignore # 
pylint: disable=no-member def _next_task_id(self) -> int: - return _ffi_api.TaskSchedulerNextTaskId(self) # pylint: disable=no-member + return _ffi_api.TaskSchedulerNextTaskId(self) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py index bf2ef17fb308..c79137d55dda 100644 --- a/python/tvm/meta_schedule/utils.py +++ b/python/tvm/meta_schedule/utils.py @@ -20,7 +20,7 @@ import shutil from typing import Any, Callable, List, Optional, Union -import psutil +import psutil # type: ignore import tvm from tvm._ffi import get_global_func, register_func from tvm.error import TVMError diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py index fd58da803623..b970aec62c6f 100644 --- a/python/tvm/relay/backend/contrib/ethosu/legalize.py +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -208,6 +208,99 @@ def __call__(self, *args, **kwargs): pass +class EthosuDepthwiseConv2DRewriter(DFPatternCallback): + """Convert ethosu.qnn_depthwise_conv2d composite functions to ethosu_depthwise_conv2d + operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = ( + wildcard().has_attr( + {"Composite": ethosu_patterns.QnnDepthwiseConv2DParams.composite_name} + ) + )(wildcard()) + + def callback( + self, pre: tvm.relay.Expr, post: tvm.relay.Expr, node_map: tvm.ir.container.Map + ) -> tvm.relay.Expr: + params = ethosu_patterns.QnnDepthwiseConv2DParams(post.op.body) + params.ifm.tensor = post.args[0] + channels_map = { + "NHWC": 3, + } + if str(params.ofm.layout) not in channels_map.keys(): + raise UnsupportedLayout(str(params.ofm.layout)) + kernel_shape_map = { + "HWOI": params.weights.shape[0:2], + } + if str(params.weights.layout) not in kernel_shape_map.keys(): + raise UnsupportedLayout(str(params.weights.layout)) + + weights_values = params.weights.values + weights_values_ohwi = np.moveaxis(weights_values, [0, 1, 
2, 3], [1, 2, 0, 3]) + + activation = "NONE" + # Activations requiring LUT are currently not supported, so setting it to an empty list + lut = relay.const([], "int8") + clip_min = 0 + clip_max = 0 + if params.activation: + activation = ethosu_patterns.QnnDepthwiseConv2DParams.activation_map[ + params.activation.op.name + ] + if activation == "CLIP": + clip_min = int(params.activation.attrs.a_min) + clip_max = int(params.activation.attrs.a_max) + scale_bias = vela_api.pack_biases( + biases=params.biases.tensor.data.asnumpy(), + ifm_scale=params.ifm.q_params.scale_f32, + ifm_dtype=np.dtype(params.ifm.dtype), + weight_scales=params.weights.q_params.scale_f32, + ofm_scale=params.ofm.q_params.scale_f32, + is_activation_tanh_or_sigmoid=activation in ["TANH", "SIGMOID"], + ) + + ethosu_depthwise_conv2d = ethosu_ops.ethosu_depthwise_conv2d( + post.args[0], # IFM + relay.const(weights_values_ohwi, params.weights.values.dtype), + relay.const(scale_bias, "uint8"), + lut, + float(params.ifm.q_params.scale_f32), + int(params.ifm.q_params.zero_point), + int(params.weights.q_params.zero_point), + float(params.ofm.q_params.scale_f32), + int(params.ofm.q_params.zero_point), + kernel_shape_map[str(params.weights.layout)], + params.ofm.shape[channels_map[str(params.ofm.layout)]], + strides=params.strides, + padding=params.padding, + dilation=params.dilation, + activation=activation, + clip_min=clip_min, + clip_max=clip_max, + upscale="NONE", + ifm_layout=str(params.ifm.layout), + ofm_layout=str(params.ofm.layout), + ) + return ethosu_depthwise_conv2d + + +@ir.transform.module_pass(opt_level=1) +class LegalizeEthosUDepthwiseConv2D: + """This is the pass that wraps the EthosuDepthwiseConv2DRewriter""" + + def transform_module( + self, mod: tvm.ir.IRModule, ctx: tvm.ir.transform.PassContext + ) -> tvm.ir.IRModule: + for global_var, func in mod.functions.items(): + func = rewrite(EthosuDepthwiseConv2DRewriter(), func) + mod.update_func(global_var, func) + return mod + + def __call__(self, 
*args, **kwargs): + pass + + @ir.transform.module_pass(opt_level=1) class LegalizeEthosU: """This is the pass to call graph-rewrites to perform graph transformation @@ -220,6 +313,7 @@ def transform_module( ) -> tvm.ir.IRModule: mod = LegalizeSplit()(mod) mod = LegalizeEthosUConv2D()(mod) + mod = LegalizeEthosUDepthwiseConv2D()(mod) return mod def __call__(self, *args, **kwargs): diff --git a/python/tvm/relay/backend/contrib/ethosu/op/__init__.py b/python/tvm/relay/backend/contrib/ethosu/op/__init__.py index 0406298f23f4..1063db6a04c5 100644 --- a/python/tvm/relay/backend/contrib/ethosu/op/__init__.py +++ b/python/tvm/relay/backend/contrib/ethosu/op/__init__.py @@ -17,3 +17,4 @@ "Relay operators for the Arm(R) Ethos(TM)-U NPU" from .convolution import ethosu_conv2d +from .depthwise import ethosu_depthwise_conv2d diff --git a/python/tvm/relay/backend/contrib/ethosu/op/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/op/depthwise.py new file mode 100644 index 000000000000..abcddf90b97c --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/op/depthwise.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=unused-argument +"""Relay operator for depthwise convolution""" +from typing import Tuple + +import tvm +from tvm.relay.op import _make +from tvm.topi.generic import schedule_injective +from tvm.relay.op.op import OpStrategy +from tvm.relay.op import strategy as _strategy + +from ..te import depthwise_conv2d_compute + + +def _extract_ethosu_depthwise_conv2d_params(attrs, args): + """Get the parameters necessary to construct a ethosu_depthwise_conv2d compute TE + from a ethosu_depthwise_conv2d Relay call.""" + ifm = args[0] + weight = args[1] + scale_bias = args[2] + lut = args[3] + ifm_scale = attrs.ifm_scale + ifm_zero_point = attrs.ifm_zero_point + weight_zero_point = attrs.weight_zero_point + ofm_scale = attrs.ofm_scale + ofm_zero_point = attrs.ofm_zero_point + strides = attrs.strides + padding = attrs.padding + dilation = attrs.dilation + activation = attrs.activation + clip_min = attrs.clip_min + clip_max = attrs.clip_max + upscale = attrs.upscale + ifm_layout = attrs.ifm_layout + ofm_layout = attrs.ofm_layout + + return ( + ifm, + weight, + scale_bias, + lut, + ifm_scale, + ifm_zero_point, + weight_zero_point, + ofm_scale, + ofm_zero_point, + strides, + padding, + dilation, + activation, + clip_min, + clip_max, + upscale, + ifm_layout, + ofm_layout, + ) + + +@tvm.ir.register_op_attr("contrib.ethosu.depthwise_conv2d", "FTVMCompute") +def create_ethosu_depthwise_conv2d_compute(attrs, args, out_type): + """Create an ethosu_depthwise_conv2d compute op.""" + params = _extract_ethosu_depthwise_conv2d_params(attrs, args) + op = depthwise_conv2d_compute(*params) + return [op] + + +@tvm.ir.register_op_attr("contrib.ethosu.depthwise_conv2d", "FTVMStrategy") +def depthwise_conv2d_strategy_ethosu(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + create_ethosu_depthwise_conv2d_compute, + _strategy.wrap_topi_schedule(schedule_injective), + name="ethosu_depthwise_conv2d", + ) + return strategy + + +def 
ethosu_depthwise_conv2d( + ifm: tvm.relay.Expr, + weight: tvm.relay.Expr, + scale_bias: tvm.relay.Expr, + lut: tvm.relay.Expr, + ifm_scale: float, + ifm_zero_point: int, + weight_zero_point: int, + ofm_scale: float, + ofm_zero_point: int, + kernel_shape: Tuple[int, int], + ofm_channels: int, + strides: Tuple[int, int] = (1, 1), + padding: Tuple[int, int, int, int] = (0, 0, 0, 0), + dilation: Tuple[int, int] = (1, 1), + activation: str = "NONE", + clip_min: int = 0, + clip_max: int = 0, + upscale: str = "NONE", + ifm_layout: str = "NHWC", + ofm_layout: str = "NHWC", +) -> tvm.relay.Call: + """This is a quantized 2D depthwise convolution operation as supported + by the NPU. It accepts either NHWC or NHCWB16 format + for the input data and OHWI format for the kernel weights. + + Reference: https://developer.arm.com/documentation/102420/0200/ + + Note that the per-channel weight scale and bias tensor must be + packed together into a combined tensor of uint80s. This is represented + in TVM by a (channels, 10) tensor of type uint8. For more detail, + refer to the Technical Reference Manual linked above. + + Parameters + ---------- + ifm : tvm.relay.Expr + The Input Feature Map tensor (IFM). + weight : tvm.relay.Expr + The weight tensor. + scale_bias : tvm.relay.Expr + The packed per-channel weight scale and bias tensor. + lut : tvm.relay.Expr + The look-up table values to use if activation = "LUT" + ifm_scale : float + The quantization scale for the Input Feature Map tensor. + ifm_zero_point : int + The quantization zero point for the Input Feature Map tensor. + weight_zero_point : int + The quantization zero point for the weight tensor. + ofm_scale : float + The quantization scale for the Output Feature Map tensor. + ofm_zero_point : int + The quantization zero point for the Output Feature Map tensor. + kernel_shape : tuple of int + The 2 dimensional kernel shape as (kernel_height, kernel_width). + ofm_channels : int + The number of OFM channels. 
+ strides : tuple of int, optional + The 2 dimensional strides as (stride_height, stride_width). + padding : tuple of int, optional + The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + dilation : tuple of int, optional + The 2 dimensional dilation as (dilation_height, dilation_width). + activation : str, optional + The activation function to use. + "NONE" - no activation function. + "CLIP" - clip the output between clip_min and clip_max. + "TANH" - tanh activation function. + "SIGMOID" - sigmoid activation function. + "LUT" - use a look-up table to perform + the activation function. + clip_min : int, optional + The minimum clipping value if activation = "CLIP" + clip_max : int, optional + The maximum clipping value if activation = "CLIP" + upscale : str, optional + The 2x2 upscaling mode to apply to the Input Feature Map tensor. + "NONE" - no upscaling. + "NEAREST" - upscale using nearest neighbour. + "ZEROS" - upscale using zeros. + ifm_layout : str, optional + The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16". + ofm_layout : str, optional + The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16". + + Returns + ------- + out : tvm.relay.Call + A call to the ethosu_depthwise_conv2d op. 
+ + """ + return _make.ethosu_depthwise_conv2d( + ifm, + weight, + scale_bias, + lut, + ifm_scale, + ifm_zero_point, + weight_zero_point, + ofm_scale, + ofm_zero_point, + kernel_shape, + ofm_channels, + strides, + padding, + dilation, + activation, + clip_min, + clip_max, + upscale, + ifm_layout, + ofm_layout, + ) diff --git a/python/tvm/relay/backend/contrib/ethosu/te/__init__.py b/python/tvm/relay/backend/contrib/ethosu/te/__init__.py index 7ca5de3c160c..5dcdd4dcf602 100644 --- a/python/tvm/relay/backend/contrib/ethosu/te/__init__.py +++ b/python/tvm/relay/backend/contrib/ethosu/te/__init__.py @@ -17,3 +17,4 @@ """Tensor Expressions for the NPU""" from .convolution import * +from .depthwise import * diff --git a/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py new file mode 100644 index 000000000000..35ae7f9a700a --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/te/depthwise.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-argument +"""Tensor Expressions for depthwise convolutions""" +from typing import Tuple, Union, List + +from tvm import te +from .dma import dma_ofm_compute, dma_ifm_compute + + +def depthwise_conv2d_compute( + ifm: te.Tensor, + weight: te.Tensor, + scale_bias: te.Tensor, + lut: te.Tensor, + ifm_scale: float, + ifm_zero_point: int, + weight_zero_point: int, + ofm_scale: float, + ofm_zero_point: int, + strides: Tuple[int, int], + padding: Tuple[int, int, int, int], + dilation: Union[Tuple[int, int], List[int]], + activation: str, + clip_min: int, + clip_max: int, + upscale: str, + ifm_layout: str, + ofm_layout: str, +) -> te.Tensor: + """A compute operator representing the capabilities of 2D convolution for the NPU. + + Parameters + ---------- + ifm : te.Tensor + The Input Feature Map tensor (IFM). + weight : te.Tensor + The weight tensor. + scale_bias : te.Tensor + The packed per-channel weight scale and bias tensor. + lut : te.Tensor + The look-up table values to use if activation = "LUT". + ifm_scale : float + The quantization scale for the Input Feature Map tensor. + ifm_zero_point : int + The quantization zero point for the Input Feature Map tensor. + weight_zero_point : int + The quantization zero point for the weight tensor. + ofm_scale : float + The quantization scale for the Output Feature Map tensor. + ofm_zero_point : int + The quantization zero point for the Output Feature Map tensor. + strides : tuple + The 2 dimensional strides as (stride_height, stride_width). + padding : tuple + The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right). + dilation : Union[int, tuple, list] + The 2 dimensional dilation as (dilation_height, dilation_width). + activation : str + The activation function to use. + "NONE" - no activation function. + "CLIP" - clip the output between clip_min and clip_max. + "TANH" - tanh activation function. + "SIGMOID" - sigmoid activation function. 
+ "LUT" - use a look-up table to perform the activation function. + clip_min : int + The minimum clipping value if activation = "CLIP". + clip_max : int + The maximum clipping value if activation = "CLIP". + upscale : str + The 2x2 upscaling mode to apply to the Input Feature Map tensor. + "NONE" - no upscaling. + "NEAREST" - upscale using nearest neighbour. + "ZEROS" - upscale using zeros. + ifm_layout : str + The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16". + ofm_layout : str + The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16". + + Returns + ------- + te.Tensor + The OFM tensor. + + """ + assert ifm.shape[0] == 1, f"Only batch size 1 is supported" + assert ifm_layout in {"NHWC", "NHCWB16"} + assert ofm_layout in {"NHWC", "NHCWB16"} + + stride_h, stride_w = strides + dilation_h, dilation_w = dilation + channels, kernel_h, kernel_w, _ = weight.shape + + # Compute operation for the IFM DMA pipeline + dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale, channels, padding) + + # 2D Depthwise Convolution compute operation + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + ofm_height = (dmaed_ifm.shape[1] - dilated_kernel_h) // stride_h + 1 + ofm_width = (dmaed_ifm.shape[2] - dilated_kernel_w) // stride_w + 1 + rh = te.reduce_axis((0, kernel_h), name="ry") + rw = te.reduce_axis((0, kernel_w), name="rx") + + depthwise_conv2d_attrs = { + "op": "ethosu_depthwise_conv2d", + "weight_zero_point": weight_zero_point, + "activation": activation, + "upscale": upscale, + "clip_min": clip_min, + "clip_max": clip_max, + "stride_h": stride_h, + "stride_w": stride_w, + "dilation_h": dilation_h, + "dilation_w": dilation_w, + } + + depthwise = te.compute( + (1, ofm_height, ofm_width, channels), + lambda nn, hh, ww, cc: te.sum( + dmaed_ifm( + nn, hh * stride_h + rh * dilation_h, ww * stride_w + rw * dilation_w, cc + ).astype(ifm.dtype) + * weight[cc, rh, rw, 
0].astype(ifm.dtype) + # This is a trick to load 10 elements of the scale_bias at once, not accurate maths + + (scale_bias[cc, 0] * scale_bias[cc, 9]).astype(ifm.dtype), + axis=[rh, rw], + ), + name="ethosu_depthwise_conv2d", + attrs=depthwise_conv2d_attrs, + ) + + # Compute operation for the OFM DMA pipeline + return dma_ofm_compute(depthwise, ofm_layout, ofm_zero_point, ofm_scale, channels) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index c59a386fefbb..c792ade06643 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -15,11 +15,11 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""The integration of Arm(R) Ethos(TM)-U NPU TIR compiler""" +"""The integration of the Arm(R) Ethos(TM)-U NPU TIR compiler.""" import tvm from tvm import relay from tvm.relay.expr_functor import ExprMutator -from tvm.driver.build_module import get_binds +from tvm.driver.build_module import schedule_to_module from .passes import ReplaceOperators, RemoveZeroStores, EncodeConstants from .scheduler import schedule @@ -29,7 +29,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): """Lower a schedule to TIR for the Arm(R) Ethos(TM)-U NPU target. The resulting TIR module will contain a single function - that comprises of a sequence of tir.extern_calls to NPU + that consists of a sequence of tir.extern_calls to NPU operations. 
Parameters @@ -64,22 +64,17 @@ def lower_ethosu(sch, args, const_dict, name="main"): "no_unroll_loop_with_extent_one": True, }, "tir.UnrollLoop": {"auto_max_depth": -1}, + "tir.noalias": True, + "tir.debug_keep_trivial_loop": True, } # Merge two configs curr_cfg = {**curr_cfg, **tir_compiler_cfg} sch = sch.normalize() - bounds = tvm.te.schedule.InferBound(sch) - stmt = tvm.te.schedule.ScheduleOps(sch, bounds, True) - compact = tvm.te.schedule.VerifyCompactBuffer(stmt) - binds, arg_list = get_binds(args, compact, None) - func = tvm.te.schedule.SchedulePostProcToPrimFunc(arg_list, stmt, binds) - - func = func.with_attr("global_symbol", name) - func = func.with_attr("tir.noalias", True) - mod = tvm.IRModule({name: func}) with tvm.transform.PassContext(config=curr_cfg): + mod = schedule_to_module(sch, args, name) + mod = tvm.tir.transform.Simplify()(mod) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.UnrollLoop()(mod) @@ -96,20 +91,20 @@ def lower_ethosu(sch, args, const_dict, name="main"): def lower_to_te(prim_func): - """Lower a Relay primitive function to a Tensor Expression graph. + """Lower a Relay primitive function to a Tensor Expression in an unscheduled CachedFunc. Parameters ---------- prim_func : tvm.relay.Function - The Relay function to lowerethosu_runtime([]). + The Relay function to lower. Returns ------- - out : TEGraph - The lowered Tensor Expression graph. + out : CachedFunc + The lowered Tensor Expression as part of a CachedFunc. 
""" - f = tvm._ffi.get_global_func("relay.backend.contrib.ethosu.LowerToTE") + f = tvm._ffi.get_global_func("relay.backend.LowerToTE") return f(prim_func) @@ -193,7 +188,7 @@ def lower_to_tir(func, cascader=None): func, consts = extract_constants(func) mod = tvm.IRModule.from_expr(func) func = relay.transform.InferType()(mod)["main"] - te_graph = lower_to_te(func) - s = schedule(te_graph, consts, cascader) - mod, consts = lower_ethosu(s, te_graph, consts) + cached_func = lower_to_te(func) + s = schedule(cached_func, consts, cascader) + mod, consts = lower_ethosu(s, cached_func, consts) return mod, consts diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py b/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py index 33fbdcd2b24f..fd7fa293ccfb 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/convolution.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""Extract information from the convolution operators in TIR.""" +"""Extract parameters from the convolution operators in TIR.""" import tvm from ..vela_api import SCALE_BIAS_LENGTH from .utils import get_outer_loops, get_op_attrs, get_base_address, get_loads, get_stores diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/depthwise.py b/python/tvm/relay/backend/contrib/ethosu/tir/depthwise.py new file mode 100644 index 000000000000..27111a970b27 --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/tir/depthwise.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""Extract information from the depthwise convolution operators in TIR.""" +from typing import Dict, Tuple +import tvm +from ..vela_api import SCALE_BIAS_LENGTH +from .utils import get_outer_loops, get_op_attrs, get_base_address, get_loads, get_stores +from .dma import get_ifm_params, get_ofm_params +from .spec import ( + SerialKernel, + SerialAddressRange, + SerialActivation, + Serial2DDepthwise, +) + + +def get_depthwise_conv2d_params( + stmt: tvm.tir.AttrStmt, + producers: Dict[tvm.tir.Var, tvm.tir.AttrStmt], + consumers: Dict[tvm.tir.Var, tvm.tir.AttrStmt], +) -> Tuple[Serial2DDepthwise, tvm.tir.Var, tvm.tir.Var]: + """Get the parameters necessary to construct a call_extern for a depthwise_conv2d. + + Parameters + ---------- + stmt : tvm.tir.AttrStmt + The outermost attribute statement of a depthwise loop nest. + producers : Dict[tvm.tir.Var, tvm.tir.AttrStmt] + A dictionary to associate pointers with the loop nest + that produces their values. + consumers : Dict[tvm.tir.Var, tvm.tir.AttrStmt] + A dictionary to associate pointers with the loop nest + that consumes their values. + + Returns + ------- + Serial2DDepthwise + The parameters needed to construct a 2D depthwise. + output_pointer : tvm.tir.Var + The output pointer of the convolution operation. + replace_pointer : tvm.tir.Var + The output pointer of the DMA write operation, which is to replace + the convolution output pointer. 
+ + """ + attrs, body = get_op_attrs(stmt) + _, _, _, _, _, inner = get_outer_loops(body, "NHWC") + rh = inner + rw = rh.body + # loads = [output, input, weights, scale_bias, scale_bias] + loads = get_loads(rw.body) + # stores = [output] + stores = get_stores(rw.body) + input_pointer = loads[1].buffer_var + output_pointer = stores[0].buffer_var + # Get feature map info + serial_ifm, serial_padding = get_ifm_params(input_pointer, producers) + serial_ofm, replace_pointer = get_ofm_params(output_pointer, consumers) + # Get kernel info + serial_kernel = SerialKernel( + width=int(rw.extent), + height=int(rh.extent), + stride_w=int(attrs["stride_w"]), + stride_h=int(attrs["stride_h"]), + dilation_w=int(attrs["dilation_w"]), + dilation_h=int(attrs["dilation_h"]), + ) + # Get scale_bias info + scale_bias_load = loads[3] + scale_bias_base = get_base_address(scale_bias_load.index) + serial_scale_bias = SerialAddressRange( + address=tvm.tir.Load("uint8", scale_bias_load.buffer_var, scale_bias_base), + length=SCALE_BIAS_LENGTH * serial_ofm[3], + ) + # Get weight info + weight_load = loads[2] + weight_base = get_base_address(weight_load.index) + serial_weight = SerialAddressRange( + address=tvm.tir.Load("uint8", weight_load.buffer_var, weight_base), + length=serial_ofm[3] * serial_kernel[0] * serial_kernel[1], + ) + # Get activation info + serial_activation = SerialActivation( + op=attrs["activation"], clip_min=attrs["clip_min"], clip_max=attrs["clip_max"] + ) + + return ( + Serial2DDepthwise( + ifm=serial_ifm, + ofm=serial_ofm, + kernel=serial_kernel, + weight=serial_weight, + weight_zero_point=attrs["weight_zero_point"], + scale_bias=serial_scale_bias, + padding=serial_padding, + activation=serial_activation, + upscale="NONE", + ), + output_pointer, + replace_pointer, + ) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/dma.py b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py index ecd402d63309..a116e51c5b7c 100644 --- 
a/python/tvm/relay/backend/contrib/ethosu/tir/dma.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/dma.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""Extract information from the DMA operators in TIR.""" +"""Extract parameters from the DMA operators in TIR.""" import tvm from .utils import get_outer_loops, get_base_address, get_strides, get_op_attrs from .spec import SerialFeatureMap, SerialPadding diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index 1af44962c141..761c8aad7bb1 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -15,12 +15,13 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler""" +"""The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler.""" import numpy as np # type: ignore import tvm from tvm.relay.backend.contrib.ethosu import vela_api from .convolution import get_conv2d_params +from .depthwise import get_depthwise_conv2d_params from .transform import get_copy_params from .utils import get_weights_pointer, get_scale_bias_pointer @@ -52,6 +53,7 @@ def ReplaceOperators(): op_map = { "ethosu_conv2d": get_conv2d_params, "ethosu_copy": get_copy_params, + "ethosu_depthwise_conv2d": get_depthwise_conv2d_params, } pointer_to_producer = {} pointer_to_consumer = {} @@ -299,7 +301,7 @@ def EncodeConstants(const_dict): pointer_to_buffer = {} rewrite_buffer = {} rewrite_pointer = {} - accel_type = vela_api.get_target_accel_type() # type: ignore + accel_config = vela_api.get_accelerator_config() def _align_scale_bias(tir_extern_call, bias): """Align the scale_bias to 16 bytes.""" @@ -314,7 +316,7 @@ def _align_scale_bias(tir_extern_call, bias): def 
_encode_weights(tir_extern_call, weights): """Encode the weights for a TIR extern call.""" - value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_type) + value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_config) value = np.frombuffer(value_bytes, dtype="uint8") return value diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py index 5d9027bf2078..7f892d0c602a 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py @@ -15,17 +15,17 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""Different schedulers for Arm(R) Ethos(TM)-U NPU""" +"""Scheduling for Arm(R) Ethos(TM)-U NPU.""" import tvm -def schedule(te_graph, const_dict, cascader=None): - """Schedule a TE graph for NPU compilation. +def schedule(cached_func, const_dict, cascader=None): + """Schedule a CachedFunc for NPU compilation. Parameters ---------- - te_graph - The TE graph to schedule. + cached_func : CachedFunc + The CachedFunc to schedule. const_dict : dict of int to numpy.ndarray The constant dictionary. cascader : callable, optional @@ -38,10 +38,10 @@ def schedule(te_graph, const_dict, cascader=None): The completed schedule for the graph. 
""" - s = tvm.te.create_schedule([t.op for t in te_graph.outputs]) + s = tvm.te.create_schedule([t.op for t in cached_func.outputs]) if cascader: - cascader(te_graph, const_dict, s) - inline_no_ops(te_graph, s) + cascader(cached_func, const_dict, s) + inline_no_ops(cached_func, s) schedule_pragmas(s) schedule_cache_reads(s) return s @@ -96,7 +96,7 @@ def total_cascader(stripe_size): """ - def _cascader(te_graph, const_dict, sch): + def _cascader(cached_func, const_dict, sch): scheduled = set() def _visit(tensor, stage, ax): @@ -106,8 +106,8 @@ def _visit(tensor, stage, ax): for input_tensor in tensor.op.input_tensors: _visit(input_tensor, stage, ax) - assert len(te_graph.outputs) == 1 - out = te_graph.outputs[0] + assert len(cached_func.outputs) == 1 + out = cached_func.outputs[0] oi, _ = tile_nd(sch, out, stripe_size) for ax in oi: sch[out].unroll(ax) @@ -126,14 +126,14 @@ def copy_constants(): The planning function. """ - def _planner(te_graph, const_dict, sch): + def _planner(cached_func, const_dict, sch): planned = set() # type: ignore def _visit(tensor, reader): if tensor is not planned: planned.add(tensor) if isinstance(tensor.op, tvm.te.PlaceholderOp): - index = list(te_graph.inputs).index(tensor) + index = list(cached_func.inputs).index(tensor) if index in const_dict: sch.cache_read(tensor, "global", [reader]) @@ -141,7 +141,7 @@ def _visit(tensor, reader): for input_tensor in tensor.op.input_tensors: _visit(input_tensor, tensor) - for output_tensor in te_graph.outputs: + for output_tensor in cached_func.outputs: _visit(output_tensor, None) return _planner @@ -216,7 +216,7 @@ def _detect_cache_read(stage): stage.pragma(fax, "op", "ethosu_copy") -def inline_no_ops(te_graph, sch): +def inline_no_ops(cached_func, sch): """Inline 'no-ops' - operations that in principle do nothing. Modifies the schedule in-place. For now we inline reshape and @@ -224,8 +224,8 @@ def inline_no_ops(te_graph, sch): Parameters ---------- - te_graph - The TE graph. 
+ cached_func : CachedFunc + The cached func. sch : tvm.te.Schedule The schedule. @@ -241,7 +241,7 @@ def _visit(tensor): for input_tensor in tensor.op.input_tensors: _visit(input_tensor) - for out in te_graph.outputs: + for out in cached_func.outputs: _visit(out) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/spec.py b/python/tvm/relay/backend/contrib/ethosu/tir/spec.py index 3ecbcd5f3cdc..ff019c7783db 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/spec.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/spec.py @@ -203,7 +203,7 @@ def __init__( class Serial2DDepthwise(SerializableFormat): """Specialization class to retrieve arguments of - a ethosu.depthwise2d tir extern call on a predefined ordering""" + a ethosu.depthwise_conv2d TIR extern call on a predefined ordering""" def __init__( self, diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/transform.py b/python/tvm/relay/backend/contrib/ethosu/tir/transform.py index 0403ce2c7e8f..f50975c83838 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/transform.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument -"""Extract information from the transform operators in TIR.""" +"""Extract parameters from the transform operators in TIR.""" import tvm from .spec import SerialCopy from .utils import get_base_address, get_op_attrs diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py index 7d6fd3bf82d8..de1c0ab19f6e 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=invalid-name -"""Helper utility functions used by the TIR compiler""" +"""Helper utility functions used by the NPU TIR compiler""" import tvm from tvm import arith @@ -23,7 +23,8 @@ # TODO(@mbaret): Formalise this with a specification def get_weights_pointer(tir_extern_call): """Get the weights pointer from a NPU extern call if it exists""" - if tir_extern_call.args[0] == "ethosu_conv2d": + supported_ops = ["ethosu_conv2d", "ethosu_depthwise_conv2d"] + if tir_extern_call.args[0] in supported_ops: return tir_extern_call.args[41].buffer_var return None @@ -31,7 +32,8 @@ def get_weights_pointer(tir_extern_call): # TODO(@mbaret): Formalise this with a specification def get_scale_bias_pointer(tir_extern_call): """Get the scale_bias pointer from a NPU extern call if it exists""" - if tir_extern_call.args[0] == "ethosu_conv2d": + supported_ops = ["ethosu_conv2d", "ethosu_depthwise_conv2d"] + if tir_extern_call.args[0] in supported_ops: return tir_extern_call.args[44].buffer_var return None diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index 4b28dc5b191e..bcae01a10214 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -18,7 +18,7 @@ the Relay to TIR compilation process, to Vela API calls to generate command stream. 
""" -from typing import NamedTuple +from typing import Dict, NamedTuple, Tuple, Union from enum import auto from enum import Enum import numpy as np # type: ignore @@ -32,7 +32,7 @@ class BufferType(Enum): - """The buffer types the codegen supports""" + """The type of information that a buffer contains.""" constant = auto() input_or_output = auto() @@ -50,7 +50,7 @@ class BufferType(Enum): class BufferInfo(NamedTuple): - """A data structure to hold metadata of the buffer""" + """A data structure to hold metadata of the buffer.""" # If the buffer holds constants, the values will contain that otherwise None values: np.ndarray @@ -90,9 +90,9 @@ def translate(tir_module, params): for extern_call in extern_calls: _npu_ops.append(translate_ethosu_tir_extern_call(extern_call)) _npu_ops, constant_tensor, scratch_size = assign_addresses(buffer_info, _npu_ops) - target_accel_type = vela_api.get_target_accel_type() - cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_type) - payload = vapi.npu_create_driver_payload(cmds, target_accel_type) + target_accel_config = vela_api.get_accelerator_config() + cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config) + payload = vapi.npu_create_driver_payload(cmds, target_accel_config) hex_value = "" if constant_tensor is None else constant_tensor.tobytes().hex() return payload.hex(), hex_value, scratch_size @@ -125,9 +125,10 @@ def populate_extern_calls(stmt): return extern_calls -def extract_buffer_info(mod, param_dict): - """ - This function is to read the tvm.IRModule that +def extract_buffer_info( + mod: tvm.IRModule, param_dict: Dict[int, np.ndarray] +) -> Dict[str, BufferInfo]: + """This function is to read the tvm.IRModule that contains Relay to TIR compiled IRModule. Thereafter, this will extract the buffer information as the shape and constant data (if any). @@ -136,12 +137,14 @@ def extract_buffer_info(mod, param_dict): ---------- mod : tvm.IRModule The NPU TIR IRModule. 
- param_dict : dict + param_dict : Dict[int, np.ndarray] A dictionary containing param idx --> const numpy.NDArray + Returns ------- - dict - a dictionary of buffer names --> BufferInfo + dict : Dict[str, BufferInfo] + A dictionary of buffer names --> BufferInfo + """ buffer_info = dict() # There should only be a single function @@ -299,6 +302,7 @@ def translate_ethosu_tir_extern_call(tir_extern_call): supported_extern_calls = { "ethosu_conv2d": translate_ethosu_conv2d, "ethosu_copy": translate_ethosu_copy, + "ethosu_depthwise_conv2d": translate_ethosu_depthwise_conv2d, } ext_call_type = tir_extern_call.args[0].value assert ext_call_type in supported_extern_calls.keys(), f"{ext_call_type} is not yet supported" @@ -327,14 +331,15 @@ def translate_ethosu_copy(tir_extern_call): return _create_npu_dma_op(serial_object) -def _convert_clip_bounds(npu_op): - """ - This function will convert the min and max value +def _convert_clip_bounds(npu_op: vapi.NpuBlockOperation): + """This function will convert the min and max value of clip activations to non quantized floats as expected by the API. + Parameters ---------- - npu_op : ethosu.vela.api.NpuBlockOperation + npu_op : vapi.NpuBlockOperation + """ clip_min_quant = npu_op.activation.min clip_max_quant = npu_op.activation.max @@ -348,13 +353,14 @@ def _convert_clip_bounds(npu_op): npu_op.activation.max = clip_max_actual -def translate_ethosu_conv2d(tir_extern_call): - """This function will translate a tir extern_call - as produced by Relay to TIR compilation. +def translate_ethosu_conv2d(tir_call_extern: tvm.tir.Call) -> Tuple[vapi.NpuConv2DOperation, int]: + """This function will translate a TIR call_extern + as produced by NPU Relay to TIR compilation. + Parameters ---------- - tir_extern_call : tvm.tir.Call - This should be an tir external call that has a agreed upon ordering + tir_call_extern : tvm.tir.Call + This should be a TIR call_extern that has an agreed upon ordering for TIR Compiler. 
See Serial2DConvolution in tvm/relay/backend/contrib/ethosu/tir/spec.py for the ordering. @@ -364,15 +370,18 @@ def translate_ethosu_conv2d(tir_extern_call): The vela object containing the params of ethosu_conv2d weights_zero_point : int The zero point of the weights + """ - # We skip the first element as it is the extern_call function name - serial_object = spec.create_serial_object(spec.Serial2DConvolution, tir_extern_call.args[1:]) + # We skip the first element as it is the call_extern function name + serial_object = spec.create_serial_object(spec.Serial2DConvolution, tir_call_extern.args[1:]) return _create_npu_op_conv2d(serial_object) -def _create_npu_op_conv2d(serial_2d_convolution): +def _create_npu_op_conv2d( + serial_2d_convolution: spec.Serial2DConvolution, +) -> Tuple[vapi.NpuConv2DOperation, int]: """This is a helper function to capture a list - of arguments to create Vela NpuConv2DOperation object + of arguments to create Vela NpuConv2DOperation object. """ npu_conv2d_op = vapi.NpuConv2DOperation() npu_conv2d_op.ifm = _create_npu_feature_map(serial_2d_convolution.ifm) @@ -391,8 +400,8 @@ def _create_npu_op_conv2d(serial_2d_convolution): _convert_clip_bounds(npu_conv2d_op) npu_conv2d_op.upscale = _create_npu_resampling_mode(serial_2d_convolution.upscale) - target_accel_type = vela_api.get_target_accel_type() # type: ignore - block_config = vela_api.get_optimal_block_config(npu_conv2d_op, target_accel_type) + accel_config = vela_api.get_accelerator_config() + block_config = vela_api.get_optimal_block_config(npu_conv2d_op, accel_config) npu_conv2d_op.block_config = block_config weights_shape_ohwi = [ npu_conv2d_op.ofm.shape.depth, @@ -408,9 +417,57 @@ def _create_npu_op_conv2d(serial_2d_convolution): return npu_conv2d_op, weights_zero_point -def _create_npu_feature_map(serial_feature_map): +def translate_ethosu_depthwise_conv2d(tir_extern_call): + """This function will translate a tir extern_call + as produced by Relay to TIR compilation. 
+ + Parameters + ---------- + tir_extern_call : tvm.tir.Call + This should be a tir external call that has an agreed upon ordering + for NPU TIR Compiler. See Serial2DDepthwise in + tvm/relay/backend/contrib/ethosu/tir/spec.py for the ordering. + + Returns + ------- + ethosu.vela.api.NpuConvDepthWiseOperation + The vela object containing the params of ethosu_depthwise_conv2d + weights_zero_point : int + The zero point of the weights + """ + serial_object = spec.create_serial_object(spec.Serial2DDepthwise, tir_extern_call.args[1:]) + return _create_npu_op_depthwise_conv2d(serial_object) + + +def _create_npu_op_depthwise_conv2d(serial_2d_depthwise): + npu_depthwise_conv2d_op = vapi.NpuConvDepthWiseOperation() + + npu_depthwise_conv2d_op.ifm = _create_npu_feature_map(serial_2d_depthwise.ifm) + npu_depthwise_conv2d_op.ofm = _create_npu_feature_map(serial_2d_depthwise.ofm) + npu_depthwise_conv2d_op.kernel = _create_npu_kernel(serial_2d_depthwise.kernel) + npu_depthwise_conv2d_op.weights = [_create_npu_address_range(serial_2d_depthwise.weight)] + weights_zero_point = np.int64(serial_2d_depthwise.weight_zero_point.value) + npu_depthwise_conv2d_op.biases = [_create_npu_address_range(serial_2d_depthwise.scale_bias)] + npu_depthwise_conv2d_op.padding = _create_npu_padding(serial_2d_depthwise.padding) + + npu_depthwise_conv2d_op.activation = _create_npu_activation(serial_2d_depthwise.activation) + if ( + npu_depthwise_conv2d_op.activation + and npu_depthwise_conv2d_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU + ): + _convert_clip_bounds(npu_depthwise_conv2d_op) + + npu_depthwise_conv2d_op.upscale = _create_npu_resampling_mode(serial_2d_depthwise.upscale) + target_accel_config = vela_api.get_accelerator_config() + block_config = vela_api.get_optimal_block_config(npu_depthwise_conv2d_op, target_accel_config) + npu_depthwise_conv2d_op.block_config = block_config + + return npu_depthwise_conv2d_op, weights_zero_point + + +def _create_npu_feature_map(serial_feature_map: 
spec.SerialFeatureMap) -> vapi.NpuFeatureMap: """This is a helper function to capture a list - of arguments to create Vela NpuFeatureMap object + of arguments to create Vela NpuFeatureMap object. """ layout_map = {"NHWC": vapi.NpuLayout.NHWC, "NHCWB16": vapi.NpuLayout.NHCWB16} datatype_map = { @@ -427,14 +484,14 @@ def _create_npu_feature_map(serial_feature_map): nfm = vapi.NpuFeatureMap() nfm.data_type = datatype_map[data_type] nfm.shape = vapi.NpuShape3D( - int(serial_feature_map.height.value), - int(serial_feature_map.width.value), - int(serial_feature_map.channels.value), + int(serial_feature_map.height), + int(serial_feature_map.width), + int(serial_feature_map.channels), ) nfm.tiles = vapi.NpuTileBox( - int(serial_feature_map.tile_height_0.value), - int(serial_feature_map.tile_height_1.value), - int(serial_feature_map.tile_width_0.value), + int(serial_feature_map.tile_height_0), + int(serial_feature_map.tile_height_1), + int(serial_feature_map.tile_width_0), [ serial_feature_map.tile_address_0, serial_feature_map.tile_address_1, @@ -447,81 +504,75 @@ def _create_npu_feature_map(serial_feature_map): ) nfm.layout = layout_map[layout] nfm.strides = vapi.NpuShape3D( - int(serial_feature_map.stride_h.value), - int(serial_feature_map.stride_w.value), - int(serial_feature_map.stride_c.value), + int(serial_feature_map.stride_h), + int(serial_feature_map.stride_w), + int(serial_feature_map.stride_c), ) return nfm -def _create_npu_kernel(serial_kernel): +def _create_npu_kernel(serial_kernel: spec.SerialKernel) -> vapi.NpuKernel: """This is a helper function to capture a list - of arguments to create Vela NpuKernel object + of arguments to create Vela NpuKernel object. 
""" nknl = vapi.NpuKernel( - w=int(serial_kernel.width.value), - h=int(serial_kernel.height.value), - stride_x=int(serial_kernel.stride_w.value), - stride_y=int(serial_kernel.stride_h.value), - dilation_x=int(serial_kernel.dilation_w.value), - dilation_y=int(serial_kernel.dilation_h.value), + w=int(serial_kernel.width), + h=int(serial_kernel.height), + stride_x=int(serial_kernel.stride_w), + stride_y=int(serial_kernel.stride_h), + dilation_x=int(serial_kernel.dilation_w), + dilation_y=int(serial_kernel.dilation_h), ) return nknl -def _create_npu_address_range(serial_address_range): +def _create_npu_address_range( + serial_address_range: spec.SerialAddressRange, +) -> vapi.NpuAddressRange: """This is a helper function to capture a list - of arguments to create Vela NpuAddressRange object + of arguments to create Vela NpuAddressRange object. """ addr_range = vapi.NpuAddressRange( # region will be updated later region=0, address=serial_address_range.address, - length=int(serial_address_range.length.value), + length=int(serial_address_range.length), ) return addr_range def _create_npu_quantization( - scale, - zero_point, -): + scale: Union[tvm.tir.FloatImm, float], + zero_point: Union[tvm.tir.IntImm, int], +) -> vapi.NpuQuantization: """This is a helper function to capture a list - of arguments to create Vela NpuQuantization object + of arguments to create Vela NpuQuantization object. 
""" - # Scale could be an ndarray if per-channel quantization is available - if not isinstance(scale, tvm.tir.expr.Load): - if isinstance(scale.value, float): - scale = np.single(scale.value) - else: - assert isinstance(scale.value.value, float) - scale = np.single(scale.value.value) - q_params = vapi.NpuQuantization(scale_f32=scale, zero_point=zero_point.value) - return q_params + return vapi.NpuQuantization(scale_f32=float(scale), zero_point=int(zero_point)) def _create_npu_weights_zero_point( - zero_point, -): - """This is a helper function to capture the weights zero point""" - return zero_point.value + zero_point: Union[int, tvm.tir.IntImm], +) -> int: + """This is a helper function to capture the weights zero point.""" + return int(zero_point) -def _create_npu_padding(serial_padding): +def _create_npu_padding(serial_padding: spec.SerialPadding) -> vapi.NpuPadding: """This is a helper function to capture a list - of arguments to create Vela NpuPadding object""" + of arguments to create Vela NpuPadding object.""" padding = vapi.NpuPadding( - top=int(serial_padding.top.value), - left=int(serial_padding.left.value), - bottom=int(serial_padding.bottom.value), - right=int(serial_padding.right.value), + top=int(serial_padding.top), + left=int(serial_padding.left), + bottom=int(serial_padding.bottom), + right=int(serial_padding.right), ) return padding -def _create_npu_activation(serial_activation): +def _create_npu_activation(serial_activation: spec.SerialActivation) -> vapi.NpuActivation: """This is a helper function to capture a list - of arguments to create Vela NpuActivation object""" + of arguments to create Vela NpuActivation object.""" if serial_activation.op == "NONE": return None if ( @@ -538,16 +589,16 @@ def _create_npu_activation(serial_activation): op = str(serial_activation.op.value) assert op in op_map.keys() act_op = vapi.NpuActivation(op_map[op]) - act_op.min = int(serial_activation.clip_min.value) - act_op.max = 
int(serial_activation.clip_max.value) + act_op.min = int(serial_activation.clip_min) + act_op.max = int(serial_activation.clip_max) return act_op def _create_npu_resampling_mode( - mode, -): + mode: str, +) -> vapi.NpuResamplingMode: """This is a helper function to capture a list - of arguments to create Vela NpuResamplingMode object""" + of arguments to create Vela NpuResamplingMode object.""" mode_map = { "NONE": vapi.NpuResamplingMode.NONE, "NEAREST": vapi.NpuResamplingMode.NEAREST, diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py b/python/tvm/relay/backend/contrib/ethosu/vela_api.py index 5009c3157c77..69095e43416e 100644 --- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py +++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py @@ -27,6 +27,7 @@ import numpy as np # type: ignore from ethosu.vela import api as vapi # type: ignore +import tvm from tvm.relay.backend.contrib.ethosu import util # type: ignore from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs @@ -45,7 +46,7 @@ def get_optimal_block_config( - npu_op: vapi.NpuOperation, accel_type: vapi.NpuAccelerator + npu_op: vapi.NpuOperation, accel_config: vapi.NpuAccelerator ) -> vapi.NpuShape3D: """ "The NPU's unit of work is known as a block. 
It will fetch block(s) from Input @@ -58,15 +59,15 @@ def get_optimal_block_config( ---------- npu_op : ethosu.vela.api.NpuOperation The NPU operation and its params - accel_type : ethosu.vela.api.NpuAccelerator - The NPU accelerator variant + accel_config : ethosu.vela.api.NpuAccelerator + The NPU accelerator config Returns ------- ethosu.vela.api.NpuShape3D : The optimal block config for the operator """ - all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_type) + all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config) return _get_optimal_block_config(all_valid_block_configs) @@ -112,7 +113,9 @@ def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) -> return max_area_depth_block_configs[0] -def encode_weights(tir_extern_call, values, accel_type): +def encode_weights( + tir_extern_call: tvm.tir.Call, values: np.ndarray, accel_config: vapi.NpuAccelerator +): """This is an API function to compress weights by passing a tir_extern_call to NPU Convolution operation and values. 
@@ -122,26 +125,30 @@ def encode_weights(tir_extern_call, values, accel_type): tir_extern_call to NPU Convolution operation values : numpy.ndarray The constant flattened weight data in OHWI layout - accel_type : ethosu.vela.api.NpuAccelerator - The NPU accelerator variant + accel_config : ethosu.vela.api.NpuAccelerator + The NPU accelerator config Returns ------- bytearray Compressed weights """ - supported_ops = ["ethosu_conv2d"] + supported_ops = { + "ethosu_conv2d": tirtocs.translate_ethosu_conv2d, + "ethosu_depthwise_conv2d": tirtocs.translate_ethosu_depthwise_conv2d, + } op = str(tir_extern_call.args[0].value) - assert op in supported_ops - npu_op, weights_zero_point = tirtocs.translate_ethosu_conv2d(tir_extern_call) - block_config = get_optimal_block_config(npu_op, accel_type) + assert op in supported_ops.keys() + npu_op, weights_zero_point = supported_ops[op](tir_extern_call) + block_config = get_optimal_block_config(npu_op, accel_config) # The weight layout is assumed to be flat OHWI, always. 
assert len(values.shape) == 1 + is_depthwise = op == "ethosu_depthwise_conv2d" shape_ohwi = ( npu_op.ofm.shape.depth, npu_op.kernel.height, npu_op.kernel.width, - npu_op.ifm.shape.depth, + 1 if is_depthwise else npu_op.ifm.shape.depth, ) assert values.size == np.prod(shape_ohwi) values = np.reshape(values, shape_ohwi) @@ -153,9 +160,8 @@ def encode_weights(tir_extern_call, values, accel_type): ifm_bitdepth=npu_op.ifm.data_type.size_in_bits(), block_depth=block_config.depth, dilation=(npu_op.kernel.dilation_x, npu_op.kernel.dilation_y), - accel_type=accel_type, - # TODO(@manupa-arm): change this when we support depthwise - is_depthwise=False, + accel_config=accel_config, + is_depthwise=is_depthwise, ) @@ -166,7 +172,7 @@ def compress_weights( ifm_bitdepth: int, block_depth: int, dilation: Tuple[int, int], - accel_type: vapi.NpuAccelerator, + accel_config: vapi.NpuAccelerator, is_depthwise: Optional[bool] = False, ) -> bytearray: """The NPU requires the weights to be compressed @@ -188,8 +194,8 @@ def compress_weights( The depth of the optimal block config for the operator dilation : tuple A tuple of 2 elements indicating dilation in h and w - accel_type : ethosu.vela.api.NpuAccelerator - The NPU accelerator variant + accel_config : ethosu.vela.api.NpuAccelerator + The NPU accelerator config is_depthwise : bool, Optional This indicates whether the weights are compressed for depthwise convolution @@ -212,7 +218,7 @@ def compress_weights( ] block_traversal = calculate_block_traversal_mode(is_depthwise, shape_ohwi, ifm_bitdepth) compressed_weights = vapi.npu_encode_weights( - accelerator=accel_type, + accelerator=accel_config, weights_volume=weights_ohwi, dilation_xy=dilation, ifm_bitdepth=ifm_bitdepth, @@ -358,15 +364,24 @@ def _calculate_hw_bias_scales( return hw_bias_scales -def get_target_accel_type(): - """This is a helper function to convert cli accelerator type str argument - to NpuAccelerator""" +def get_accelerator_config() -> vapi.NpuAccelerator: + """Get the 
configuration of the NPU accelerator. + + The configuration string provided as a compiler option is converted into + an NpuAccelerator object. Valid configuration strings: + - 'ethos-u55-256' + - 'ethos-u55-128' + - 'ethos-u55-64' + - 'ethos-u55-32' + + """ npu_accel_str_map = { "ethos-u55-256": vapi.NpuAccelerator.Ethos_U55_256, "ethos-u55-128": vapi.NpuAccelerator.Ethos_U55_128, "ethos-u55-64": vapi.NpuAccelerator.Ethos_U55_64, "ethos-u55-32": vapi.NpuAccelerator.Ethos_U55_32, } - accel_type_str = util.get_accelerator_config() - assert accel_type_str in npu_accel_str_map.keys(), f"{accel_type_str} is not supported" - return npu_accel_str_map[accel_type_str] + compiler_attrs = tvm.get_global_func("relay.ext.ethosu.get_compiler_attrs")() + accel_config_str = compiler_attrs.accelerator_config + assert accel_config_str in npu_accel_str_map.keys(), f"{accel_config_str} is not supported" + return npu_accel_str_map[accel_config_str] diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index c67ac1dc423d..f1686d2a03bb 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -123,7 +123,7 @@ def build( to setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, - otherwise a stackvm intepreter is used. + otherwise a stackvm interpreter is used. params : dict of str to NDArray Input parameters to the graph that do not change @@ -303,7 +303,7 @@ def build(ir_mod, target=None, target_host=None, params=None, mod_name="default" setup the dimensions and parameters correctly. target_host is used to specify the host side codegen target. By default, llvm is used if it is enabled, - otherwise a stackvm intepreter is used. + otherwise a stackvm interpreter is used. 
params : dict of str to NDArray Input parameters to the graph that do not change @@ -452,7 +452,7 @@ def bind_params_by_name(func, params): class GraphExecutor(_interpreter.Executor): """Wrapper around Executor interface. - This executor is used for debug and testing purpoes. + This executor is used for debug and testing purposes. Parameters ---------- diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 3a4897ad3166..825a586918f8 100755 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -835,3 +835,34 @@ def lstm_cell( outputs_list.append(hidden_state) # [seq_num, (batch, hidden_size)] return outputs_list, hidden_state, cell_state + + +def ensure_scalar_shape(x): + """ + Assume that `x` is a tensor with one element (regardless of tensor rank). + Return a version of that tensor with rank 0. + """ + x_shape = infer_shape(x) + x_rank = len(x_shape) + + if x_rank == 0: + return x + + num_elem = np.prod(x_shape) + assert num_elem == 1, "Cannot squeeze tensor shape {} to scalar form.".format(x_shape) + + return _op.squeeze(x) + + +def try_resolve_var_to_const(x, graph_params): + """ + Try to resolve the value of tensor `x` to a specific value. + If successful, return a Const op with that value. + If unsuccessful, simply return `x`. 
+ """ + if isinstance(x, _expr.Var) and x.name_hint in graph_params: + value = graph_params[x.name_hint].numpy() + dtype = infer_type(x).checked_type.dtype + return _op.const(value, dtype) + + return x diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index aa185923d02e..bf6293a2a90c 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -896,6 +896,7 @@ def _convert_lstm(inexpr, keras_layer, etab): in_data = _op.squeeze(in_data, axis=[0]) in_data = _op.split(in_data, indices_or_sections=time_steps, axis=0) # loop for the number of time_steps + out_list = [] # store h outputs in case return_sequences is True for data in in_data: ixh1 = _op.nn.dense(data, kernel_weight, units=units) ixh2 = _op.nn.bias_add(_op.nn.dense(next_h, recurrent_weight, units=units), bias=in_bias) @@ -906,8 +907,11 @@ def _convert_lstm(inexpr, keras_layer, etab): next_c = in_transform * next_c + in_gate * _convert_activation(gates[2], keras_layer, None) out_gate = _convert_recurrent_activation(gates[3], keras_layer) next_h = out_gate * _convert_activation(next_c, keras_layer, None) + if keras_layer.return_sequences: + out_list.append(_op.expand_dims(next_h, axis=1)) + out = _op.concatenate(out_list, axis=1) if keras_layer.return_sequences else next_h out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0]) - out = _op.reshape(next_h, newshape=out_shape) + out = _op.reshape(out, newshape=out_shape) return [out, next_h, next_c] diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 86cb178d0875..5c112c7dfce0 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -40,6 +40,7 @@ from .common import ( AttrCvt, Renamer, + ensure_scalar_shape, fold_constant, get_name, get_relay_op, @@ -50,6 +51,7 @@ infer_value, lstm_cell, new_var, + try_resolve_var_to_const, unbind, ) @@ -2695,6 +2697,40 @@ def _impl_v10(cls, inputs, attr, params): 
@classmethod def _impl_v11(cls, inputs, attr, params): + scale = inputs[2] + scale_shape = infer_shape(scale) + if len(inputs) == 4: + assert ( + len(scale_shape) == 0 or scale_shape[0] == 0 + ), "One of scale or size should be passed, not both." + size = inputs[3] + else: + assert len(scale_shape) != 0, "One of scale or size should be passed." + size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale + return cls.v11_13_common(inputs, size, attr, params) + + @classmethod + def _impl_v13(cls, inputs, attr, params): + scale = inputs[2] + size = inputs[3] + if size is not None: + assert scale is None, "One of scale or size should be passed, not both." + else: + scale_type = infer_type(scale) + scale_shape = scale_type.checked_type.shape + scale_dtype = scale_type.checked_type.dtype + assert len(scale_shape) != 0, "One of scale or size should be passed." + size = _op.cast(shape_of(inputs[0]), scale_dtype) * scale + + return cls.v11_13_common(inputs, size, attr, params) + + @classmethod + def v11_13_common(cls, inputs, size, attr, params): + """ + Resize v11 and Resize v13 are identical except in how + they handle the passing of scale and size. This utility + provides the implementation for both + """ ndims = len(infer_shape(inputs[0])) mode = attr.get("mode").decode("ascii") if mode == "nearest": @@ -2713,16 +2749,6 @@ def _impl_v11(cls, inputs, attr, params): alpha = attr.get("cubic_coeff_a", -0.75) exclude = attr.get("exclude_outside", 0) - scale = inputs[2] - scale_shape = infer_shape(scale) - if len(inputs) == 4: - assert ( - len(scale_shape) == 0 or scale_shape[0] == 0 - ), "One of scale or size should be passed, not both." - size = inputs[3] - else: - assert len(scale_shape) != 0, "One of scale or size should be passed." 
- size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale out_size = fold_constant(_op.strided_slice(size, [2], [4])) out = None if ndims == 3: @@ -3506,6 +3532,156 @@ def _impl_v10(cls, inputs, attr, params): return _qnn.op.quantize(out, c_scale, c_zero_point, out_dtype=dtype) +class QLinearMatMul(OnnxOpConverter): + """ + Operator converter for QLinearMatMul from Microsoft onnxruntime contrib opset. + + Limitations: + - Only supports 2D input tensors. + - Not guaranteed to meet the integer-overflow behavior stipulated in the + ONNX documentation for this operator. + """ + + @classmethod + def _impl_v10(cls, inputs, attr, params): + + # Some of the ops used below take scalar-like inputs, and may require either + # of the following: + # + # - the input is Const node (not merely an expression that *could* be reduced + # to a single Const at graph-compilation time) + # + # - the input has a specific dtype + # + # This function attempts to present 'x' in a form that meets both of those + # requirements. + def try_resolve_to_const_scalar(x, dtype_override=None): + x2 = try_resolve_var_to_const(x, params) + x3 = ensure_scalar_shape(x2) + + x_dtype = infer_type(x).checked_type.dtype + if (dtype_override is not None) and (dtype_override != x_dtype): + x4 = _op.cast(x3, dtype_override) + else: + x4 = x3 + + x5 = fold_constant(x4) + return x5 + + # Unpack the inputs and obtain some type info... 
+ a, a_scale, a_zp, b, b_scale, b_zp, y_scale, y_zp = inputs + + a_type = infer_type(a).checked_type # 'T1' in ONNX doc for this op + a_scale_type = infer_type(a_scale).checked_type + a_zp_type = infer_type(a_zp).checked_type + + b_type = infer_type(b).checked_type # 'T2' in ONNX doc for this op + b_scale_type = infer_type(b_scale).checked_type + b_zp_type = infer_type(b_zp).checked_type + + y_scale_type = infer_type(y_scale).checked_type + y_zp_type = infer_type(y_zp).checked_type # 'T3' in ONNX doc for this op + + a_shape = infer_shape(a) + b_shape = infer_shape(b) + + # Verify type assumptions, based on the ONNX doc for this op... + assert a_type.dtype in ["int8", "uint8"] + assert a_scale_type.dtype == "float32" + assert a_zp_type.dtype == a_type.dtype + + assert b_type.dtype in ["int8", "uint8"] + assert b_scale_type.dtype == "float32" + assert b_zp_type.dtype == b_type.dtype + + assert y_scale_type.dtype == "float32" + assert y_zp_type.dtype in ["int8", "uint8"] + + # TODO: relax this limitation in a future version of this importer. + a_rank = len(a_shape) + b_rank = len(b_shape) + assert (a_rank == 2) and (b_rank == 2), ( + "QLinearMatMul importer currently requires both 'a' and 'b' tensors to be 2D, but" + " rank(a)={}, rank(b)={}".format(a_rank, b_rank) + ) + + # _qnn.op.dense requires the zero-point values to have dtype int32. + a_scale_scalar = try_resolve_to_const_scalar(a_scale) + a_zp_scalar = try_resolve_to_const_scalar(a_zp, "int32") + + b_scale_scalar = try_resolve_to_const_scalar(b_scale) + b_zp_scalar = try_resolve_to_const_scalar(b_zp, "int32") + + y_scale_scalar = try_resolve_to_const_scalar(y_scale) + y_zp_scalar = try_resolve_to_const_scalar(y_zp, "int32") + + # TODO: Confirm that we're using 'num_hidden_units' correctly / as intended with + # the '_qnn.op.dense' instance below. 
+ num_hidden_units = infer_shape(b)[-1] + + # - Specify the matmul result dtype as int32, so that hopefully the matmul will use + # a 32-bit accumulator as seems to be required by the ONNX op's documentation. + # + # TL;DR: + # The ONNX documentation for this op is clear about acceptable overflow + # behavior during the matmul operation: + # - The scalar multiplication ops MAY NOT overflow. + # - The scalar addition ops, which sum the results of the scalar multiplication, + # MAY overflow, but if they do so, it must behave as one would expect during + # 32-bit integer-addition overflow. + # As of this writing, Relay's qnn.op.dense operator doesn't expose a way for us to + # express these constraints. + # + # TODO: Extend TVM / Relay / TIR / etc. to allow this kind of constraint to be + # expressed in a Relay graph. And then update this importer and various TVM + # backends accordingly. + matmul_result_dtype = "int32" + + matmul_result = _qnn.op.dense( + a, + _op.transpose(b), + a_zp_scalar, + b_zp_scalar, + a_scale_scalar, + b_scale_scalar, + num_hidden_units, + matmul_result_dtype, + ) + + # This information might only be found in the C++ code-comments for the + # dense.matmul op, but the quantized tensor returned by _qnn.op.dense + # has scale==(a_scale_scalar * b_scale_scalar), and zero_point==0. + # + # 'matmul_result_zp_scalar' has type 'int32' to satisfy input requirements + # of the [de/re]quantize ops below. 
+ matmul_result_scale_scalar = fold_constant(_op.multiply(a_scale_scalar, b_scale_scalar)) + matmul_result_zp_scalar = _op.const(0, dtype="int32") + + # requantize requires y_scale to be constant, + # if y_scale is not constant, doing dequantize -> quantize + if isinstance(y_scale_scalar, _expr.Constant): + y = _qnn.op.requantize( + matmul_result, + matmul_result_scale_scalar, + matmul_result_zp_scalar, + y_scale_scalar, + y_zp_scalar, + axis=-1, + rounding="TONEAREST", + out_dtype=y_zp_type.dtype, + ) + else: + matmul_result_deq = _qnn.op.dequantize( + matmul_result, matmul_result_scale_scalar, matmul_result_zp_scalar, axis=0 + ) + + y = _qnn.op.quantize( + matmul_result_deq, y_scale_scalar, y_zp_scalar, axis=0, out_dtype=y_zp_type.dtype + ) + + return y + + class QLinearMul(OnnxOpConverter): """Operator converter for QLinearMul from Microsoft onnxruntime contrib opset.""" @@ -4234,6 +4410,7 @@ def _get_convert_map(opset): "QLinearConv": QLinearConv.get_converter(opset), "QLinearConcat": QLinearConcat.get_converter(opset), "QLinearAdd": QLinearAdd.get_converter(opset), + "QLinearMatMul": QLinearMatMul.get_converter(opset), "QLinearMul": QLinearMul.get_converter(opset), "QLinearSigmoid": QLinearSigmoid.get_converter(opset), "ConvInteger": ConvInteger.get_converter(opset), diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py index 755c9f70f4ab..c32449546f77 100644 --- a/python/tvm/relay/frontend/paddlepaddle.py +++ b/python/tvm/relay/frontend/paddlepaddle.py @@ -23,6 +23,7 @@ import tvm from tvm.ir import IRModule +from ... import nd as _nd from .. import analysis from .. import ty as _ty from .. 
import expr as _expr @@ -389,8 +390,9 @@ def convert_feed(g, op, block): if block is not None: ipt_name = op.output("Out")[0] - dtype = op.attr("dtype") - dtype = _convert_dtype_value(dtype) + ipt_shape = block.var(ipt_name).shape + ipt_dtype = block.var(ipt_name).dtype + ipt_dtype = str(ipt_dtype).strip().split(".")[1] else: ipt_shape = op.shape ipt_dtype = str(op.dtype).strip().split(".")[1] @@ -953,10 +955,12 @@ def extract_parameters(self, program, scope=None): if not var.persistable: continue if isinstance(scope, dict): - self.params[name] = scope[name] + self.params[name] = _nd.array(scope[name]) else: - self.params[name] = np.array(scope.var(name).get_tensor()) - self.nodes[name] = _expr.const(self.params[name]) + self.params[name] = _nd.array(np.array(scope.var(name).get_tensor())) + shape = self.params[name].shape + dtype = self.params[name].dtype + self.nodes[name] = new_var(name, shape=shape, dtype=dtype) def check_input_shape(self, op, block): """Check the shape information of model's inputs, fixed shape is recommended.""" @@ -1047,6 +1051,12 @@ def from_translated_layer(self, layer, shape_dict): free_vars = analysis.free_vars(outputs) func = _function.Function(free_vars, outputs) mod = IRModule.from_expr(func) + # remove unused parameters + final_params = dict() + for var in free_vars: + if var.name_hint in self.params: + final_params[var.name_hint] = self.params[var.name_hint] + self.params = final_params return mod, self.params @@ -1055,6 +1065,24 @@ def from_paddle(program_or_layer, shape_dict=None, scope=None): PaddlePaddle Program/TranslatedLayer represent the computation graph of PaddlePaddle model, and PaddlePaddle scope stores all the weights of PaddlePaddle model. 
+ + Parameters + ---------- + program_or_layer : object of `paddle.static.Program` or `paddle.jit.TranslatedLayer` + Loaded model by `paddle.static.load_inference_model` or `paddle.jit.load` + + shape_dict : dict of str to tuple/list, optional + The input shape of model + + scope : object of `paddle.static.Scope`, optional + The scope that saves all the weights of model, use `paddle.static.global_scope` by default + + Returns + ------- + mod : tvm.IRModule + The relay module for compilation + + params : dict of str to tvm.nd.NDArray """ import paddle diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index a66fc4736a98..3688ff5ff4e5 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -66,6 +66,7 @@ def __init__(self, model, subgraph, exp_tab): self.activation_fn_type = build_str_map(ActivationFunctionType()) self.builtin_options = build_str_map(BuiltinOptions()) self.prefetched_nodes = {} + self.allow_custom_ops = False # Add more operators self.convert_map = { @@ -287,6 +288,10 @@ def get_op_code_str(self, op): if op_code_id == BuiltinOperator.CUSTOM: # Custom operator custom_op_code_str = self.model.OperatorCodes(op_code_list_idx).CustomCode() + + if self.allow_custom_ops: + return "CUSTOM" + if custom_op_code_str == b"TFLite_Detection_PostProcess": return "DETECTION_POSTPROCESS" @@ -3695,7 +3700,7 @@ def _input_type(model): return shape_dict, dtype_dict -def from_tflite(model, shape_dict=None, dtype_dict=None): +def from_tflite(model, shape_dict=None, dtype_dict=None, op_converter=OperatorConverter): """Convert from tflite model into compatible relay Function. 
Parameters @@ -3755,7 +3760,7 @@ def from_tflite(model, shape_dict=None, dtype_dict=None): exp_tab.set_expr(model_input_name, _expr.var(model_input_name, shape=shape, dtype=dtype)) # op code in model - op_converter = OperatorConverter(model, subgraph, exp_tab) + op_converter = op_converter(model, subgraph, exp_tab) op_converter.check_unsupported_ops() op_converter.convert_op_to_relay() diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 0284d2483ce5..76c806905b18 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -1174,3 +1174,23 @@ def gather_nd_shape_func(attrs, inputs, _): assert index_rank > 0, "index_rank needs to be specified for dynamic gather_nd" return [_gather_nd_shape(inputs[0], inputs[1], convert(batch_dims), convert(index_rank))] + + +@script +def _gather_shape(data_shape, indices_shape, axis): + out_shape = output_tensor((data_shape.shape[0],), "int64") + for i in range(data_shape.shape[0]): + if i != axis: + assert ( + data_shape[i] == indices_shape[i] + ), "data and indices size at non-gather axes must be the same" + out_shape[i] = indices_shape[i] + return out_shape + + +@_reg.register_shape_func("gather", False) +def gather_shape_func(attrs, inputs, _): + """ + Shape func for gather operator. 
+ """ + return [_gather_shape(inputs[0], inputs[1], attrs.axis)] diff --git a/python/tvm/relay/op/contrib/ethosu.py b/python/tvm/relay/op/contrib/ethosu.py index 85ddfd9a7ec8..ca417942840d 100644 --- a/python/tvm/relay/op/contrib/ethosu.py +++ b/python/tvm/relay/op/contrib/ethosu.py @@ -192,11 +192,11 @@ def __init__(self, func_body: tvm.relay.Function): bias_add = requantize_op.args[0] qnn_conv2d = bias_add.args[0] data_layout = qnn_conv2d.attrs.data_layout - kernel_layout = qnn_conv2d.attrs.kernel_layout + self.kernel_layout = qnn_conv2d.attrs.kernel_layout # We consider the weights & biases as params as it should be a Constant self.weights = TensorParams( qnn_conv2d.args[QConv2DArgs.WEIGHTS.value], - kernel_layout, + self.kernel_layout, qnn_conv2d.args[QConv2DArgs.WEIGHTS_SCALE.value], qnn_conv2d.args[QConv2DArgs.WEIGHTS_ZERO_POINT.value], ) @@ -219,16 +219,18 @@ def __init__(self, func_body: tvm.relay.Function): requantize_op.args[RequantArgs.OFM_SCALE.value], requantize_op.args[RequantArgs.OFM_ZERO_POINT.value], ) - self.padding = qnn_conv2d.attrs.padding - self.strides = qnn_conv2d.attrs.strides - self.dilation = qnn_conv2d.attrs.dilation + attrs = qnn_conv2d.attrs + self.padding = attrs.padding + self.strides = attrs.strides + self.dilation = attrs.dilation self.activation = activation + self.channels = attrs.channels # If groups are equal to channel, its a depthwise_conv2d - self.groups = qnn_conv2d.attrs.groups + self.groups = attrs.groups self.is_depthwise = False channels_axis = {"HWIO": 3, "HWOI": 2} - if qnn_conv2d.attrs.groups == self.weights.shape[channels_axis[kernel_layout]]: + if self.groups == self.weights.shape[channels_axis[self.kernel_layout]]: self.is_depthwise = True def is_valid(self) -> bool: @@ -253,10 +255,52 @@ def is_valid(self) -> bool: legal_groups = [1, self.ofm.shape[3]] if self.groups not in legal_groups: return False - # This should be a valid QnnDepthwise2DParams, not QnnConv2DParams + # This should be a valid 
QnnDepthwiseConv2DParams, not QnnConv2DParams return not self.is_depthwise +class QnnDepthwiseConv2DParams(QnnConv2DParams): + """ + This class will parse a call to a ethosu.depthwise_conv2d composite function + and extract the parameter information. + """ + + composite_name = "ethosu.depthwise_conv2d" + # The hardware only supports padding upto the numbers as follows + padding_bounds = [31, 31, 32, 32] + + def __init__(self, func_body: tvm.relay.expr.Call): + QnnConv2DParams.__init__(self, func_body) + + def is_valid(self): + """ + Checks whether QnnDepthwiseConv2D + activation function has compatible attributes with HW + """ + tensor_params = [self.weights, self.ifm, self.ofm] + if not check_valid_dtypes(tensor_params): + return False + if not check_weights(self.weights, self.dilation): + return False + if not check_bias(self.biases): + return False + if not check_strides(self.strides): + return False + if not check_batch_size(self.ifm): + return False + if not check_dilation(self.dilation): + return False + if not check_padding(self.padding, self.padding_bounds): + return False + if self.weights.layout != "HWOI": + return False + # only depth multiplier of size 1 is supported + if self.weights.shape[3] != 1: + return False + if not self.is_depthwise: + return False + return True + + def qnn_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: """ This function creates the pattern for qnn.conv2D with optional fused RELU activation. 
@@ -266,7 +310,22 @@ def qnn_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: ).has_attr({"kernel_layout": "HWIO"}) bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant()) req = is_op("qnn.requantize")( - qnn_conv2d | bias_add, is_constant(), is_constant(), is_constant(), is_constant() + bias_add, is_constant(), is_constant(), is_constant(), is_constant() + ) + clip_or_req = req.optional(is_op("clip")) + return clip_or_req + + +def qnn_depthwise_conv2d_pattern() -> tvm.relay.dataflow_pattern.DFPattern: + """ + This function creates the pattern for depthwise qnn.conv2D with optional fused RELU activation. + """ + qnn_conv2d = is_op("qnn.conv2d")( + wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant() + ).has_attr({"kernel_layout": "HWOI"}) + bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant()) + req = is_op("qnn.requantize")( + bias_add, is_constant(), is_constant(), is_constant(), is_constant() ) clip_or_req = req.optional(is_op("clip")) return clip_or_req @@ -279,7 +338,12 @@ def pattern_table() -> List[Tuple[str, tvm.relay.dataflow_pattern.DFPattern, Cal QnnConv2DParams.composite_name, qnn_conv2d_pattern(), lambda pat: QnnConv2DParams(pat).is_valid(), - ) + ), + ( + QnnDepthwiseConv2DParams.composite_name, + qnn_depthwise_conv2d_pattern(), + lambda pat: QnnDepthwiseConv2DParams(pat).is_valid(), + ), ] diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 5a17db745b3e..1821ff17258a 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -748,7 +748,14 @@ def log_softmax(data, axis=-1): def max_pool1d( - data, pool_size=(1,), strides=(1,), dilation=(1,), padding=(0,), layout="NCW", ceil_mode=False + data, + pool_size=(1,), + strides=(1,), + dilation=(1,), + padding=(0,), + layout="NCW", + out_layout="", + ceil_mode=False, ): r"""1D maximum pooling operator. @@ -783,6 +790,9 @@ def max_pool1d( layout : str, optional Layout of the input. 
+ out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -798,7 +808,9 @@ def max_pool1d( if isinstance(dilation, int): dilation = (dilation,) padding = get_pad_tuple1d(padding) - return _make.max_pool1d(data, pool_size, strides, dilation, padding, layout, ceil_mode) + return _make.max_pool1d( + data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode + ) def max_pool2d( @@ -808,6 +820,7 @@ def max_pool2d( dilation=(1, 1), padding=(0, 0), layout="NCHW", + out_layout="", ceil_mode=False, ): r"""2D maximum pooling operator. @@ -851,6 +864,9 @@ def max_pool2d( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -866,7 +882,9 @@ def max_pool2d( if isinstance(dilation, int): dilation = (dilation, dilation) padding = get_pad_tuple2d(padding) - return _make.max_pool2d(data, pool_size, strides, dilation, padding, layout, ceil_mode) + return _make.max_pool2d( + data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode + ) def max_pool3d( @@ -876,6 +894,7 @@ def max_pool3d( dilation=(1, 1, 1), padding=(0, 0, 0), layout="NCDHW", + out_layout="", ceil_mode=False, ): r"""3D maximum pooling operator. @@ -912,6 +931,9 @@ def max_pool3d( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. 
@@ -927,7 +949,9 @@ def max_pool3d( if isinstance(dilation, int): dilation = (dilation, dilation, dilation) padding = get_pad_tuple3d(padding) - return _make.max_pool3d(data, pool_size, strides, dilation, padding, layout, ceil_mode) + return _make.max_pool3d( + data, pool_size, strides, dilation, padding, layout, out_layout, ceil_mode + ) def avg_pool1d( @@ -937,6 +961,7 @@ def avg_pool1d( dilation=(1,), padding=(0,), layout="NCW", + out_layout="", ceil_mode=False, count_include_pad=False, ): @@ -973,6 +998,9 @@ def avg_pool1d( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -992,7 +1020,15 @@ def avg_pool1d( dilation = (dilation,) padding = get_pad_tuple1d(padding) return _make.avg_pool1d( - data, pool_size, strides, dilation, padding, layout, ceil_mode, count_include_pad + data, + pool_size, + strides, + dilation, + padding, + layout, + out_layout, + ceil_mode, + count_include_pad, ) @@ -1003,6 +1039,7 @@ def avg_pool2d( dilation=(1, 1), padding=(0, 0), layout="NCHW", + out_layout="", ceil_mode=False, count_include_pad=False, ): @@ -1048,6 +1085,9 @@ def avg_pool2d( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -1067,7 +1107,15 @@ def avg_pool2d( dilation = (dilation, dilation) padding = get_pad_tuple2d(padding) return _make.avg_pool2d( - data, pool_size, strides, dilation, padding, layout, ceil_mode, count_include_pad + data, + pool_size, + strides, + dilation, + padding, + layout, + out_layout, + ceil_mode, + count_include_pad, ) @@ -1078,6 +1126,7 @@ def avg_pool3d( dilation=(1, 1, 1), padding=(0, 0, 0), layout="NCDHW", + out_layout="", ceil_mode=False, count_include_pad=False, ): @@ -1115,6 +1164,9 @@ def avg_pool3d( layout : str, optional Layout of the input. 
+ out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -1134,7 +1186,15 @@ def avg_pool3d( dilation = (dilation, dilation, dilation) padding = get_pad_tuple3d(padding) return _make.avg_pool3d( - data, pool_size, strides, dilation, padding, layout, ceil_mode, count_include_pad + data, + pool_size, + strides, + dilation, + padding, + layout, + out_layout, + ceil_mode, + count_include_pad, ) @@ -1145,6 +1205,7 @@ def max_pool2d_grad( strides=(1, 1), padding=(0, 0), layout="NCHW", + out_layout="", ceil_mode=False, ): r"""Gradient of 2D maximum pooling operator. @@ -1171,6 +1232,9 @@ def max_pool2d_grad( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -1179,7 +1243,9 @@ def max_pool2d_grad( result : tvm.relay.Expr The computed result. """ - return _make.max_pool2d_grad(out_grad, data, pool_size, strides, padding, layout, ceil_mode) + return _make.max_pool2d_grad( + out_grad, data, pool_size, strides, padding, layout, out_layout, ceil_mode + ) def avg_pool2d_grad( @@ -1189,6 +1255,7 @@ def avg_pool2d_grad( strides=(1, 1), padding=(0, 0), layout="NCHW", + out_layout="", ceil_mode=False, count_include_pad=False, ): @@ -1216,6 +1283,9 @@ def avg_pool2d_grad( layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + ceil_mode : bool, optional To enable or disable ceil while pooling. @@ -1228,11 +1298,19 @@ def avg_pool2d_grad( The computed result. """ return _make.avg_pool2d_grad( - out_grad, data, pool_size, strides, padding, layout, ceil_mode, count_include_pad + out_grad, + data, + pool_size, + strides, + padding, + layout, + out_layout, + ceil_mode, + count_include_pad, ) -def global_max_pool2d(data, layout="NCHW"): +def global_max_pool2d(data, layout="NCHW", out_layout=""): r"""2D global maximum pooling operator. 
This operator takes data as input and does 2D max value calculation @@ -1258,15 +1336,18 @@ def global_max_pool2d(data, layout="NCHW"): layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + Returns ------- result : tvm.relay.Expr The computed result. """ - return _make.global_max_pool2d(data, layout) + return _make.global_max_pool2d(data, layout, out_layout) -def global_avg_pool2d(data, layout="NCHW"): +def global_avg_pool2d(data, layout="NCHW", out_layout=""): r"""2D global average pooling operator. This operator takes data as input and does 2D average value calculation @@ -1292,12 +1373,15 @@ def global_avg_pool2d(data, layout="NCHW"): layout : str, optional Layout of the input. + out_layout : Optional[str] + Layout of the output + Returns ------- result : tvm.relay.Expr The computed result. """ - return _make.global_avg_pool2d(data, layout) + return _make.global_avg_pool2d(data, layout, out_layout) def upsampling( @@ -3114,7 +3198,7 @@ def space_to_depth(data, block_size, layout="NCHW"): return _make.space_to_depth(data, block_size, layout) -def adaptive_max_pool1d(data, output_size=None, layout="NCW"): +def adaptive_max_pool1d(data, output_size=None, layout="NCW", out_layout=""): r"""1D adaptive max pooling operator. This operator is experimental. This operator takes data as input and does 1D max value calculation @@ -3147,6 +3231,9 @@ def adaptive_max_pool1d(data, output_size=None, layout="NCW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. 
+ Returns ------- result : tvm.relay.Expr @@ -3155,10 +3242,10 @@ def adaptive_max_pool1d(data, output_size=None, layout="NCW"): output_size = [] or output_size if isinstance(output_size, int): output_size = [output_size] - return _make.adaptive_max_pool1d(data, output_size, layout) + return _make.adaptive_max_pool1d(data, output_size, layout, out_layout) -def adaptive_avg_pool1d(data, output_size=None, layout="NCW"): +def adaptive_avg_pool1d(data, output_size=None, layout="NCW", out_layout=""): r"""1D adaptive average pooling operator. This operator is experimental. This operator takes data as input and does 1D average value calculation @@ -3191,6 +3278,9 @@ def adaptive_avg_pool1d(data, output_size=None, layout="NCW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr @@ -3199,10 +3289,10 @@ def adaptive_avg_pool1d(data, output_size=None, layout="NCW"): output_size = [] or output_size if isinstance(output_size, int): output_size = [output_size] - return _make.adaptive_avg_pool1d(data, output_size, layout) + return _make.adaptive_avg_pool1d(data, output_size, layout, out_layout) -def adaptive_max_pool2d(data, output_size=None, layout="NCHW"): +def adaptive_max_pool2d(data, output_size=None, layout="NCHW", out_layout=""): r"""2D adaptive max pooling operator. This operator is experimental. This operator takes data as input and does 2D max value calculation @@ -3238,16 +3328,19 @@ def adaptive_max_pool2d(data, output_size=None, layout="NCHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. 
""" output_size = [] or output_size - return _make.adaptive_max_pool2d(data, output_size, layout) + return _make.adaptive_max_pool2d(data, output_size, layout, out_layout) -def adaptive_avg_pool2d(data, output_size=None, layout="NCHW"): +def adaptive_avg_pool2d(data, output_size=None, layout="NCHW", out_layout=""): r"""2D adaptive average pooling operator. This operator is experimental. This operator takes data as input and does 2D average value calculation @@ -3283,16 +3376,19 @@ def adaptive_avg_pool2d(data, output_size=None, layout="NCHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [] or output_size - return _make.adaptive_avg_pool2d(data, output_size, layout) + return _make.adaptive_avg_pool2d(data, output_size, layout, out_layout) -def adaptive_max_pool3d(data, output_size=None, layout="NCDHW"): +def adaptive_max_pool3d(data, output_size=None, layout="NCDHW", out_layout=""): r"""3D adaptive max pooling operator. This operator is experimental. This operator takes data as input and does 3D max value calculation @@ -3327,16 +3423,19 @@ def adaptive_max_pool3d(data, output_size=None, layout="NCDHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [] or output_size - return _make.adaptive_max_pool3d(data, output_size, layout) + return _make.adaptive_max_pool3d(data, output_size, layout, out_layout) -def adaptive_avg_pool3d(data, output_size=None, layout="NCDHW"): +def adaptive_avg_pool3d(data, output_size=None, layout="NCDHW", out_layout=""): r"""3D adaptive avg pooling operator. This operator is experimental. 
This operator takes data as input and does 3D avg value calculation @@ -3371,16 +3470,19 @@ def adaptive_avg_pool3d(data, output_size=None, layout="NCDHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [] or output_size - return _make.adaptive_avg_pool3d(data, output_size, layout) + return _make.adaptive_avg_pool3d(data, output_size, layout, out_layout) -def global_max_pool1d(data, layout="NCW"): +def global_max_pool1d(data, layout="NCW", out_layout=""): r"""1D global maximum pooling operator. This operator takes data as input and does 1D max value calculation @@ -3403,16 +3505,19 @@ def global_max_pool1d(data, layout="NCW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [1] - return _make.adaptive_max_pool1d(data, output_size, layout) + return _make.adaptive_max_pool1d(data, output_size, layout, out_layout) -def global_avg_pool1d(data, layout="NCW"): +def global_avg_pool1d(data, layout="NCW", out_layout=""): r"""1D global average pooling operator. This operator takes data as input and does 1D average value calculation @@ -3436,16 +3541,19 @@ def global_avg_pool1d(data, layout="NCW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [1] - return _make.adaptive_avg_pool1d(data, output_size, layout) + return _make.adaptive_avg_pool1d(data, output_size, layout, out_layout) -def global_max_pool3d(data, layout="NCDHW"): +def global_max_pool3d(data, layout="NCDHW", out_layout=""): r"""3D global maximum pooling operator. 
This operator takes data as input and does 3D max value calculation @@ -3469,16 +3577,19 @@ def global_max_pool3d(data, layout="NCDHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [1, 1, 1] - return _make.adaptive_max_pool3d(data, output_size, layout) + return _make.adaptive_max_pool3d(data, output_size, layout, out_layout) -def global_avg_pool3d(data, layout="NCDHW"): +def global_avg_pool3d(data, layout="NCDHW", out_layout=""): r"""3D global average pooling operator. This operator takes data as input and does 3D average value calculation @@ -3503,13 +3614,16 @@ def global_avg_pool3d(data, layout="NCDHW"): layout : str, optional Layout of the input. + out_layout : str, optional + Layout of the output. + Returns ------- result : tvm.relay.Expr The computed result. """ output_size = [1, 1, 1] - return _make.adaptive_avg_pool3d(data, output_size, layout) + return _make.adaptive_avg_pool3d(data, output_size, layout, out_layout) def correlation( diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index e8731a0d6954..06dfc87038fe 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -130,9 +130,9 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): elif layout == "NHWC": if "SMLAD" in isa and kernel_layout == "HWOI": strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd), - name="conv2d_direct_simd.micro_dev", + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_direct_simd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_direct_simd), + name="conv2d_nhwc_direct_simd.micro_dev", ) elif kernel_layout == "HWIO": is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 
e47928919ce1..e615bbf21b86 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -1070,7 +1070,7 @@ def fixed_point_multiply(data, multiplier, shift): The input tensor. multiplier : int The integer multiplier of the fixed point constant. - a_max : float + shift : int The integer shift of the fixed point constant. Returns diff --git a/python/tvm/rpc/proxy.py b/python/tvm/rpc/proxy.py index e5ec73db51b9..c3b0056eb591 100644 --- a/python/tvm/rpc/proxy.py +++ b/python/tvm/rpc/proxy.py @@ -379,7 +379,7 @@ def _update_tracker(self, period_update=False): if need_update_info: keylist = "[" + ",".join(self._key_set) + "]" - cinfo = {"key": "server:proxy" + keylist} + cinfo = {"key": "server:proxy" + keylist, "addr": [None, self._listen_port]} base.sendjson(self._tracker_conn, [TrackerCode.UPDATE_INFO, cinfo]) assert base.recvjson(self._tracker_conn) == TrackerCode.SUCCESS self._tracker_pending_puts = [] diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index 74c1f7ac07aa..5a576a705e8a 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -337,9 +337,10 @@ def request(self, key, user, priority, callback): def close(self, conn): self._connections.remove(conn) if "key" in conn._info: - key = conn._info["key"].split(":")[1] # 'server:rasp3b' -> 'rasp3b' for value in conn.put_values: - self._scheduler_map[key].remove(value) + _, _, _, key = value + rpc_key = key.split(":")[0] + self._scheduler_map[rpc_key].remove(value) def stop(self): """Safely stop tracker.""" diff --git a/python/tvm/script/context_maintainer.py b/python/tvm/script/context_maintainer.py index 75566cf6e2c5..56d080857a7d 100644 --- a/python/tvm/script/context_maintainer.py +++ b/python/tvm/script/context_maintainer.py @@ -22,8 +22,10 @@ import tvm from tvm.ir import Span +from tvm.ir.expr import Range from tvm.tir import Var, Buffer, PrimExpr, Stmt, MatchBufferRegion from tvm.runtime import Object +from tvm.tir.expr import IterVar from .tir.node 
import BufferSlice @@ -41,10 +43,10 @@ def example_func(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(a, (16, 16), "float32") for i, j, k in T.grid(16, 16, 16): - with T.block([16, 16, T.reduce_axis(16)], "matmul") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) # iter_bindings = {vj: i, vj: j, vk: k} + with T.block("matmul"): + vi = T.axis.S(16, i) + vj = T.axis.S(16, j) + vk = T.axis.R(16, k) # iter_bindings = {vj: i, vj: j, vk: k} T.where(True) # predicate of the block_realize @@ -72,8 +74,10 @@ def example_func(a: T.handle, b: T.handle, c: T.handle) -> None: """List[Buffer]: list of T.alloc_buffer statements in the block signature""" match_buffers: List[MatchBufferRegion] = [] """List[MatchBufferRegion]: list of T.match_buffer statements in the block signature""" - iter_bindings: Mapping[Var, PrimExpr] = {} - """Mapping[Var, PrimExpr]: map of block iter var to its values""" + iter_values: List[PrimExpr] = [] + """List[PrimExpr]: list of binding values for iter vars""" + iter_vars: List[IterVar] = [] + """List[PrimExpr]: list of iter vars in the block""" reads: Optional[List[BufferSlice]] = None """Optional[List[BufferSlice]]: list of T.reads statements in the block signature, None for not-visited""" @@ -91,7 +95,8 @@ def example_func(a: T.handle, b: T.handle, c: T.handle) -> None: def __init__(self): self.alloc_buffers = [] self.match_buffers = [] - self.iter_bindings = {} + self.iter_values = [] + self.iter_vars = [] self.reads = None self.writes = None self.annotations = None @@ -112,8 +117,8 @@ class ContextMaintainer: """List[List[synr.ast.Node]]: The ast nodes insides the current scope""" block_info_stack: List[BlockInfo] = [] """List[BlockInfo]: The block info for the current block scope""" - loop_stack: List[List[Var]] = [] - """List[List[Var]]: List of loop vars inside the current block scope""" + loop_stack: Dict[Var, Range] = {} + """Dict[Var, Range]: The dict from loop var to its domain outside the block""" 
symbols: List[Dict[str, Union[Var, Buffer]]] = [] """List[Dict[str, Union[Var, Buffer]]]: Symbol map from name to object for the current scope""" @@ -137,7 +142,7 @@ def __init__(self, _report_error: Callable[[str, Union[Span, synr.ast.Span]], No # scope context self.node_stack = [] self.block_info_stack = [] - self.loop_stack = [] + self.loop_stack = {} self.symbols = [] # function context self.func_params = [] @@ -183,8 +188,6 @@ def enter_block_scope(self, nodes: Optional[List[synr.ast.Node]] = None): The synr AST nodes in new scope """ self.enter_scope(nodes) - # Create a new loop stack for the new block - self.loop_stack.append([]) # Create a new BlockInfo for the new block self.block_info_stack.append(BlockInfo()) @@ -196,8 +199,6 @@ def exit_scope(self): def exit_block_scope(self): """Pop the inner most block scope, the function will call `exit_scope` implicitly""" self.exit_scope() - # Pop loop stack - self.loop_stack.pop() # Pop block_info self.block_info_stack.pop() diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index d5e79e8676c1..8610d91e9f07 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -377,12 +377,13 @@ def A(): """ if len(node.assignments) == 1: if not ( - isinstance(node.assignments[0].lhs, ast.Var) - and node.assignments[0].lhs.id.name == "__tvm_meta__" + len(node.assignments[0].lhs) == 1 + and isinstance(node.assignments[0].lhs[0], ast.Var) + and node.assignments[0].lhs[0].id.name == "__tvm_meta__" ): self.report_error( "The only top level assignments allowed are `__tvm_meta__ = ...`", - node.assignments[0].lhs.span, + node.assignments[0].span, ) self.init_meta( MetaUnparser().do_transform(node.assignments[0].rhs, self._diagnostic_context) @@ -526,18 +527,19 @@ def transform_Assign(self, node): return self.parse_body(node) else: value = self.transform(node.rhs) - if not isinstance(node.lhs, ast.Var): + if len(node.lhs) == 1 and not isinstance(node.lhs[0], ast.Var): # This is a little 
confusing because it only is true when # we have taken this branch. We might need to clarify what # exectly is allowed in Assignments in tvmscript. self.report_error( "Left hand side of assignment must be an unqualified variable", - node.lhs.span, + node.span, ) + ast_var = node.lhs[0] var = tvm.te.var( - node.lhs.id.name, - self.parse_type(node.ty, node.lhs), - span=tvm_span_from_synr(node.lhs.span), + ast_var.id.name, + self.parse_type(node.ty, ast_var), + span=tvm_span_from_synr(ast_var.span), ) self.context.update_symbol(var.name, var, node) body = self.parse_body(node) @@ -596,7 +598,7 @@ def transform_For(self, node): For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) By now 1 pattern of For is supported: 1. for scope handler - for name in T.serial()/T.parallel()/T.vectorized()/T.unroll()/tir.range()/ + for name in T.serial()/T.parallel()/T.vectorized()/T.unroll()/range()/ T.grid()/T.thread_binding() """ @@ -892,9 +894,20 @@ def transform_Attr(self, node): namespace. """ - if isinstance(node.object, ast.Var): - if self.match_tir_namespace(node.object.id.name): - func_name = "tir." + node.field.name + def get_full_attr_name(node: ast.Attr) -> str: + reverse_field_names = [node.field.name] + while isinstance(node.object, ast.Attr): + node = node.object + reverse_field_names.append(node.field.name) + if isinstance(node.object, ast.Var): + reverse_field_names.append(node.object.id.name) + return ".".join(reversed(reverse_field_names)) + + if isinstance(node.object, (ast.Var, ast.Attr)): + full_attr_name = get_full_attr_name(node) + attr_object, fields = full_attr_name.split(".", maxsplit=1) + if self.match_tir_namespace(attr_object): + func_name = "tir." 
+ fields res = Registry.lookup(func_name) if res is not None: return res @@ -903,9 +916,7 @@ def transform_Attr(self, node): except TVMError as e: # Check if we got an attribute error if e.args[0].find("AttributeError"): - self.report_error( - f"Unregistered function `tir.{node.field.name}`.", node.field.span - ) + self.report_error(f"Unregistered function `tir.{fields}`.", node.span) else: raise e diff --git a/python/tvm/script/tir/scope_handler.py b/python/tvm/script/tir/scope_handler.py index 487a71d4f077..4750ad7626e2 100644 --- a/python/tvm/script/tir/scope_handler.py +++ b/python/tvm/script/tir/scope_handler.py @@ -134,12 +134,14 @@ def enter_scope( if isinstance(node, synr.ast.With): vars = WithScopeHandler.get_optional_vars(node, context) if len(vars) != 1: - context.report_error("Unexpected number of vars", node.span) + context.report_error(f"Unexpected number of vars: 1 vs. {len(vars)}", node.span) name = vars[0].id.name var_span = vars[0].id.span elif isinstance(node, synr.ast.Assign): - name = node.lhs.id.name - var_span = node.lhs.id.span + if len(node.lhs) != 1: + context.report_error(f"Unexpected number of vars: 1 vs. {len(node.lhs)}", node.span) + name = node.lhs[0].id.name + var_span = node.lhs[0].id.span else: raise Exception("Internal Bug") @@ -247,42 +249,16 @@ def let(var, value, span): @register class Block(WithScopeHandler): - """With scope handler T.block(extents, name) as iter_vars""" + """With scope handler T.block(name)""" def __init__(self): - def block(axes=None, name_hint: str = "", span: Optional[Span] = None): + def block(name_hint: str = "", span: Optional[Span] = None): assert ( self.node and self.context and self.body ), "call 'exit_scope' before 'enter_scope'" block_info = self.context.block_info_stack[-1] - if axes is None: - axes = [] - if len(axes) != len(self.block_vars): - self.context.report_error( - "Inconsistent number of block vars, " - + f"there are {len(axes)} axes but {len(self.block_vars)} block vars. 
" - + "The number of block vars should match the number of axes.", - self.node.span, - ) - block_iters: List[IterVar] = [] - for i, axis in enumerate(axes): - axis = tvm.runtime.convert(axis) - if isinstance(axis, tvm.tir.PrimExpr): - block_var_dom = Range.from_min_extent(0, axis) - block_iters.append(IterVar(block_var_dom, self.block_vars[i], 0)) - elif isinstance(axis, Range): - block_iters.append(IterVar(axis, self.block_vars[i], 0)) - elif isinstance(axis, IterVar): - block_iters.append(IterVar(axis.dom, self.block_vars[i], axis.iter_type)) - else: - self.context.report_error( - "Invalid argument of T.block(), " - + f"expected PrimExpr, Range or IterVar, but got {type(axis)}", - self.node.span, - ) # create block read/write regions - reads: List[BufferRegion] = ( [buffer_slice_to_region(read) for read in block_info.reads] if block_info.reads @@ -301,7 +277,7 @@ def block(axes=None, name_hint: str = "", span: Optional[Span] = None): if region_detect_mask != 0: annotations["tir.script_parsing_detect_access"] = region_detect_mask inner = tvm.tir.Block( - block_iters, + block_info.iter_vars, reads, writes, name_hint, @@ -312,35 +288,13 @@ def block(axes=None, name_hint: str = "", span: Optional[Span] = None): annotations, span, ) - # create block var iter binding - values: List[PrimExpr] - if not block_info.iter_bindings: - values = self.context.loop_stack[-2].copy() - if len(block_iters) == 0: - # It is an opaque block without any bindings - values = [] - elif len(values) == 0: - values = [tvm.tir.const(float("nan"), dtype="float32")] * len(block_iters) - elif len(values) != len(block_iters): - self.context.report_error( - "Number of block iter var and outer loop nesting mismatch, " - + f"{len(block_iters)} block iter vars but {len(values)} loops", - self.node.span, - ) - else: - for block_var in self.block_vars: - if block_var not in block_info.iter_bindings: - self.context.report_error( - "Missing block iter var binding for " + block_var.name, - self.node.span, 
- ) - values = [block_info.iter_bindings[block_var] for block_var in self.block_vars] + assert len(block_info.iter_vars) == len(block_info.iter_values) predicate = ( tvm.tir.const(True, "bool") if block_info.predicate is None else block_info.predicate ) - body = tvm.tir.BlockRealize(values, predicate, inner, span) + body = tvm.tir.BlockRealize(block_info.iter_values, predicate, inner, span) return body super().__init__(func=block, concise_scope=False, def_symbol=True) @@ -358,10 +312,13 @@ def enter_scope( node, synr.ast.With ), f"BlockScopeHandler expected to work on synr.ast.With but got {type(node)}" - vars = WithScopeHandler.get_optional_vars(node, context) - self.block_vars = [tvm.te.var(var.id.name) for var in vars] - for block_var in self.block_vars: - context.update_symbol(block_var.name, block_var, node) + optional_vars = [var.id.name for var in WithScopeHandler.get_optional_vars(node, context)] + if optional_vars: + context.report_error( + f"Block expected no optional_vars (e.g., `x` in `with block() as x`), " + f"but got {optional_vars}", + node.span, + ) @register @@ -378,12 +335,38 @@ def init(span: Span = None): super().__init__(func=init, concise_scope=False, def_symbol=True) +class LoopInfo: + """Helper class for loop information""" + + loop_var: Var + begin: PrimExpr + extent: PrimExpr + kind: ForKind + thread_binding: Optional[str] + annotations: Optional[Mapping[str, Object]] + + def __init__( + self, + begin: PrimExpr, + extent: PrimExpr, + kind: ForKind, + thread_binding: Optional[str] = None, + annotations: Optional[Mapping[str, Object]] = None, + ) -> None: + self.begin = begin + self.extent = extent + self.kind = kind + self.thread_binding = thread_binding + self.annotations = annotations + + class ForScopeHandler(ScopeHandler): """Base class for all for scope handlers""" def __init__(self, func): super().__init__(func) - self.loop_vars: Optional[List[Var]] = None + self.loop_vars: List[Var] = [] + self.loop_info: List[LoopInfo] = [] def 
enter_scope( self, @@ -415,12 +398,23 @@ def enter_scope( span, ) + self.node = node + self.context = context + # generate loop vars self.loop_vars = [ tvm.te.var(name, dtype="int32", span=span) for name, span in zip(loop_var_names, spans) ] - for loop_var in self.loop_vars: + # collect loop infos by calling self.func + call_with_error_reporting(context.report_error, span, self.func, *arg_list) + if len(self.loop_vars) != len(self.loop_info): + self.context.report_error( + f"Inconsistent number of vars and loops, got {len(self.loop_vars)} " + + f"vs {len(self.loop_info)}", + self.node.span, + ) + for loop_var, loop_info in zip(self.loop_vars, self.loop_info): context.update_symbol(loop_var.name, loop_var, node) - context.loop_stack[-1].append(loop_var) + context.loop_stack[loop_var] = Range.from_min_extent(loop_info.begin, loop_info.extent) def exit_scope( self, @@ -430,19 +424,34 @@ def exit_scope( span: synr.ast.Span, ): assert self.loop_vars, "call 'exit_scope' before 'enter_scope'" - for _ in self.loop_vars: - context.loop_stack[-1].pop() - return super().exit_scope(node, context, arg_list, span) + for loop_var in self.loop_vars: + context.loop_stack.pop(loop_var) + # Use assert here since we have check it in `enter_scope` + assert len(self.loop_vars) == len(self.loop_info) + + body = self.body + for var, info in zip(reversed(self.loop_vars), reversed(self.loop_info)): + body = tvm.tir.For( + var, + info.begin, + info.extent, + info.kind, + body, + info.thread_binding, + info.annotations, + span=tvm_span_from_synr(span), + ) - def create_loop( + return body + + def create_loop_info( self, begin: PrimExpr, end: PrimExpr, kind: ForKind, thread_binding: Optional[str] = None, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, - ) -> tvm.tir.For: + ) -> None: """ Helper function for creating For in TVM Script parser. @@ -471,30 +480,16 @@ def create_loop( for : For The constructed For. 
""" - assert ( - self.loop_vars and self.context and self.node - ), "call 'exit_scope' before 'enter_scope'" - if len(self.loop_vars) != 1: - self.context.report_error( - f"Expected exactly one loop var, but got {self.loop_vars}", self.node.span - ) + assert self.context and self.node, "call 'exit_scope' before 'enter_scope'" extent = end if begin == 0 else self.context.analyzer.simplify(end - begin) - annos: Mapping[str, Object] = {} + self.annotations: Mapping[str, Object] = {} if annotations is not None: - annos = { + self.annotations = { key: tvm.tir.StringImm(val) if isinstance(val, str) else val for key, val in annotations.items() } - return tvm.tir.For( - self.loop_vars[0], - begin, - extent, - kind, - self.body, - thread_binding=thread_binding, - annotations=annos, - span=span, - ) + + self.loop_info.append(LoopInfo(begin, extent, kind, thread_binding, annotations)) @register @@ -506,9 +501,8 @@ def serial( begin: PrimExpr, end: PrimExpr, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): - return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) + self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations) super().__init__(serial) @@ -522,11 +516,8 @@ def parallel( begin: PrimExpr, end: PrimExpr, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): - return self.create_loop( - begin, end, ForKind.PARALLEL, annotations=annotations, span=span - ) + self.create_loop_info(begin, end, ForKind.PARALLEL, annotations=annotations) super().__init__(parallel) @@ -540,11 +531,8 @@ def vectorized( begin: PrimExpr, end: PrimExpr, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): - return self.create_loop( - begin, end, ForKind.VECTORIZED, annotations=annotations, span=span - ) + self.create_loop_info(begin, end, ForKind.VECTORIZED, annotations=annotations) super().__init__(vectorized) @@ -558,11 +546,8 @@ def unroll( 
begin: PrimExpr, end: PrimExpr, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): - return self.create_loop( - begin, end, ForKind.UNROLLED, annotations=annotations, span=span - ) + self.create_loop_info(begin, end, ForKind.UNROLLED, annotations=annotations) super().__init__(unroll) @@ -577,16 +562,14 @@ def thread_binding( end: PrimExpr, thread: str, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): - thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread, span=span) - return self.create_loop( + thread_iter_var = IterVar(None, None, IterVar.ThreadIndex, thread) + self.create_loop_info( begin, end, ForKind.THREAD_BINDING, thread_binding=thread_iter_var, annotations=annotations, - span=span, ) super().__init__(thread_binding) @@ -603,12 +586,11 @@ def for_range( begin: PrimExpr, end: PrimExpr = None, annotations: Optional[Mapping[str, Object]] = None, - span: Optional[Span] = None, ): if end is None: end = begin begin = 0 - return self.create_loop(begin, end, ForKind.SERIAL, annotations=annotations, span=span) + self.create_loop_info(begin, end, ForKind.SERIAL, annotations=annotations) super().__init__(for_range) @@ -621,19 +603,8 @@ class Grid(ForScopeHandler): """For scope handler T.grid(extents)""" def __init__(self): - def grid(*extents: List[PrimExpr], span: Span): - assert ( - self.node and self.context and self.loop_vars - ), "call 'exit_scope' before 'enter_scope'" - if len(self.loop_vars) != len(extents): - self.context.report_error( - "Inconsistent number of loop vars and extents, " - + f"got {len(self.loop_vars)} vs {len(extents)}", - self.node.span, - ) - body = self.body - for loop_var, extent in zip(reversed(self.loop_vars), reversed(extents)): - body = tvm.tir.For(loop_var, 0, extent, ForKind.SERIAL, body, span=span) - return body + def grid(*extents: List[PrimExpr]): + for extent in extents: + self.create_loop_info(0, extent, ForKind.SERIAL) super().__init__(grid) diff 
--git a/python/tvm/script/tir/special_stmt.py b/python/tvm/script/tir/special_stmt.py index 69cf15f493de..de212352f3e4 100644 --- a/python/tvm/script/tir/special_stmt.py +++ b/python/tvm/script/tir/special_stmt.py @@ -21,17 +21,18 @@ import synr from synr import ast +from tvm.ir.expr import PrimExpr, Range import tvm.tir from tvm.runtime import Object from tvm import te from tvm.ir import Span -from tvm.tir import IntImm +from tvm.tir import IntImm, IterVar from .node import BufferSlice from .utils import buffer_slice_to_region -from ..context_maintainer import ContextMaintainer +from ..context_maintainer import BlockInfo, ContextMaintainer from ..registry import register from ..utils import ( get_param_list, @@ -132,9 +133,10 @@ def match_buffer( buffer_type="default", span=None, ): - if not isinstance(self.node, ast.Assign): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: self.context.report_error( - "match_buffer must be assigned to a buffer, e.g. A = match_buffer(...)", + "`match_buffer` must be assigned to a single buffer, " + "e.g. A = match_buffer(...)", self.node.span, ) if strides is None: @@ -143,10 +145,11 @@ def match_buffer( offset_factor = convert_to_int( offset_factor, "offset_factor", self.context.report_error, self.node.span ) + buffer_name: str = self.node.lhs[0].id.name buffer = tvm.tir.decl_buffer( shape, dtype, - self.node.lhs.id.name, + buffer_name, data, strides, elem_offset, @@ -173,7 +176,7 @@ def match_buffer( + str(type(param)), self.node.rhs.params[0].span, ) - self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + self.context.update_symbol(buffer_name, buffer, self.node) super().__init__(match_buffer, def_symbol=True) @@ -201,9 +204,9 @@ def buffer_decl( buffer_type="default", span=None, ): - if not isinstance(self.node, ast.Assign): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: self.context.report_error( - "buffer_decl must be assigned to a buffer, e.g. 
A = buffer_decl(...)", + "`buffer_decl` must be assigned to a single buffer, e.g. A = buffer_decl(...)", self.node.span, ) @@ -213,10 +216,11 @@ def buffer_decl( offset_factor = convert_to_int( offset_factor, "offset_factor", self.context.report_error, self.node.span ) + buffer_name: str = self.node.lhs[0].id.name buffer = tvm.tir.decl_buffer( shape, dtype, - self.node.lhs.id.name, + buffer_name, data, strides, elem_offset, @@ -226,7 +230,7 @@ def buffer_decl( buffer_type, span=span, ) - self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + self.context.update_symbol(buffer_name, buffer, self.node) return buffer super().__init__(buffer_decl, def_symbol=True) @@ -257,9 +261,10 @@ def alloc_buffer( buffer_type="default", span=None, ): - if not isinstance(self.node, ast.Assign): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: self.context.report_error( - "alloc_buffer must be assigned to a buffer, e.g. A = alloc_buffer(...)", + "`alloc_buffer` must be assigned to a single buffer, " + "e.g. A = alloc_buffer(...)", self.node.span, ) @@ -269,10 +274,11 @@ def alloc_buffer( offset_factor = convert_to_int( offset_factor, "offset_factor", self.context.report_error, self.node.span ) + buffer_name: str = self.node.lhs[0].id.name buffer = tvm.tir.decl_buffer( shape, dtype, - self.node.lhs.id.name, + buffer_name, data, strides, elem_offset, @@ -283,32 +289,11 @@ def alloc_buffer( span=span, ) self.context.current_block_scope().alloc_buffers.append(buffer) - self.context.update_symbol(self.node.lhs.id.name, buffer, self.node) + self.context.update_symbol(buffer_name, buffer, self.node) super().__init__(alloc_buffer, def_symbol=True) -@register -class BlockVarBind(SpecialStmt): - """Special function bind(block_iter, binding_value) - - Example - ------- - .. 
code-block:: python - - T.bind(vx, i) - """ - - def __init__(self): - def bind(iter_var, values, span=None): - block_scope = self.context.current_block_scope() - if iter_var in block_scope.iter_bindings: - self.context.report_error("Duplicate iter_var bindings of " + str(iter_var), span) - block_scope.iter_bindings[iter_var] = values - - super().__init__(bind, def_symbol=False) - - @register class BlockReads(SpecialStmt): """Special function reads([read_buffer_regions]) @@ -412,6 +397,315 @@ def block_attr(attrs: Mapping[str, Object], span: Span = None): super().__init__(block_attr, def_symbol=False) +class BlockAxis(SpecialStmt): + """Special stmt for defining a spatial block axis + axis.S(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.S(128, i * 4 + j) + """ + + def axis( + self, + var_name: str, + dom: Union[PrimExpr, Range], + value: PrimExpr, + iter_type: int, + span: Optional[Span] = None, + ) -> None: + """ + Helper function for creating block axis + + Parameters + ---------- + var_name : str + The name_hint of var + + dom : Union[PrimExpr, Range] + The iter domain. + + value : PrimExpr + The binding value + + iter_type : int + The iteration type. + + span : Optional[Span] + The location of this for in the source code. 
+ """ + assert self.context, "call 'exit_scope' before 'enter_scope'" + block_scope: BlockInfo = self.context.current_block_scope() + if var_name in [iter_var.var.name for iter_var in block_scope.iter_vars]: + self.context.report_error("Duplicate block axis " + var_name, self.node.span) + + block_var = tvm.tir.Var(var_name, dtype="int32") + dom = tvm.runtime.convert(dom) + if isinstance(dom, PrimExpr): + dom = tvm.ir.Range.from_min_extent(0, dom) + elif not isinstance(dom, tvm.ir.Range): + self.context.report_error( + f"Block axis domain expected PrimExpr or Range, but got {type(value)}", + self.node.span, + ) + value = tvm.runtime.convert(value) + if not isinstance(value, PrimExpr): + self.context.report_error( + f"Block axis value expected PrimExpr, but got {type(value)}", + self.node.span, + ) + iter_var = tvm.tir.IterVar(dom, block_var, iter_type) + block_scope.iter_vars.append(iter_var) + block_scope.iter_values.append(value) + self.context.update_symbol(var_name, block_var, self.node) + + +@register +class BlockAxisSpatial(BlockAxis): + """Special stmt for defining a spatial block axis + axis.spatial(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.spatial(128, k) + """ + + def __init__(self): + def axis_spatial( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.spatial` must be assigned to a var, e.g. vi = axis.spatial(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DataPar) + + super().__init__(axis_spatial, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.spatial", get_param_list(self.func) + + +@register +class BlockAxisS(BlockAxis): + """The sugar special stmt for defining a spatial block axis + axis.S(dom, iter_value) + + Example + ------- + .. 
code-block:: python + + vi = T.axis.S(128, k) + """ + + def __init__(self): + def axis_spatial( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.S` must be assigned to a var, e.g. vi = axis.S(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DataPar) + + super().__init__(axis_spatial, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.S", get_param_list(self.func) + + +@register +class BlockAxisReduce(BlockAxis): + """Special stmt for defining a reduce block axis + axis.reduce(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.reduce(128, k) + """ + + def __init__(self): + def axis_reduce( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.reduce` must be assigned to a var, e.g. vi = axis.reduce(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.CommReduce) + + super().__init__(axis_reduce, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.reduce", get_param_list(self.func) + + +@register +class BlockAxisR(BlockAxis): + """The sugar special stmt for defining a reduce block axis + axis.R(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.R(128, k) + """ + + def __init__(self): + def axis_reduce( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.R` must be assigned to a var, e.g. 
vi = axis.R(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.CommReduce) + + super().__init__(axis_reduce, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.R", get_param_list(self.func) + + +@register +class BlockAxisScan(BlockAxis): + """Special stmt for defining a ordered block axis + axis.scan(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.scan(128, k) + """ + + def __init__(self): + def axis_scan( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.scan` must be assigned to a var, e.g. vi = axis.scan(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.Ordered) + + super().__init__(axis_scan, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.scan", get_param_list(self.func) + + +@register +class BlockAxisOpaque(BlockAxis): + """Special stmt for defining a opaque block axis + axis.opaque(dom, iter_value) + + Example + ------- + .. code-block:: python + + vi = T.axis.opaque(128, k) + """ + + def __init__(self): + def axis_opaque( + dom: Union[PrimExpr, Tuple[PrimExpr, PrimExpr]], value: PrimExpr, span: Span = None + ): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) == 1: + self.context.report_error( + "`axis.opaque` must be assigned to a var, e.g. vi = axis.opaque(...)", + self.node.span, + ) + self.axis(self.node.lhs[0].id.name, dom, value, IterVar.DimInfo) + + super().__init__(axis_opaque, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.opaque", get_param_list(self.func) + + +@register +class BlockAxisRemap(BlockAxis): + """Special stmt for remapping loops vars to block axes. 
+ axis.remap(iter_type, iter_value) + + Note + ---- + Iter_type is a string consisting of 'S' and 'R', where 'S' means + for spatial and 'R' means for reduce. + + Example + ------- + .. code-block:: python + + vi, vj = T.axis.remap("SS", [i, j]) + """ + + def __init__(self): + def axis_remap(iter_types: str, loop_vars: List[tvm.tir.expr.Var], span: Span = None): + if not isinstance(self.node, ast.Assign) or not len(self.node.lhs) >= 1: + self.context.report_error( + "`axis.remap` must be assigned to one or more vars, " + "e.g. vi, vj = axis.remap(...)", + self.node.span, + ) + var_num: int = len(self.node.lhs) + if var_num != len(iter_types): + self.context.report_error( + f"`iter_type` expected {var_num} character(s), " + f"but got {len(iter_types)}: {iter_types}", + span, + ) + if var_num != len(loop_vars): + self.context.report_error( + f"`iter_type` expected {var_num} loop var(s), " + f"but got {len(loop_vars)}: {loop_vars}", + span, + ) + for var, iter_ty, loop_var in zip(self.node.lhs, iter_types, loop_vars): + iter_type: int + if iter_ty == "S": + iter_type = IterVar.DataPar + elif iter_ty == "R": + iter_type = IterVar.CommReduce + else: + self.context.report_error( + f'`iter_type` only expected "S" (for spatial) or "R" (for reduce), ' + f'but got "{iter_ty}"', + span, + ) + + if not isinstance(loop_var, tvm.tir.expr.Var): + self.context.report_error( + f"Values of `axis.remap` expected single loop var, but got {loop_var}", + loop_var.span, + ) + loops = self.context.loop_stack + if loop_var not in loops: + self.context.report_error( + f"Cannot find loop var {loop_var} in loop nesting.", + span, + ) + self.axis(var.id.name, loops[loop_var], loop_var, iter_type) + + super().__init__(axis_remap, def_symbol=True) + + def signature(self) -> Tuple[str, Tuple[list, list, Any]]: + return "tir.axis.remap", get_param_list(self.func) + + @register class BlockPredicate(SpecialStmt): """Special function where(predicate) @@ -449,7 +743,12 @@ def var(dtype, span): assert
isinstance( self.node, ast.Assign ), f"VarDef expected ast.Assign but got {type(self.node)}" - v = te.var(self.node.lhs.id.name, dtype, span=span) + names = [x.id.name for x in self.node.lhs] + if len(names) != 1: + self.context.report_error( + f"VarDef expected assign to only one var, but got {names}", span + ) + v = te.var(names[0], dtype, span=span) self.context.update_symbol(v.name, v, self.node) super().__init__(var, def_symbol=True) @@ -464,8 +763,13 @@ def buffer_var(dtype, storage_scope, span): assert isinstance( self.node, ast.Assign ), f"BufferVarDef expected ast.Assign but got {type(self.node)}" + names = [x.id.name for x in self.node.lhs] + if len(names) != 1: + self.context.report_error( + f"VarDef expected assign to only one var, but got {names}", span + ) ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype), storage_scope) - v = te.var(self.node.lhs.id.name, ptr_type, span=span) + v = te.var(names[0], ptr_type, span=span) self.context.update_symbol(v.name, v, self.node) super().__init__(buffer_var, def_symbol=True) @@ -480,7 +784,12 @@ def env_thread(env_name, span): assert isinstance( self.node, ast.Assign ), f"EnvThread expected ast.Assign but got {type(self.node)}" - v = te.var(self.node.lhs.id.name, span=span) + names = [x.id.name for x in self.node.lhs] + if len(names) != 1: + self.context.report_error( + f"VarDef expected assign to only one var, but got {names}", span + ) + v = te.var(names[0], span=span) self.context.func_var_env_dict[v] = env_name self.context.update_symbol(v.name, v, self.node) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 4e5826f5b2a2..9af09296e9cc 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -31,6 +31,11 @@ class TargetKind(Object): """Kind of a compilation target""" + @property + def options(self): + """Returns the dict of available option names and types""" + return dict(_ffi_api.ListTargetKindOptions(self)) + @tvm._ffi.register_object class Target(Object): 
diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 681e322b2082..cb0305d49e4a 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -467,10 +467,12 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128, T.reduce_axis(0, 128)]) as [i, j, k]: - with T.init(): - C[i, j] = 0.0 - C[i, j] += A[i, k] * B[j, k] + for i, j, k in T.grip(128, 128, 128): + with T.block(): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] += A[vi, vk] * B[vj, vk] Returns ------- diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index 0413c44208b0..2cb228c357e5 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -74,6 +74,7 @@ def pytest_collection_modifyitems(config, items): # pylint: disable=unused-argument _count_num_fixture_uses(items) _remove_global_fixture_definitions(items) + _sort_tests(items) @pytest.fixture @@ -236,6 +237,25 @@ def _remove_global_fixture_definitions(items): delattr(module, name) +def _sort_tests(items): + """Sort tests by file/function. + + By default, pytest will sort tests to maximize the re-use of + fixtures. However, this assumes that all fixtures have an equal + cost to generate, and no caches outside of those managed by + pytest. A tvm.testing.parameter is effectively free, while + reference data for testing may be quite large. Since most of the + TVM fixtures are specific to a python function, sort the test + ordering by python function, so that + tvm.testing.utils._fixture_cache can be cleared sooner rather than + later. + + Should be called from pytest_collection_modifyitems. 
+ + """ + items.sort(key=lambda item: item.location) + + def _target_to_requirement(target): if isinstance(target, str): target = tvm.target.Target(target) diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index 6a90924912b1..b002ace0e400 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -108,8 +108,10 @@ def mem_copy(a: T.handle, b: T.handle, m: T.int32, n: T.int32) -> None: A = T.match_buffer(a, (m, n), "float32") B = T.match_buffer(b, (m, n), "float32") - with T.block([m, n], "") as [vi, vj]: - B[vi, vj] = A[vi, vj] + for i, j in T.grid(m, n): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] Then we can make it specialized with given shapes or buffers. @@ -129,8 +131,10 @@ def mem_copy_16_16(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") B = T.match_buffer(b, (16, 16), "float32") - with T.block([16, 16], "") as [vi, vj]: - B[vi, vj] = A[vi, vj] + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] Returns ------- diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 09a52d2e7037..786982cf704c 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -397,7 +397,8 @@ def before_fuse(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do fuse: @@ -419,9 +420,9 @@ def after_fuse(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) # the 2 loops are fused into 1 for i_j_fused in T.serial(0, 16384): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, tir.floordiv(i_j_fused, 128)) - T.bind(vj, T.floormod(i_j_fused, 128)) + with T.block("B"): + vi = T.axis.S(128, 
T.floordiv(i_j_fused, 128)) + vj = T.axis.S(128, T.floormod(i_j_fused, 128)) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -468,7 +469,8 @@ def before_split(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B") as [vi, vj]: + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do split: @@ -490,9 +492,9 @@ def after_split(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) # the original loop is split into 2 loops for i0, i1, j in T.grid(2, 64, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, ((i0*64) + i1)) - T.bind(vj, j) + with T.block("B"): + vi = T.axis.S(128, i0 * 64 + i1) + vj = T.axis.S(128, j) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -529,7 +531,8 @@ def before_reorder(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do reorder: @@ -551,9 +554,8 @@ def after_reorder(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) # Here j and i are reordered for j, i in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -586,9 +588,8 @@ def before_parallel(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do parallel: @@ -609,9 +610,8 @@ def after_parallel(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) 
for i in T.parallel(0, 128): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -642,9 +642,8 @@ def before_vectorize(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do vectorize: @@ -665,9 +664,8 @@ def after_vectorize(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) for i in T.serial(0, 128): for j in T.vectorized(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -706,9 +704,8 @@ def before_bind(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do bind: @@ -730,9 +727,8 @@ def after_bind(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) for i in T.thread_binding(0, 128, thread = "blockIdx.x"): for j in T.thread_binding(0, 128, thread = "threadIdx.x"): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -758,9 +754,8 @@ def before_unroll(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) 
B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and do unroll: @@ -781,9 +776,8 @@ def after_unroll(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) for i in T.unroll(0, 128): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 """ @@ -825,7 +819,8 @@ def before_cache_read(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and cache_read: @@ -847,10 +842,12 @@ def after_cache_read(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) A_local = T.alloc_buffer((128, 128), scope="local") for i, j in T.grid(128, 128): - with T.block([128, 128], "A_local") as [vi, vj]: + with T.block("A_local"): + vi, vj = T.axis.remap("SS", [i, j]) A_local[vi, vj] = A[vi, vj] for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A_local[vi, vj] * 2.0 """ @@ -893,7 +890,8 @@ def before_cache_write(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 Create the schedule and cache_write: @@ -915,10 +913,12 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) B_local = T.alloc_buffer((128, 128), scope="local") for i, j in T.grid(128, 128): - with T.block([128, 128], "A_local") as [vi, vj]: + with T.block("A_local"): + vi, vj = T.axis.remap("SS", [i, j]) B_local[vi, vj] = A[vi, vj] * 2.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as 
[vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = B_local[vi, vj] """ @@ -974,10 +974,14 @@ def before_compute_at(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 Create the schedule and do compute-at: @@ -1000,14 +1004,12 @@ def after_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") for i in T.serial(0, 128): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for j in T.serial(0, 128): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 """ @@ -1061,10 +1063,14 @@ def before_reverse_compute_at(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 Create the schedule and do reverse-compute-at: @@ -1087,14 +1093,12 @@ def after_reverse_compute_at(a: 
T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") for i in T.serial(0, 128): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for j in T.serial(0, 128): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 """ @@ -1135,10 +1139,14 @@ def before_inline(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 Create the schedule and do compute-inline: @@ -1156,8 +1164,10 @@ def before_inline(a: T.handle, c: T.handle) -> None: def after_inline(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 """ _ffi_api.ScheduleComputeInline(self, block) # type: ignore # pylint: disable=no-member @@ -1195,10 +1205,14 @@ def before_inline(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = 
T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 Create the schedule and do reverse-compute-inline: @@ -1216,8 +1230,10 @@ def before_inline(a: T.handle, c: T.handle) -> None: def after_inline(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 """ _ffi_api.ScheduleReverseComputeInline(self, block) # type: ignore # pylint: disable=no-member @@ -1384,8 +1400,9 @@ def rfactor(self, loop: LoopRV, factor_axis: int) -> LoopRV: def before_rfactor(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128,)) - with T.block([128, T.reduce_axis(0, 128), - T.reduce_axis(0, 128)], "B") as [vii, vi, vj]: + for ii, i, j in T.grid(128, 128, 128): + with T.block("B"): + vii, vi, vj = T.axis.remap("SRR", [ii, i, j]) with T.init(): B[vii] = 0.0 B[vii] = B[vii] + A[vii, vi, vj] @@ -1408,14 +1425,18 @@ def after_rfactor(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128, 128]) B = T.match_buffer(b, [128]) B_rf = T.alloc_buffer([128, 128]) - with T.block([128, 128, T.reduce_axis(0, 128)], "B_rf") as [vi2, vii, vi]: - with T.init(): - B_rf[vi2, vii] = 0.0 - B_rf[vi2, vii] = (B_rf[vi2, vii] + A[vii, vi, vi2]) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vii_1, vi2_1]: - with T.init(): - B[vii_1] = 0.0 - B[vii_1] = (B[vii_1] + B_rf[vi2_1, vii_1]) + for i2, ii, i in T.grid(128, 128, 128): + with T.block("B_rf"): + vi2, vii, vi = T.axis.remap("SSR", [i2, ii, i]) + with T.init(): + B_rf[vi2, vii] = 0.0 + B_rf[vi2, vii] = (B_rf[vi2, vii] + A[vii, vi, vi2]) + for ii, i2 in T.grid(128, 128): + with T.block("B"): + vii, vi2 = T.axis.remap("SR", 
[ii, i2]) + with T.init(): + B[vii] = 0.0 + B[vii] = B[vii] + B_rf[vi2, vii] Note @@ -1483,10 +1504,14 @@ def before_storage_align(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 Create the schedule and do storage_align: @@ -1505,11 +1530,15 @@ def after_storage_align(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - T.block_attr({"buffer_dim_align": [[[0, 128, 1]]]}) - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + T.block_attr({"buffer_dim_align": [[[0, 128, 1]]]}) + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 After lowering passes, buffer B will have strides as [129, 1]. diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index f072f6b38a43..722810e9aa5b 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -628,7 +628,7 @@ def CompactBufferAllocation(): .. code-block:: python for i in range(0, 16): - with T.block([]): + with T.block(): B = T.alloc_buffer(16, 16) for j in range(0, 16): B[i, j] = A[i, j] + 1 @@ -643,7 +643,7 @@ def CompactBufferAllocation(): .. 
code-block:: python for i in range(0, 16): - with T.block([]): + with T.block(): B = T.alloc_buffer(1, 16) for j in range(0, 16): B[0, j] = A[i, j] + 1 @@ -715,3 +715,14 @@ def MergeDynamicSharedMemoryAllocations(): The result pass """ return _ffi_api.MergeDynamicSharedMemoryAllocations() # type: ignore + + +def ConvertForLoopsToSerial(): + """Convert Parallel For Loops to Serial For Loops. + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.ConvertForLoopsToSerial() # type: ignore diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index b3af36740551..0500eb55996c 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -505,15 +505,15 @@ def _callback(op): return s -@autotvm.register_topi_compute("conv2d_direct_simd.arm_cpu") -def conv2d_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype): - """Compute conv2d with SIMD (v7e-m).""" - return direct_simd.conv2d_direct_simd_compute( +@autotvm.register_topi_compute("conv2d_nhwc_direct_simd.arm_cpu") +def conv2d_nhwc_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nhwc with SIMD (v7e-m).""" + return direct_simd.conv2d_nhwc_direct_simd_compute( cfg, data, kernel, strides, padding, dilation, out_dtype ) -@autotvm.register_topi_schedule("conv2d_direct_simd.arm_cpu") -def schedule_conv2d_direct_simd(cfg, outs): - """Create schedule for conv2d_direct_simd""" - return direct_simd.conv2d_direct_simd_nhwc_schedule(cfg, outs) +@autotvm.register_topi_schedule("conv2d_nhwc_direct_simd.arm_cpu") +def schedule_conv2d_nhwc_direct_simd(cfg, outs): + """Create schedule for conv2d_nhwc_direct_simd""" + return direct_simd.conv2d_nhwc_direct_simd_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py index 307312076a7e..5ef9fd813eb2 100644 --- 
a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py @@ -30,7 +30,7 @@ ) -def conv2d_direct_simd(*args, **kwargs): +def conv2d_nhwc_direct_simd(*args, **kwargs): """Defines the Cortex-M7 SIMD implementation of conv2d.""" assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) @@ -39,17 +39,17 @@ def conv2d_direct_simd(*args, **kwargs): cfg = autotvm.get_config() args = [cfg] + args assert layout == "NHWC" - conv = conv2d_direct_simd_compute(*args) - sched = conv2d_direct_simd_nhwc_schedule(cfg, [data, kernel, conv]) + conv = conv2d_nhwc_direct_simd_compute(*args) + sched = conv2d_nhwc_direct_simd_schedule(cfg, [data, kernel, conv]) return sched, [data, kernel, conv] -conv2d_direct_simd.template_key = "direct_simd" -conv2d_direct_simd.default_data_layout = "NHWC" -conv2d_direct_simd.default_kernel_layout = "HWOI" +conv2d_nhwc_direct_simd.template_key = "direct_simd" +conv2d_nhwc_direct_simd.default_data_layout = "NHWC" +conv2d_nhwc_direct_simd.default_kernel_layout = "HWOI" -def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): +def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): """Compute function for Cortex-M7 SIMD implementation of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 @@ -146,7 +146,7 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, ou return conv -def conv2d_direct_simd_nhwc_schedule(cfg, outs): +def conv2d_nhwc_direct_simd_schedule(cfg, outs): """Schedule function for Cortex-M7 SIMD implementation of conv2d.""" sched = te.create_schedule([x.op for x in outs]) diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index 94db659e25c9..29153037b9fa 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -535,6 
+535,40 @@ void SumExprNode::AddToSelf(const SumExpr& other, int64_t scale) { this->AddToSelf(other->base * scale); } +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + auto factor_str = [](int64_t f) { + return f == SplitExprNode::kPosInf ? std::string("+inf") : std::to_string(f); + }; + p->stream << "split("; + p->Print(op->index); + p->stream << ", lower=" << factor_str(op->lower_factor) + << ", upper=" << factor_str(op->upper_factor) << ", scale=" << op->scale + << ", div_mode="; + switch (op->div_mode) { + // No "default", so that the compiler will emit a warning if more div modes are + // added that are not covered by the switch. + case kTruncDiv: + p->stream << "truncdiv"; + break; + case kFloorDiv: + p->stream << "floordiv"; + break; + } + p->stream << ')'; + }); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "sum(base=" << op->base; + for (const SplitExpr& s : op->args) { + p->stream << ", "; + p->Print(s); + } + }); + // Sub-class RewriteSimplifier::Impl to take benefit of // rewriter for condition simplification etc. 
class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index be78bc4aa9f9..aaf7d48b10c5 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,6 @@ #include "search_policy/utils.h" #include "utils.h" -namespace tvm { -// import the function from driver_api.cc -void GetBinds(const Array& args, bool compact, - const std::unordered_map& binds, - Map* out_binds, Array* out_arg_list); -} // namespace tvm - namespace tvm { namespace auto_scheduler { @@ -1268,35 +1262,25 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i Array tensors; std::tie(sch, tensors) = task->compute_dag.ApplySteps(state->transform_steps); + + // When inlining, replace const matrices with const values. + // Produces wrong IR, but good enough for feature extraction, and + // can improve the speed of feature extraction/search. Must be + // called before ScheduleToModule to have an effect. 
sch = sch.normalize_for_feature_extraction(); - auto bounds = te::InferBound(sch); try { - auto stmt = te::ScheduleOps(sch, bounds, false); - Map out_binds; - Array out_arg_list; - bool compact = te::VerifyCompactBuffer(stmt); const std::string& name = "main"; - GlobalVar global_var(name); - - // Copied from driver_api.cc::lower auto pass_ctx = tvm::transform::PassContext::Current(); - GetBinds(tensors, compact, std::unordered_map(), &out_binds, - &out_arg_list); - tir::PrimFunc f = te::SchedulePostProcToPrimFunc(out_arg_list, std::move(stmt), out_binds); - f = WithAttr(std::move(f), "global_symbol", runtime::String(name)); - bool noalias = pass_ctx->GetConfig("tir.noalias", Bool(true)).value(); + auto mod = ScheduleToModule(sch, Array{tensors.begin(), tensors.end()}, name, + std::unordered_map()); + bool disable_vectorize = pass_ctx->GetConfig("tir.disable_vectorize", Bool(false)).value(); bool instrument_bound_checkers = pass_ctx->GetConfig("tir.instrument_bound_checkers", Bool(false)).value(); - if (noalias) { - f = WithAttr(std::move(f), "tir.noalias", Bool(true)); - } - auto mod = IRModule(Map({{global_var, f}})); - if (IsGPUTask(task)) { auto pass_list = Array(); // Phase 0 @@ -1323,9 +1307,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i const auto& optimize = tir::transform::Sequential(Array{tir::transform::Simplify()}); mod = optimize(std::move(mod)); - const auto& it = mod->functions.find(global_var); - ICHECK(it != mod->functions.end()); - const auto& prim_func = (*it).second.as(); + PrimFunc prim_func = Downcast(mod->Lookup(name)); GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs, feature); } catch (Error& e) { diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index bfea3e7b67c0..24cae798988e 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -42,17 +42,27 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.detect_global_barrier", Bool); 
TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_bound_checkers", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_assert", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_vectorize", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); using runtime::PackedFunc; using runtime::TVMArgs; using runtime::TVMRetValue; +using tvm::Array; +using tvm::transform::Pass; bool LLVMEnabled() { const runtime::PackedFunc* pf = runtime::Registry::Get("target.build.llvm"); return pf != nullptr; } +bool ShouldAnnotateEntryFunc(const Target target, const IRModule mod) { + const bool aot_executor = (target->GetAttr("executor").value_or("") == "aot"); + const bool single_entry_func = (mod->functions.size() == 1); + return single_entry_func && !aot_executor; +} + /*! \return The default host target for a given device target */ Target DefaultTargetHost(Target target) { if (target.defined() && target->kind->device_type == kDLCPU) { @@ -155,6 +165,13 @@ transform::Pass BindTarget(Target target) { return tir::transform::CreatePrimFuncPass(fpass, 0, "BindTarget", {}); } +static transform::Pass AnnotateEntryFunc(bool b) { + auto fpass = [b](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { + return WithAttr(std::move(f), tir::attr::kIsEntryFunc, Bool(true)); + }; + return tir::transform::CreatePrimFuncPass(fpass, 0, "AnnotateEntryFunc", {}); +} + template transform::Pass Filter(FCond fcond) { auto fpass = [fcond](tir::PrimFunc f, IRModule m, transform::PassContext ctx) { @@ -184,7 +201,7 @@ Array CreatePassList(bool disable_loop_partition) { Array user_lower_phase2 = Array(); Array user_lower_phase3 = Array(); - // phase pasees is of the form + // phase passes is of the form // [[phase_number, pass], [phase_number, pass]... 
] for (Array phase_pass : add_lower_pass) { const IntImmNode* phase_num = phase_pass[0].as(); @@ -266,24 +283,29 @@ IRModule LowerWithPassList(IRModule mod, Array pass_list) return mod; } +IRModule ApplyPasses(IRModule mod, transform::Sequential seq) { + mod = seq(std::move(mod)); + return mod; +} + +// Convert te schedule to IRModule IRModule ScheduleToModule(te::Schedule sch, const Array& args, const std::string& name, const std::unordered_map& binds) { - // Convert te schedule to IRModule - Array out_arg_list; - transform::PassContext pass_ctx = transform::PassContext::Current(); - sch = sch.normalize(); + transform::PassContext pass_ctx = transform::PassContext::Current(); + bool debug_keep_trivial_loop = + pass_ctx->GetConfig("tir.debug_keep_trivial_loop", Bool(false)).value(); + // Before TIR transformation. - Map bounds = te::InferBound(sch); - tir::Stmt stmt = te::ScheduleOps(sch, std::move(bounds), false); + tir::Stmt stmt = te::ScheduleOps(sch, te::InferBound(sch), debug_keep_trivial_loop); bool compact = te::VerifyCompactBuffer(stmt); Map out_binds; + Array out_arg_list; GetBinds(args, compact, binds, &out_binds, &out_arg_list); - // Build the function - // At this point binds is only te::Tensors + // Build the function, converting from te::Tensor to tir::Buffer tir::PrimFunc f = te::SchedulePostProcToPrimFunc(out_arg_list, std::move(stmt), out_binds); f = WithAttr(std::move(f), "global_symbol", runtime::String(name)); @@ -304,7 +326,7 @@ TVM_REGISTER_GLOBAL("driver.schedule_to_module") const Map& binds) { std::unordered_map c_binds; // Check to make sure binds is not null before doing the conversion; - if (binds.get() != nullptr) { + if (binds.defined()) { for (auto kv : binds) { c_binds.insert({kv.first, kv.second}); } @@ -373,88 +395,97 @@ TVM_REGISTER_GLOBAL("driver.lower_schedule") return LowerSchedule(std::move(sch), args, name, c_binds, simple_mode); }); -std::pair SplitDevHostFuncs(IRModule mod_mixed, const Target& target_arg, - const Target& 
target_host_arg, - const transform::PassContext& pass_ctx) { +/** + * This function takes the input module that contains both the device and host opts. + * Then, it applies transformation on the original module before splitting into separate modules for + * device and host. Then it also applies transformations on the new splitted modules. + */ +std::pair SplitMixedModule(IRModule mod_mixed, const Target& target_arg, + const Target& target_host_arg) { Target target = target_arg, target_host = target_host_arg; CheckAndUpdateHostConsistency(&target, &target_host); - Array mixed_pass_list = {BindTarget(target), - tir::transform::VerifyMemory()}; - mixed_pass_list.push_back(tir::transform::MergeDynamicSharedMemoryAllocations()); - if (pass_ctx->GetConfig("tir.detect_global_barrier", Bool(false)).value()) { - mixed_pass_list.push_back(tir::transform::ThreadSync("global")); - } - mixed_pass_list.push_back(tir::transform::ThreadSync("shared")); - mixed_pass_list.push_back(tir::transform::ThreadSync("warp")); - mixed_pass_list.push_back(tir::transform::InferFragment()); - mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); + ICHECK(mod_mixed.defined()) << "This module must be defined"; - if (target->GetAttr("unpacked-api").value_or(Bool(false))) { - mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); - } else { - mixed_pass_list.push_back(tir::transform::MakePackedAPI(-1)); - } + mod_mixed = ApplyPasses(mod_mixed, MixedModulePassManager(mod_mixed, target)); - mixed_pass_list.push_back(tir::transform::SplitHostDevice()); + IRModule host_mod = ApplyPasses(mod_mixed, HostModulePassManager(mod_mixed, target_host)); - auto opt_mixed = transform::Sequential(mixed_pass_list); - mod_mixed = opt_mixed(std::move(mod_mixed)); - - auto host_pass_list = { - Filter([](const tir::PrimFunc& f) { - return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) != - CallingConv::kDeviceKernelLaunch; - }), - BindTarget(target_host), - 
tir::transform::LowerTVMBuiltin(), - tir::transform::LowerCustomDatatypes(), - tir::transform::LowerIntrin(), - tir::transform::LowerDeviceStorageAccessInfo(), - tir::transform::CombineContextCall(), - }; - auto opt_host = transform::Sequential(host_pass_list); - ICHECK(mod_mixed.defined()) << "This module must be defined"; - auto mhost = opt_host(mod_mixed); - - // device pipeline - auto device_pass_list = { - Filter([](const tir::PrimFunc& f) { - return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == - CallingConv::kDeviceKernelLaunch; - }), - BindTarget(target), - tir::transform::LowerWarpMemory(), - tir::transform::Simplify(), - tir::transform::LowerCustomDatatypes(), - tir::transform::LowerIntrin(), - tir::transform::LowerDeviceStorageAccessInfo(), - }; - auto opt_device = transform::Sequential(device_pass_list); - auto mdevice = opt_device(mod_mixed); + IRModule device_mod = ApplyPasses(mod_mixed, DeviceModulePassManager(mod_mixed, target)); - // some final misc checks. auto keys = target->GetKeys(); + + CheckAndUpdateHostConsistency(&target, &target_host); + bool target_is_gpu = std::find(keys.begin(), keys.end(), "gpu") != keys.end(); - if (target_is_gpu && mdevice->functions.size() == 0) { - LOG(WARNING) << "Specified target " << target->str() - << " but cannot find device code. Did you forget to bind?"; + if (target_is_gpu && device_mod->functions.size() == 0) { + DLOG(WARNING) << "Specified target " << target->str() + << " but cannot find device code. Did you forget to bind?"; } - if (target->kind->device_type == kDLCPU && target_host == target) { - // TODO(@jroesch): This check is no longer true we need to figure out if we care about this. - // We need to relax this check for just TIR functions. - // ICHECK(mdevice->functions.empty()) << "No device code should be generated when target " - // << "and host_target are both llvm target." 
- // << "\n"; + return {host_mod, device_mod}; +} + +runtime::Module PreProcessModuleForBuild(const Map& inputs_arg, + const Target& host_target) { + std::vector device_modules; + Map inputs = inputs_arg; + Target target_host = host_target; + + CheckAndUpdateHostConsistency(&inputs, &target_host); + + if (!target_host.defined()) { + for (const auto& it : inputs) { + if (it.first->kind->device_type == kDLCPU || it.first->kind->device_type == kDLMicroDev) { + target_host = it.first; + break; + } + } + } + + if (!target_host.defined()) { + target_host = DefaultTargetHost(target_host); } - return {mhost, mdevice}; + // Update target host for all targets + CheckAndUpdateHostConsistency(&inputs, &target_host); + + IRModule mhost_all = IRModule(Map()); + + ICHECK(mhost_all.defined()) << "The host module must be defined"; + + for (const auto& it : inputs) { + if (it.second.defined()) { + auto pair = SplitMixedModule(it.second, it.first, target_host); + auto& host_mod = pair.first; + auto& device_mod = pair.second; + + ICHECK(host_mod.defined()) << "The split host module must be defined"; + + ICHECK(mhost_all.defined()) << "The host module must be defined"; + + mhost_all->Update(host_mod); + + if (device_mod->functions.size() != 0) { + device_modules.push_back(codegen::Build(device_mod, it.first)); + } + } + } + + runtime::Module complete_mod = codegen::Build(mhost_all, target_host); + for (const auto& it : device_modules) { + if (it.operator->()) { + complete_mod.Import(it); + } + } + return complete_mod; } -// Can we make this take one annotated IRModule? -// -// Build for heterogeneous execution. 
+TVM_REGISTER_GLOBAL("driver.preprocess_module") + .set_body_typed([](const Map& inputs_arg, Target host_target) { + return PreProcessModuleForBuild(inputs_arg, host_target); + }); + runtime::Module build(const Map& inputs_arg, const Target& target_host_arg) { auto pass_ctx = transform::PassContext::Current(); @@ -487,29 +518,41 @@ runtime::Module build(const Map& inputs_arg, const Target& tar for (const auto& it : inputs) { if (it.second.defined()) { - auto pair = SplitDevHostFuncs(it.second, it.first, target_host, pass_ctx); - auto& mhost = pair.first; - auto& mdevice = pair.second; + const Target& target = it.first; + const IRModule& ir_module = it.second; + auto pair = SplitMixedModule(ir_module, target, target_host); + auto& host_mod = pair.first; + auto& device_mod = pair.second; - ICHECK(mhost.defined()) << "The split host module must be defined"; + ICHECK(host_mod.defined()) << "The split host module must be defined"; ICHECK(mhost_all.defined()) << "The host module must be defined"; - mhost_all->Update(mhost); + // We don't want library modules going back into host codegen + // unless they're supposed to. Here if we overrode the target host + // to allow lowering previously we check that it's meant to be placed + // back into the host Module. 
+ bool overrides_host_target = target->kind->device_type == target_host->kind->device_type; + bool non_host_target_kind = target->kind != target_host->kind; + if (overrides_host_target && non_host_target_kind) { + device_modules.push_back(codegen::Build(host_mod, it.first)); + } else { + mhost_all->Update(host_mod); + } - if (mdevice->functions.size() != 0) { - device_modules.push_back(codegen::Build(mdevice, it.first)); + if (device_mod->functions.size() != 0) { + device_modules.push_back(codegen::Build(device_mod, it.first)); } } } runtime::Module mhost = codegen::Build(mhost_all, target_host); - // Import all modules for (const auto& it : device_modules) { if (it.operator->()) { mhost.Import(it); } } + return mhost; } @@ -534,8 +577,97 @@ runtime::Module build(const IRModule& funcs, const Target& target_arg, const Target& target_host_arg) { auto target = target_arg, target_host = target_host_arg; CheckAndUpdateHostConsistency(&target, &target_host); + // More maps of target and target host Map inputs = {{target, funcs}}; return build(inputs, target_host); } +transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) { + transform::PassContext pass_ctx = transform::PassContext::Current(); + + Array mixed_pass_list; + + mixed_pass_list.push_back(BindTarget(target)); + + mixed_pass_list.push_back(tir::transform::VerifyMemory()); + mixed_pass_list.push_back(tir::transform::MergeDynamicSharedMemoryAllocations()); + + if (ShouldAnnotateEntryFunc(target, mixed_mod)) { + mixed_pass_list.push_back(AnnotateEntryFunc(true)); + } + + bool detect_global_barrier = + pass_ctx->GetConfig("tir.detect_global_barrier", Bool(false)).value(); + if (detect_global_barrier) { + mixed_pass_list.push_back(tir::transform::ThreadSync("global")); + } + + mixed_pass_list.push_back(tir::transform::ThreadSync("shared")); + mixed_pass_list.push_back(tir::transform::ThreadSync("warp")); + mixed_pass_list.push_back(tir::transform::InferFragment()); + 
mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); + + if (target->GetAttr("unpacked-api").value_or(Bool(false))) { + mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); + } else { + mixed_pass_list.push_back(tir::transform::MakePackedAPI(-1)); + } + mixed_pass_list.push_back(tir::transform::SplitHostDevice()); + + return transform::Sequential(mixed_pass_list); +} + +TVM_REGISTER_GLOBAL("driver.mixed_mod_passes") + .set_body_typed([](IRModule mixed_mod, Target target) { + return MixedModulePassManager(mixed_mod, target); + }); + +transform::Sequential HostModulePassManager(IRModule mixed_mod, Target target_host) { + Array host_pass_list; + host_pass_list.push_back(Filter([](const tir::PrimFunc& f) { + return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) != + CallingConv::kDeviceKernelLaunch; + })); + + ICHECK(mixed_mod.defined()) << "This module must be defined"; + + host_pass_list.push_back(BindTarget(target_host)); + + host_pass_list.push_back(tir::transform::LowerTVMBuiltin()); + host_pass_list.push_back(tir::transform::LowerCustomDatatypes()); + host_pass_list.push_back(tir::transform::LowerIntrin()); + host_pass_list.push_back(tir::transform::LowerDeviceStorageAccessInfo()); + host_pass_list.push_back(tir::transform::CombineContextCall()); + + return transform::Sequential(host_pass_list); +} + +TVM_REGISTER_GLOBAL("driver.host_mod_passes") + .set_body_typed([](IRModule mixed_mod, Target target_host) { + return HostModulePassManager(mixed_mod, target_host); + }); + +transform::Sequential DeviceModulePassManager(IRModule mixed_mod, Target target) { + Array device_pass_list; + device_pass_list.push_back(Filter([](const tir::PrimFunc& f) { + return f->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == + CallingConv::kDeviceKernelLaunch; + })); + + device_pass_list.push_back(BindTarget(target)); + + device_pass_list.push_back(tir::transform::LowerWarpMemory()); + 
device_pass_list.push_back(tir::transform::Simplify()); + device_pass_list.push_back(tir::transform::LowerCustomDatatypes()); + device_pass_list.push_back(tir::transform::LowerDeviceStorageAccessInfo()); + device_pass_list.push_back(tir::transform::LowerIntrin()); + + return transform::Sequential(device_pass_list); +} + +TVM_REGISTER_GLOBAL("driver.device_mod_passes") + .set_body_typed([](IRModule mixed_mod, Target target_host) { + return DeviceModulePassManager(mixed_mod, target_host); + }); + } // namespace tvm diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 3514f3228e27..a2178167b2e3 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -276,6 +276,8 @@ class TIRTextPrinter : public StmtFunctor, std::unordered_map memo_var_; /*! \brief Map from Buffer to Doc */ std::unordered_map memo_buf_; + /*! \brief Map from Buffer to Doc */ + std::unordered_map memo_producer_; /*! \brief name allocation map */ std::unordered_map name_alloc_map_; @@ -321,7 +323,9 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitStmt_(const AssertStmtNode* op) override; Doc VisitStmt_(const StoreNode* op) override; Doc VisitStmt_(const BufferStoreNode* op) override; + Doc VisitStmt_(const ProducerStoreNode* op) override; Doc VisitStmt_(const BufferRealizeNode* op) override; + Doc VisitStmt_(const ProducerRealizeNode* op) override; Doc VisitStmt_(const AllocateNode* op) override; Doc VisitStmt_(const IfThenElseNode* op) override; Doc VisitStmt_(const SeqStmtNode* op) override; @@ -342,7 +346,9 @@ class TIRTextPrinter : public StmtFunctor, Doc PrintIterVar(const IterVarNode* op); Doc PrintRange(const RangeNode* op); Doc PrintBuffer(const BufferNode* op); + Doc PrintProducer(const DataProducerNode* op); Doc BufferNode2Doc(const BufferNode* op, Doc doc); + Doc DataProducerNode2Doc(const DataProducerNode* op, Doc doc); Doc PrintString(const StringObj* op) { return Doc::StrLiteral(op->data); } Doc PrintBufferRegion(const BufferRegionNode* op); 
@@ -361,6 +367,7 @@ class TIRTextPrinter : public StmtFunctor, Doc GetUniqueName(std::string prefix); Doc AllocVar(const Var& var); Doc AllocBuf(const Buffer& buffer); + Doc AllocProducer(const DataProducer& buffer); /*! * \brief special method to render vectors of docs with a separator * \param vec vector of docs diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index fa132f079793..302c4491cebe 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -65,6 +65,8 @@ Doc TIRTextPrinter::Print(const ObjectRef& node) { return PrintRange(node.as()); } else if (node->IsInstance()) { return PrintBuffer(node.as()); + } else if (node->IsInstance()) { + return PrintProducer(node.as()); } else if (node->IsInstance()) { return PrintString(node.as()); } else if (node->IsInstance()) { @@ -199,6 +201,19 @@ Doc TIRTextPrinter::PrintBuffer(const BufferNode* op) { } } +Doc TIRTextPrinter::PrintProducer(const DataProducerNode* op) { + const DataProducer& prod = GetRef(op); + + if (meta_->InMeta(prod)) { + return meta_->GetMetaNode(prod); + } else if (memo_producer_.count(prod)) { + return memo_producer_[prod]; + } else { + memo_producer_[prod] = AllocProducer(prod); + return DataProducerNode2Doc(op, memo_producer_[prod]); + } +} + Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { doc << Doc::Text(": Buffer(") << Print(buf->data) << ", " << PrintDType(buf->dtype) << ", " << Print(buf->shape) << ", " << Print(buf->strides); @@ -220,6 +235,11 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { return doc << ")"; } +Doc TIRTextPrinter::DataProducerNode2Doc(const DataProducerNode* prod, Doc doc) { + return doc << Doc::Text(": DataProducer(") << Print(prod->GetNameHint()) << ", " + << PrintDType(prod->GetDataType()) << ", " << Print(prod->GetShape()) << ")"; +} + Doc TIRTextPrinter::PrintBufferRegion(const BufferRegionNode* op) { Doc doc; doc << Print(op->buffer) << "["; @@ -439,6 +459,12 @@ 
Doc TIRTextPrinter::VisitStmt_(const BufferStoreNode* op) { return doc; } +Doc TIRTextPrinter::VisitStmt_(const ProducerStoreNode* op) { + Doc doc; + doc << Print(op->producer) << Print(op->indices) << " = " << Print(op->value); + return doc; +} + Doc TIRTextPrinter::VisitStmt_(const BufferRealizeNode* op) { Doc doc; doc << "realize(" << Print(op->buffer) << ", " << Print(op->bounds) << ", " @@ -446,6 +472,13 @@ Doc TIRTextPrinter::VisitStmt_(const BufferRealizeNode* op) { return doc; } +Doc TIRTextPrinter::VisitStmt_(const ProducerRealizeNode* op) { + Doc doc; + doc << "producer_realize(" << Print(op->producer) << ", " << Print(op->bounds) << ", " + << Print(op->condition) << ", " << PrintBody(op->body) << ")"; + return doc; +} + Doc TIRTextPrinter::VisitStmt_(const AllocateNode* op) { Doc doc; auto scope = GetPtrStorageScope(op->buffer_var); @@ -709,6 +742,20 @@ Doc TIRTextPrinter::AllocBuf(const Buffer& buffer) { return val; } +Doc TIRTextPrinter::AllocProducer(const DataProducer& producer) { + const auto& it = memo_producer_.find(producer); + if (it != memo_producer_.end()) { + return it->second; + } + std::string name = producer->GetNameHint(); + if (name.length() == 0 || !std::isalpha(name[0])) { + name = "tensor_" + name; + } + Doc val = GetUniqueName(name); + memo_producer_[producer] = val; + return val; +} + Doc TIRTextPrinter::PrintSep(const std::vector& vec, const Doc& sep) { Doc seq; if (vec.size() != 0) { diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index fa74e56f491c..13e4cfcd30ba 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -22,10 +22,10 @@ * \brief Printer class to print Tensor IR to python syntax script */ -#include #include #include #include +#include #include #include #include @@ -128,7 +128,17 @@ class TVMScriptPrinter : public StmtFunctor, /*! \brief the number of current node */ int current_num_; /*! 
\brief loop stack without annotations */ - std::vector loop_stack_; + std::vector simple_loop_stack_; + /*! \brief the maps from loop_vars to the loops */ + std::unordered_map loop_var_map_; + /*! + * \brief simple block vars remap from loop vars + * simple_remap requires: + * 1. block var iter type is kDataPar or kCommReduce + * 2. value is a single Var, which is a loop_var outside the block + * 3. The iter range is equal to loop range + */ + std::vector> block_var_remaps_; Doc VisitExpr_(const CastNode* op, ExprPrecedence* out_precedence) override; Doc VisitExpr_(const VarNode* op, ExprPrecedence* out_precedence) override; @@ -193,7 +203,9 @@ class TVMScriptPrinter : public StmtFunctor, Doc PrintArray(const ArrayNode* op); Doc PrintBuffer(const BufferNode* op); Doc AllocBufferDeclaration(const Buffer& buf); - Doc PrintBlockVar(const BlockNode* op); + Doc PrintBlockVar(const IterVar& iter_var, const PrimExpr& value); + Doc PrintBlockVarRemaps(); + Doc PrintBlockVars(const BlockRealizeNode* op); Doc PrintBlockAttr(const BlockRealizeNode* op); Doc PrintBlockBody(const BlockNode* op); Doc PrintBufferRegion(const BufferRegionNode* op); @@ -821,21 +833,23 @@ Doc TVMScriptPrinter::VisitStmt_(const EvaluateNode* op) { Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { Doc doc; var_not_in_headers_.insert(op->loop_var.get()); + loop_var_map_[op->loop_var.get()] = GetRef(op); const auto* body = op->body.as(); bool simple_loop = op->kind == ForKind::kSerial && op->annotations.empty() && is_zero(op->min); - if (simple_loop) loop_stack_.push_back(GetRef(op)); + if (simple_loop) simple_loop_stack_.push_back(GetRef(op)); // It is a loop that can be compressed, let the loops below print it out if (simple_loop && body != nullptr) { Doc result = Print(GetRef(body)); TryDeallocVar(op->loop_var); + loop_var_map_.erase(op->loop_var.get()); return result; } // It is a loop that can not be compressed - bool print_above = !loop_stack_.empty(); + bool print_above = 
!simple_loop_stack_.empty(); // print loops above if needed if (print_above) { doc << PrintLoopStack(); - loop_stack_.clear(); + simple_loop_stack_.clear(); } if (!simple_loop) { // print current loop if needed @@ -847,6 +861,7 @@ Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); } TryDeallocVar(op->loop_var); + loop_var_map_.erase(op->loop_var.get()); return doc; } @@ -901,52 +916,99 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferStoreNode* op) { return doc; } -Doc TVMScriptPrinter::PrintBlockVar(const BlockNode* op) { +Doc TVMScriptPrinter::PrintBlockVar(const IterVar& iter_var, const PrimExpr& value) { Doc doc; - doc << "with " << tir_prefix_ << ".block(["; - std::vector block_var_docs; - for (const auto& iter_var : op->iter_vars) { - Doc block_var_doc; - if (is_zero(iter_var->dom->min) && iter_var->iter_type == kDataPar) { - block_var_doc << Print(iter_var->dom->extent); + doc << Print(iter_var->var) << " = " << tir_prefix_ << ".axis."; + switch (iter_var->iter_type) { + case kDataPar: + doc << "spatial"; + break; + case kCommReduce: + doc << "reduce"; + break; + case kOrdered: + doc << "scan"; + break; + case kOpaque: + doc << "opaque"; + break; + default: + LOG(FATAL) << "Unknown block var iter type: " << iter_var->iter_type; + break; + } + doc << "("; + const Range& dom = iter_var->dom; + if (is_zero(dom->min)) { + doc << Print(dom->extent); + } else { + doc << "(" << Print(dom->min) << ", " << Print(dom->min + dom->extent) << ")"; + } + doc << ", " << Print(value) << ")"; + return doc; +} + +Doc TVMScriptPrinter::PrintBlockVarRemaps() { + ICHECK(!block_var_remaps_.empty()); + if (block_var_remaps_.size() == 1) { + const IterVar& iter_var = block_var_remaps_[0].first; + const PrimExpr& value = block_var_remaps_[0].second; + return PrintBlockVar(iter_var, value); + } + Doc doc; + std::vector iter_vars, iter_values; + std::string iter_type; + for (const auto& pair : block_var_remaps_) { + const 
IterVar& iter_var = pair.first; + const PrimExpr& value = pair.second; + iter_vars.push_back(Print(iter_var->var)); + iter_values.push_back(Print(value)); + if (iter_var->iter_type == kDataPar) { + iter_type += "S"; + } else if (iter_var->iter_type == kCommReduce) { + iter_type += "R"; } else { - block_var_doc << tir_prefix_ << "."; - switch (iter_var->iter_type) { - case kDataPar: - block_var_doc << "range"; - break; - case kCommReduce: - block_var_doc << "reduce_axis"; - break; - case kOrdered: - block_var_doc << "scan_axis"; - break; - case kOpaque: - block_var_doc << "opaque_axis"; - break; - default: - LOG(FATAL) << "Unknown block var iter type: " << iter_var->iter_type; - break; - } - block_var_doc << "(" << Print(iter_var->dom->min) << ", " - << Print(iter_var->dom->min + iter_var->dom->extent) << ")"; + ICHECK(false); } - block_var_docs.push_back(block_var_doc); - } - doc << PrintSep(block_var_docs, Doc::Text(", ")) << "]"; - if (!op->name_hint.empty()) { - doc << ", " << Doc::StrLiteral(op->name_hint); } - doc << ")"; - std::vector block_var_names; - for (const auto& iter_var : op->iter_vars) { + doc << PrintSep(iter_vars, Doc::Text(", ")) << " = " << tir_prefix_ << ".axis.remap(" + << Doc::StrLiteral(iter_type) << ", [" << PrintSep(iter_values, Doc::Text(", ")) << "])"; + return doc; +} + +Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) { + Doc doc; + const auto* block_op = op->block.as(); + ICHECK_EQ(block_op->iter_vars.size(), op->iter_values.size()); + tir::ExprDeepEqual expr_equal; + + auto is_simple_remap = [this, &expr_equal](const IterVar& iter_var, + const PrimExpr& value) -> bool { + if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) return false; + if (!value->IsInstance()) return false; + const Var& var = Downcast(value); + auto it = loop_var_map_.find(var.get()); + return it != loop_var_map_.end() && expr_equal(it->second->min, iter_var->dom->min) && + expr_equal(it->second->extent, 
iter_var->dom->extent); + }; + + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) { + const IterVar& iter_var = block_op->iter_vars[i]; + const PrimExpr& value = op->iter_values[i]; var_not_in_headers_.insert(iter_var->var.get()); - block_var_names.push_back(Print(iter_var->var)); + if (is_simple_remap(iter_var, value)) { + block_var_remaps_.push_back(std::make_pair(iter_var, value)); + } else { + if (!block_var_remaps_.empty()) { + doc << Doc::NewLine() << PrintBlockVarRemaps(); + block_var_remaps_.clear(); + } + doc << Doc::NewLine() << PrintBlockVar(iter_var, value); + } } - if (!block_var_names.empty()) { - doc << " as [" << PrintSep(block_var_names, Doc::Text(", ")) << "]"; + if (!block_var_remaps_.empty()) { + doc << Doc::NewLine() << PrintBlockVarRemaps(); + block_var_remaps_.clear(); } - doc << ":"; return doc; } @@ -957,10 +1019,6 @@ Doc TVMScriptPrinter::PrintBlockAttr(const BlockRealizeNode* op) { if (!is_one(op->predicate)) { block_attr_doc << Doc::NewLine() << tir_prefix_ << ".where(" << Print(op->predicate) << ")"; } - for (size_t i = 0; i < block_op->iter_vars.size(); ++i) - block_attr_doc << Doc::NewLine() << tir_prefix_ << ".bind(" - << Print(block_op->iter_vars[i]->var) << ", " << Print(op->iter_values[i]) - << ")"; block_attr_doc << Doc::NewLine() << tir_prefix_ << ".reads(" << Print(block_op->reads) << ")"; block_attr_doc << Doc::NewLine() << tir_prefix_ << ".writes(" << Print(block_op->writes) << ")"; if (!block_op->annotations.empty()) { @@ -994,12 +1052,18 @@ Doc TVMScriptPrinter::PrintBlockBody(const BlockNode* op) { Doc TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { const auto* block_op = op->block.as(); // print block name and block vars - Doc doc = PrintBlockVar(block_op); + Doc doc; + doc << "with " << tir_prefix_ << ".block("; + if (!block_op->name_hint.empty()) { + doc << Doc::StrLiteral(block_op->name_hint); + } + doc << "):"; + Doc block_var = PrintBlockVars(op); // print predicate, binding, read/write tensor 
region, annotations Doc block_attr_doc = PrintBlockAttr(op); // print body Doc body = PrintBlockBody(block_op); - doc << Doc::Indent(4, block_attr_doc << Doc::NewLine() << body); + doc << Doc::Indent(4, block_var << block_attr_doc << Doc::NewLine() << body); for (const auto& iter_var : block_op->iter_vars) { TryDeallocVar(iter_var->var); } @@ -1265,11 +1329,11 @@ Doc TVMScriptPrinter::PrintLoop(const For& loop) { Doc TVMScriptPrinter::PrintLoopStack() { Doc res; - if (loop_stack_.size() == 1) { - res << PrintLoop(loop_stack_[0]); - } else if (loop_stack_.size() > 1) { + if (simple_loop_stack_.size() == 1) { + res << PrintLoop(simple_loop_stack_[0]); + } else if (simple_loop_stack_.size() > 1) { std::vector vars, extents; - for (const auto& loop : loop_stack_) { + for (const auto& loop : simple_loop_stack_) { vars.push_back(Print(loop->loop_var)); extents.push_back(Print(loop->extent)); } diff --git a/src/relay/backend/contrib/ethosu/to_te_graph.cc b/src/relay/backend/contrib/ethosu/to_te_graph.cc deleted file mode 100644 index 9646c39da089..000000000000 --- a/src/relay/backend/contrib/ethosu/to_te_graph.cc +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file relay/backend/contrib/ethosu/to_te_graph.cc - * \brief Lower a Relay function to a TE graph. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "../../compile_engine.h" -#include "../../utils.h" - -namespace tvm { -namespace relay { -namespace contrib { -namespace ethosu { - -/*! \brief Node container to represent a Tensor Expression graph. */ -class TEGraphNode : public Object { - public: - /* \brief The inputs to the graph */ - tvm::Array inputs; - /* \brief The outputs to the graph */ - tvm::Array outputs; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("inputs", &inputs); - v->Visit("outputs", &outputs); - } - - static constexpr const char* _type_key = "relay.TEGraph"; - TVM_DECLARE_FINAL_OBJECT_INFO(TEGraphNode, Object); -}; - -class TEGraph : public ObjectRef { - public: - TVM_DEFINE_OBJECT_REF_METHODS(TEGraph, ObjectRef, TEGraphNode); -}; - -TVM_REGISTER_NODE_TYPE(TEGraphNode); - -Array GetShape(const Array& shape) { - // for now, we always use int32 shape when possible - // even if the result of shape inference becomes int64. 
- Array res; - for (IndexExpr val : shape) { - const int64_t* pval = tir::as_const_int(val); - if (pval != nullptr) { -#ifndef TVM_INDEX_DEFAULT_I64 - ICHECK_LE(pval[0], std::numeric_limits::max()); - ICHECK_GE(pval[0], std::numeric_limits::min()); - res.push_back(IntImm(DataType::Int(32), *pval)); -#else - res.push_back(val); -#endif // TVM_INDEX_DEFAULT_I64 - } else if (val->IsInstance()) { - res.push_back(val.as()->ToVar()); - } else { - res.push_back(val); - } - } - return res; -} - -class RelayToTE : public backend::MemoizedExprTranslator> { - public: - RelayToTE() = default; - - TEGraph Lower(const Function& prim_func) { - auto graph_node = make_object(); - for (Var param : prim_func->params) { - Array inputs; - if (const auto* ttype = param->checked_type().as()) { - tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); - graph_node->inputs.push_back(tensor); - inputs.push_back(tensor); - } else { - // flatten tuple of tensor type. - const auto* tuple_type = param->type_as(); - for (Type field : tuple_type->fields) { - const auto* ttype = field.as(); - ICHECK(ttype != nullptr); - tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); - graph_node->inputs.push_back(tensor); - inputs.push_back(tensor); - } - } - memo_[param] = inputs; - } - graph_node->outputs = this->VisitExpr(prim_func->body); - return TEGraph(graph_node); - } - - Array VisitExpr_(const VarNode* op) final { - LOG(FATAL) << "Free variable " << op->name_hint(); - return {}; - } - - Array VisitExpr_(const ConstantNode* op) final { - using tir::make_const; - ICHECK(op->is_scalar()); - void* data = op->data->data; - DataType dtype = DataType(op->data->dtype); - auto value = te::compute( - {}, - [&](const Array&) { - if (dtype == DataType::Int(32)) { - return make_const(dtype, static_cast(data)[0]); - } else if (dtype == DataType::Int(64)) { - return make_const(dtype, static_cast(data)[0]); - } else if (dtype == DataType::Float(32)) { - 
return make_const(dtype, static_cast(data)[0]); - } else if (dtype == DataType::Float(64)) { - return make_const(dtype, static_cast(data)[0]); - } else if (dtype == DataType::Bool()) { - return make_const(dtype, static_cast(data)[0]); - } else { - LOG(FATAL) << "not handled"; - return tvm::PrimExpr(); - } - }, - "compile_engine_const", topi::kBroadcast); - return {value}; - } - - Array VisitExpr_(const CallNode* call_node) final { - static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); - ICHECK(flower_call) << "relay.backend.lower_call is not registered."; - - Array inputs; - int count_tuple = 0; - for (Expr arg : call_node->args) { - if (arg->checked_type().as()) { - ++count_tuple; - } - for (te::Tensor tensor : VisitExpr(arg)) { - inputs.push_back(tensor); - } - } - if (count_tuple) { - ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; - } - - ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; - Op op = Downcast(call_node->op); - - Array outputs; - LoweredOutput lowered_out = - (*flower_call)(GetRef(call_node), inputs, tvm::Target("llvm")); - outputs = lowered_out->outputs; - - if (outputs.size() != 1) { - const auto* tuple_type = call_node->checked_type().as(); - ICHECK(tuple_type) << "Expect output to be a tuple type"; - ICHECK_EQ(tuple_type->fields.size(), outputs.size()); - } - return outputs; - } - - Array VisitExpr_(const FunctionNode* op) final { - LOG(FATAL) << "Do not support sub function"; - return Array(); - } - - Array VisitExpr_(const LetNode* op) final { - Array val = VisitExpr(op->value); - ICHECK(!memo_.count(op->var)); - memo_[op->var] = val; - return VisitExpr(op->body); - } - - Array VisitExpr_(const TupleNode* op) final { - Array fields; - for (Expr field : op->fields) { - ICHECK(field->checked_type().as()) << "Only allow Tuple of Tensor"; - Array res = VisitExpr(field); - ICHECK_EQ(res.size(), 1); - fields.push_back(res[0]); - } - return 
fields; - } - - Array VisitExpr_(const TupleGetItemNode* op) final { - const auto* tuple_type = op->tuple->type_as(); - Array tuple = VisitExpr(op->tuple); - ICHECK_EQ(tuple_type->fields.size(), tuple.size()); - ICHECK_GE(op->index, 0); - ICHECK_LT(static_cast(op->index), tuple.size()); - return {tuple[op->index]}; - } -}; - -TVM_REGISTER_GLOBAL("relay.backend.contrib.ethosu.LowerToTE") - .set_body_typed([](Function prim_func) { return RelayToTE().Lower(prim_func); }); - -} // namespace ethosu -} // namespace contrib -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc index 6d332803041d..cae20210ec4f 100644 --- a/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc +++ b/src/relay/backend/contrib/example_target_hooks/relay_to_tir.cc @@ -33,7 +33,9 @@ namespace example_target_hooks { class ConvertAddToSubtract : public MixedModeMutator { public: explicit ConvertAddToSubtract(IRModule ir_module, Target host_target) - : ir_module_(ir_module), host_target_(host_target) {} + : ir_module_(ir_module), + host_target_(host_target), + custom_target_(Target("example_target_hook")) {} IRModule Mutate() { GlobalVar main_global_var = ir_module_->GetGlobalVar("main"); @@ -81,7 +83,15 @@ class ConvertAddToSubtract : public MixedModeMutator { tir::PrimFunc replacement_func = tir::PrimFunc({x_var, y_var, out_var}, math_loop, VoidType(), buffer_map, DictAttrs(dict_attrs)); - replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, host_target_); + + // Switch to TIRToRuntime hook for testing + Bool tir_to_runtime = func->GetAttr("tir_to_runtime").value_or(Bool(false)); + if (tir_to_runtime) { + replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, custom_target_); + } else { + replacement_func = WithAttr(replacement_func, ::tvm::attr::kTarget, host_target_); + } + ir_module_->Add(new_global_var, replacement_func); } @@ 
-109,6 +119,7 @@ class ConvertAddToSubtract : public MixedModeMutator { public: IRModule ir_module_; Target host_target_; + Target custom_target_; }; transform::Pass RelayToTIR() { @@ -124,8 +135,4 @@ transform::Pass RelayToTIR() { } // namespace contrib } // namespace relay -TVM_REGISTER_TARGET_KIND("example_target_hook", kDLCPU) - .set_attr("RelayToTIR", - relay::contrib::example_target_hooks::RelayToTIR()); - } // namespace tvm diff --git a/src/relay/backend/contrib/example_target_hooks/target.cc b/src/relay/backend/contrib/example_target_hooks/target.cc new file mode 100644 index 000000000000..75b161ad4499 --- /dev/null +++ b/src/relay/backend/contrib/example_target_hooks/target.cc @@ -0,0 +1,39 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +namespace tvm { + +namespace relay { +namespace contrib { +namespace example_target_hooks { +tvm::transform::Pass RelayToTIR(); +runtime::Module TIRToRuntime(IRModule mod, Target target); +} // namespace example_target_hooks +} // namespace contrib +} // namespace relay + +TVM_REGISTER_TARGET_KIND("example_target_hook", kDLCPU) + .set_attr("RelayToTIR", relay::contrib::example_target_hooks::RelayToTIR()) + .set_attr("TIRToRuntime", relay::contrib::example_target_hooks::TIRToRuntime); + +} // namespace tvm diff --git a/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc b/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc new file mode 100644 index 000000000000..36d801d349a7 --- /dev/null +++ b/src/relay/backend/contrib/example_target_hooks/tir_to_runtime.cc @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +#include "../../../../target/source/codegen_c_host.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace example_target_hooks { + +using namespace tir; + +class CodeGenExampleTargetHook : public codegen::CodeGenCHost { + public: + /*! 
+ * \brief Emit code that changes adds to multiplies for testing + */ + void VisitExpr_(const SubNode* op, std::ostream& os) final { + os << '('; + PrintExpr(op->a, os); + os << " * "; + PrintExpr(op->b, os); + os << ')'; + } +}; + +runtime::Module TIRToRuntime(IRModule mod, Target target) { + bool output_ssa = false; + bool emit_asserts = false; + CodeGenExampleTargetHook codegen; + Array function_names; + codegen.Init(output_ssa, emit_asserts, target->str()); + for (auto kv : mod->functions) { + auto prim_func = Downcast(kv.second); + auto global_symbol = prim_func->GetAttr(tvm::attr::kGlobalSymbol); + function_names.push_back(global_symbol.value()); + codegen.AddFunction(prim_func); + } + std::string code = codegen.Finish(); + return codegen::CSourceModuleCreate(code, "c", function_names); +} + +} // namespace example_target_hooks +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index d0e83765928a..ec87cfc98931 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -111,8 +111,10 @@ Array GetShape(const Array& shape) { // Construct a schedule for a given Relay primitive function and target. class ScheduleBuilder : public backend::MemoizedExprTranslator> { public: - explicit ScheduleBuilder(Target target) - : target_(target), device_copy_op_(Op::Get("device_copy")) { + explicit ScheduleBuilder(Target target, bool create_schedule = true) + : target_(target), + device_copy_op_(Op::Get("device_copy")), + create_schedule_(create_schedule) { // Whether to use auto_scheduler schedule. 
use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); } @@ -149,7 +151,6 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator auto prim_fn_var = GlobalVar(prim_fn_name); prim_fn_var->checked_type_ = prim_func->checked_type(); - ICHECK(anchor_op_.defined()); // Fusion over tupled results may leave identity relationships // between inputs and outputs, and those should not be scheduled. // Hence schedule only non PlaceholderOp outputs. @@ -162,7 +163,7 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator te::Schedule schedule; // No need to register schedule for device copy op. - if (anchor_attrs_.as() == nullptr) { + if (anchor_attrs_.as() == nullptr && create_schedule_) { if (use_auto_scheduler_) { const auto* fauto_schedule = runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); @@ -259,17 +260,19 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator impl = lowered_out->implementation; } - int op_pattern = fpattern[op]; - if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { - ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) - << "Cannot apply TOPI schedule to a primitive function with two complicated ops" - << " anchor=" << anchor_op_ << " current=" << op; - } - if (op_pattern >= anchor_op_pattern_) { - anchor_op_ = op; - anchor_attrs_ = call_node->attrs; - anchor_op_pattern_ = op_pattern; - anchor_implementation_ = impl; + if (create_schedule_) { + int op_pattern = fpattern[op]; + if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { + ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) + << "Cannot apply TOPI schedule to a primitive function with two complicated ops" + << " anchor=" << anchor_op_ << " current=" << op; + } + if (op_pattern >= anchor_op_pattern_) { + anchor_op_ = op; + anchor_attrs_ = call_node->attrs; + anchor_op_pattern_ = op_pattern; + anchor_implementation_ = impl; + } } if (outputs.size() != 1) { const auto* tuple_type = 
call_node->checked_type().as(); @@ -334,6 +337,7 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator // Cache device copy op for equivalence checking to reduce registry lookup // overhead for each invocation of call node when retrieving schedules. const Op& device_copy_op_; + bool create_schedule_; }; /*! @@ -667,6 +671,12 @@ std::string GetUniqueName(std::string name, std::unordered_map return name; } +TVM_REGISTER_GLOBAL("relay.backend.LowerToTE").set_body_typed([](Function prim_func) { + return ScheduleBuilder(tvm::Target("ext_dev"), false).Create(prim_func, [&](std::string name) { + return name; + }); +}); + } // namespace tec } // namespace relay } // namespace tvm diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 36cd0c7f406d..70ad2ccc992e 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -824,7 +824,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { /*! * \brief Compile a pattern match expression - * It first converts the pattern match expression into a desicision tree, the condition + * It first converts the pattern match expression into a decision tree, the condition * could be object comparison or variable binding. If any of the condition fails in a clause, * the decision tree switches to check the conditions of next clause and so on. If no clause * matches the value, a fatal node is inserted. diff --git a/src/relay/op/contrib/ethosu/depthwise.cc b/src/relay/op/contrib/ethosu/depthwise.cc new file mode 100644 index 000000000000..fa73645d45de --- /dev/null +++ b/src/relay/op/contrib/ethosu/depthwise.cc @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/depthwise.cc + * \brief Depthwise convolution 2D operator definition for the Arm(R) Ethos(TM)-U NPU + */ +#include +#include +#include +#include +#include + +#include "../../../qnn/utils.h" +#include "../../nn/convolution.h" +#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace ethosu { + +/*! \brief Attributes used by the Ethos(TM)-U NPU depthwise operator */ +struct EthosuDepthwiseConv2DAttrs : public tvm::AttrsNode { + double ifm_scale; + int ifm_zero_point; + int weight_zero_point; + double ofm_scale; + int ofm_zero_point; + Array kernel_shape; + IndexExpr ofm_channels; + Array strides; + Array padding; + Array dilation; + String activation; + int clip_min; + int clip_max; + String upscale; + String ifm_layout; + String ofm_layout; + + TVM_DECLARE_ATTRS(EthosuDepthwiseConv2DAttrs, "relay.attrs.EthosuDepthwiseConv2DAttrs") { + TVM_ATTR_FIELD(ifm_scale).describe("The quantization scale for the Input Feature Map tensor."); + TVM_ATTR_FIELD(ifm_zero_point) + .describe("The quantization zero point for the Input Feature Map tensor."); + TVM_ATTR_FIELD(weight_zero_point) + .describe("The quantization zero point for the weight tensor."); + TVM_ATTR_FIELD(ofm_scale).describe("The quantization scale for the Output Feature Map tensor."); + TVM_ATTR_FIELD(ofm_zero_point) + .describe("The 
quantization zero point for the Output Feature Map tensor."); + TVM_ATTR_FIELD(kernel_shape) + .describe("The 2 dimensional kernel shape as (kernel_height, kernel_width).") + .set_default(NullValue >()); + TVM_ATTR_FIELD(ofm_channels) + .describe("The number of OFM channels.") + .set_default(NullValue()); + TVM_ATTR_FIELD(strides) + .describe("The 2 dimensional strides as (stride_height, stride_width).") + .set_default(Array({1, 1})); + TVM_ATTR_FIELD(padding) + .describe("The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right)") + .set_default(Array({0, 0, 0, 0})); + TVM_ATTR_FIELD(dilation) + .describe("The 2 dimensional dilation as (dilation_height, dilation_width).") + .set_default(Array({1, 1})); + TVM_ATTR_FIELD(activation) + .describe( + "Description: The activation function to use." + "'NONE' - no activation function." + "'CLIP' - clip the output between clip_min and clip_max." + "'TANH' - tanh activation function." + "'SIGMOID' - sigmoid activation function." + "'LUT' - use a look-up table to perform the activation function.") + .set_default("NONE"); + TVM_ATTR_FIELD(clip_min) + .describe("The minimum clipping value if activation = CLIP.") + .set_default(0); + TVM_ATTR_FIELD(clip_max) + .describe("The maximum clipping value if activation = CLIP.") + .set_default(0); + TVM_ATTR_FIELD(upscale) + .describe( + "The 2x2 upscaling mode to apply to the Input Feature Map tensor. " + "'NONE' - no upscaling. " + "'NEAREST' - upscale using nearest neighbour. " + "'ZEROS' - upscale using zeros.") + .set_default("NONE"); + TVM_ATTR_FIELD(ifm_layout) + .set_default("NHWC") + .describe("The layout of the Input Feature Map tensor. Can be 'NHWC' or 'NHCWB16'."); + TVM_ATTR_FIELD(ofm_layout) + .set_default("NHWC") + .describe("The layout of the Output Feature Map tensor. 
Can be 'NHWC' or 'NHCWB16'."); + } +}; + +TVM_REGISTER_NODE_TYPE(EthosuDepthwiseConv2DAttrs); + +bool EthosuDepthwiseConv2DRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 5); + const auto* ifm = types[0].as(); + const auto* weight = types[1].as(); + const auto* scale_bias = types[2].as(); + if (ifm == nullptr || weight == nullptr) return false; + + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "EthosuDepthwiseConv2DAttrs cannot be nullptr."; + ICHECK(ifm->dtype == DataType::UInt(8) || ifm->dtype == DataType::Int(8)) + << "Expected ethosu_depthwise_conv2d type(uint8) or type(int8) for ifm but was " + << ifm->dtype; + ICHECK(weight->dtype == DataType::UInt(8) || ifm->dtype == DataType::Int(8)) + << "Expected ethosu_depthwise_conv2d type(uint8) or type(int8) for weight but was " + << weight->dtype; + ICHECK(scale_bias->dtype == DataType::UInt(8)) + << "Expected ethosu_depthwise_conv2d type(uint8) for scale_bias but was " + << scale_bias->dtype; + + // Collect the ifm, weight and ofm tensors for using in the inference function + Array tensor_types = {types[0], types[1], types[4]}; + + // Assign weight type {ofm_channels, kernel_height, kernel_width, 1} + reporter->Assign(types[1], TensorType({param->ofm_channels, param->kernel_shape[0], + param->kernel_shape[1], weight->shape[3]}, + weight->dtype)); + + // Assign ofm type + auto ofm_shape = + EthosuInferKernelOutput(ifm->shape, param->ifm_layout, param->ofm_layout, param->kernel_shape, + param->ofm_channels, param->dilation, param->strides, param->padding); + + reporter->Assign(types[4], TensorType(ofm_shape, ifm->dtype)); + + return true; +} + +Expr MakeEthosuDepthwiseConv2D(Expr ifm, Expr weight, Expr scale_bias, Expr lut, double ifm_scale, + int ifm_zero_point, int weight_zero_point, double ofm_scale, + int ofm_zero_point, Array kernel_shape, + IndexExpr ofm_channels, Array strides, + Array padding, Array dilation, + String 
activation, int clip_min, int clip_max, String upscale, + String ifm_layout, String ofm_layout) { + auto attrs = make_object(); + attrs->ifm_scale = ifm_scale; + attrs->ifm_zero_point = ifm_zero_point; + attrs->weight_zero_point = weight_zero_point; + attrs->ofm_scale = ofm_scale; + attrs->ofm_zero_point = ofm_zero_point; + attrs->kernel_shape = std::move(kernel_shape); + attrs->ofm_channels = std::move(ofm_channels); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->activation = std::move(activation); + attrs->clip_min = clip_min; + attrs->clip_max = clip_max; + attrs->upscale = std::move(upscale); + attrs->ifm_layout = std::move(ifm_layout); + attrs->ofm_layout = std::move(ofm_layout); + static const Op& op = Op::Get("contrib.ethosu.depthwise_conv2d"); + return Call(op, {ifm, weight, scale_bias, lut}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.ethosu_depthwise_conv2d") + .set_body_typed(MakeEthosuDepthwiseConv2D); + +RELAY_REGISTER_OP("contrib.ethosu.depthwise_conv2d") + .describe(R"code(Arm(R) Ethos(TM)-U NPU 2D quantized depthwise operator. + +This Relay operator corresponds to the hardware-implemented quantized +depthwise operation found on Ethos(TM)-U NPUs. It accepts either NHWC or NHCWB16 format +for the input data (input feature map, or IFM) and OHWI format for the kernel weights. 
+ +- **ifm**: NHWC - (1, ifm_height, ifm_width, ifm_channels) + NHCWB16 - (1, ifm_height, ifm_channels // 16, ifm_width, 16) +- **weight**: (ofm_channels, kernel_shape[0], kernel_shape[1], 1 (depth multiplier)) +- **scale_bias**: (ofm_channels, 10) +- **ofm**: (1, ofm_height, ofm_width, ofm_channels) + +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("ifm", "Tensor", "The Input Feature Map tensor (IFM).") + .add_argument("weight", "Tensor", "The weight tensor.") + .add_argument("scale_bias", "Tensor", "The packed per-channel weight scale and bias tensor.") + .add_argument("lut", "Tensor", "The look-up table values to use if activation = 'LUT'") + .set_support_level(11) + .add_type_rel("EthosuDepthwiseConv2D", EthosuDepthwiseConv2DRel); + +} // namespace ethosu +} // namespace contrib +} // namespace op +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 0d40caa15052..ee87fbfa4be9 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -49,8 +49,12 @@ InferCorrectLayoutOutput PoolInferCorrectLayout(const Attrs& attrs, ICHECK(attrs_ptr); ObjectPtr params = make_object(*attrs_ptr); - if (new_in_layouts.defined()) { - // Set the pool with the new layout. 
+ if (params->out_layout != "") { + // when users specify the out_layout of pooling, transforms.ConvertLayout pass will + // follow user's preference + ICHECK_EQ(params->layout, params->out_layout); + } else if (new_in_layouts.defined()) { + // the pooling is using an inferred layout (i.e., new_in_layouts[0]) given by relay caller ICHECK_EQ(new_in_layouts.size(), 1); params->layout = new_in_layouts[0].name(); } @@ -144,6 +148,7 @@ Array Pool2DCompute(const Attrs& attrs, const Array& inp auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(tir::BijectiveLayout(layout, kNCHW).defined()) << "max_pool2d currently only supports layouts that are convertible from NCHW"; @@ -178,9 +183,9 @@ Array Pool2DCompute(const Attrs& attrs, const Array& inp TVM_REGISTER_GLOBAL("relay.op.nn._make.max_pool2d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode) { + String out_layout, bool ceil_mode) { return MakeMaxPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, "nn.max_pool2d"); + out_layout, ceil_mode, "nn.max_pool2d"); }); RELAY_REGISTER_OP("nn.max_pool2d") @@ -216,9 +221,9 @@ RELAY_REGISTER_OP("nn.max_pool2d") TVM_REGISTER_GLOBAL("relay.op.nn._make.avg_pool2d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode, bool count_include_pad) { + String out_layout, bool ceil_mode, bool count_include_pad) { return MakeAvgPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, count_include_pad, "nn.avg_pool2d"); + out_layout, ceil_mode, count_include_pad, "nn.avg_pool2d"); }); RELAY_REGISTER_OP("nn.avg_pool2d") @@ -303,9 +308,10 @@ Array GlobalPool2DCompute(const Attrs& attrs, const Array{topi::nn::global_pool(inputs[0], mode, layout.name())}; } -Expr MakeGlobalAvgPool2D(Expr data, String layout) { +Expr 
MakeGlobalAvgPool2D(Expr data, String layout, String out_layout) { auto attrs = make_object(); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.global_avg_pool2d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -331,9 +337,10 @@ RELAY_REGISTER_OP("nn.global_avg_pool2d") .set_attr("FTVMCompute", GlobalPool2DCompute); // GlobalMaxPool -Expr MakeGlobalMaxPool2D(Expr data, String layout) { +Expr MakeGlobalMaxPool2D(Expr data, String layout, String out_layout) { auto attrs = make_object(); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.global_max_pool2d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -423,10 +430,12 @@ Array AdaptivePool1DCompute(const Attrs& attrs, const Array output_size, String layout) { +Expr MakeAdaptiveAvgPool1D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = make_object(); attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_avg_pool1d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -456,10 +465,12 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool1d") .set_attr("FTVMCompute", AdaptivePool1DCompute); // relay.nn.adaptive_max_pool1d -Expr MakeAdaptiveMaxPool1D(Expr data, Array output_size, String layout) { +Expr MakeAdaptiveMaxPool1D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = make_object(); attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_max_pool1d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -571,10 +582,12 @@ Array AdaptivePool2DCompute(const Attrs& attrs, const Array output_size, String layout) { +Expr MakeAdaptiveAvgPool2D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = 
make_object(); attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_avg_pool2d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -606,10 +619,12 @@ RELAY_REGISTER_OP("nn.adaptive_avg_pool2d") .set_attr("FTVMCompute", AdaptivePool2DCompute); // relay.nn.adaptive_max_pool2d -Expr MakeAdaptiveMaxPool2D(Expr data, Array output_size, String layout) { +Expr MakeAdaptiveMaxPool2D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = make_object(); attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_max_pool2d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -700,6 +715,7 @@ Array AdaptivePool3DCompute(const Attrs& attrs, const Array(); ICHECK(param != nullptr); Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) << "Adaptive pool3d currently only supports layouts that are convertible from NCDHW"; ICHECK_EQ(layout.IndexOf(LayoutAxis::Get('d')), -1) @@ -737,10 +753,12 @@ Array AdaptivePool3DCompute(const Attrs& attrs, const Array output_size, String layout) { +Expr MakeAdaptiveMaxPool3D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = make_object(); attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_max_pool3d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -772,10 +790,12 @@ RELAY_REGISTER_OP("nn.adaptive_max_pool3d") .set_attr("FTVMCompute", AdaptivePool3DCompute); // relay.nn.adaptive_max_pool3d -Expr MakeAdaptiveAvgPool3D(Expr data, Array output_size, String layout) { +Expr MakeAdaptiveAvgPool3D(Expr data, Array output_size, String layout, + String out_layout) { auto attrs = make_object(); 
attrs->output_size = std::move(output_size); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); static const Op& op = Op::Get("nn.adaptive_avg_pool3d"); return Call(op, {data}, Attrs(attrs), {}); } @@ -866,12 +886,13 @@ Array Pool2DGradCompute(const Attrs& attrs, const Array& // MaxPool2DGrad Expr MakeMaxPool2DGrad(Expr out_grad, Expr data, Array pool_size, Array strides, Array padding, String layout, - bool ceil_mode) { + String out_layout, bool ceil_mode) { auto attrs = make_object(); attrs->pool_size = std::move(pool_size); attrs->strides = std::move(strides); attrs->padding = std::move(padding); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); attrs->ceil_mode = ceil_mode; static const Op& op = Op::Get("nn.max_pool2d_grad"); return Call(op, {out_grad, data}, Attrs(attrs), {}); @@ -913,12 +934,13 @@ RELAY_REGISTER_OP("nn.max_pool2d_grad") // AvgPool2DGrad Expr MakeAvgPool2DGrad(Expr out_grad, Expr data, Array pool_size, Array strides, Array padding, String layout, - bool ceil_mode, bool count_include_pad) { + String out_layout, bool ceil_mode, bool count_include_pad) { auto attrs = make_object(); attrs->pool_size = std::move(pool_size); attrs->strides = std::move(strides); attrs->padding = std::move(padding); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); attrs->ceil_mode = ceil_mode; attrs->count_include_pad = count_include_pad; static const Op& op = Op::Get("nn.avg_pool2d_grad"); @@ -976,6 +998,7 @@ bool Pool1DRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(param != nullptr); Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". 
Pool1D layout must have W, which cannot be split"; @@ -1018,6 +1041,7 @@ Array Pool1DCompute(const Attrs& attrs, const Array& inp auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(tir::BijectiveLayout(layout, kNCW).defined()) << "max_pool1d currently only supports layouts that are convertible from NCW"; @@ -1046,9 +1070,9 @@ Array Pool1DCompute(const Attrs& attrs, const Array& inp TVM_REGISTER_GLOBAL("relay.op.nn._make.max_pool1d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode) { + String out_layout, bool ceil_mode) { return MakeMaxPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, "nn.max_pool1d"); + out_layout, ceil_mode, "nn.max_pool1d"); }); RELAY_REGISTER_OP("nn.max_pool1d") @@ -1082,9 +1106,9 @@ RELAY_REGISTER_OP("nn.max_pool1d") TVM_REGISTER_GLOBAL("relay.op.nn._make.avg_pool1d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode, bool count_include_pad) { + String out_layout, bool ceil_mode, bool count_include_pad) { return MakeAvgPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, count_include_pad, "nn.avg_pool1d"); + out_layout, ceil_mode, count_include_pad, "nn.avg_pool1d"); }); RELAY_REGISTER_OP("nn.avg_pool1d") @@ -1134,6 +1158,7 @@ bool Pool3DRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(param != nullptr); Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(layout.Contains(LayoutAxis::Get('D')) && layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && !layout.Contains(LayoutAxis::Get('d')) && !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) @@ -1194,6 +1219,7 @@ Array Pool3DCompute(const Attrs& attrs, const Array& inp auto padding = param->padding; auto ceil_mode = 
param->ceil_mode; Layout layout(param->layout); + Layout out_layout(param->out_layout); ICHECK(tir::BijectiveLayout(layout, kNCDHW).defined()) << "max_pool3d currently only supports layouts that are convertible from NCDHW"; @@ -1231,9 +1257,9 @@ Array Pool3DCompute(const Attrs& attrs, const Array& inp TVM_REGISTER_GLOBAL("relay.op.nn._make.max_pool3d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode) { + String out_layout, bool ceil_mode) { return MakeMaxPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, "nn.max_pool3d"); + out_layout, ceil_mode, "nn.max_pool3d"); }); RELAY_REGISTER_OP("nn.max_pool3d") @@ -1270,9 +1296,9 @@ RELAY_REGISTER_OP("nn.max_pool3d") TVM_REGISTER_GLOBAL("relay.op.nn._make.avg_pool3d") .set_body_typed([](Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode, bool count_include_pad) { + String out_layout, bool ceil_mode, bool count_include_pad) { return MakeAvgPool(data, pool_size, strides, dilation, padding, layout, - ceil_mode, count_include_pad, "nn.avg_pool3d"); + out_layout, ceil_mode, count_include_pad, "nn.avg_pool3d"); }); RELAY_REGISTER_OP("nn.avg_pool3d") diff --git a/src/relay/op/nn/pooling.h b/src/relay/op/nn/pooling.h index 9b7eab25fe9a..32ae464101ab 100644 --- a/src/relay/op/nn/pooling.h +++ b/src/relay/op/nn/pooling.h @@ -35,13 +35,14 @@ namespace relay { template inline Expr MakeMaxPool(Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode, String op_name) { + String out_layout, bool ceil_mode, String op_name) { auto attrs = make_object(); attrs->pool_size = std::move(pool_size); attrs->strides = std::move(strides); attrs->dilation = std::move(dilation); attrs->padding = std::move(padding); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); attrs->ceil_mode = ceil_mode; static const Op& op = 
Op::Get(op_name); return Call(op, {data}, Attrs(attrs), {}); @@ -50,13 +51,14 @@ inline Expr MakeMaxPool(Expr data, Array pool_size, Array template inline Expr MakeAvgPool(Expr data, Array pool_size, Array strides, Array dilation, Array padding, String layout, - bool ceil_mode, bool count_include_pad, String op_name) { + String out_layout, bool ceil_mode, bool count_include_pad, String op_name) { auto attrs = make_object(); attrs->pool_size = std::move(pool_size); attrs->strides = std::move(strides); attrs->dilation = std::move(dilation); attrs->padding = std::move(padding); attrs->layout = std::move(layout); + attrs->out_layout = std::move(out_layout); attrs->ceil_mode = ceil_mode; attrs->count_include_pad = count_include_pad; static const Op& op = Op::Get(op_name); diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index c9f14c91c7b1..5001925b7570 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -149,23 +149,41 @@ InferCorrectLayoutOutput ReduceInferCorrectLayout(const Attrs& attrs, tvm::Array new_r_axes; std::string inferred_in_string = ""; std::string inferred_out_string = ""; - int axis_index = 0; - for (auto iter_var : layout->axes) { - const auto& layout_axis = LayoutAxis::Get(iter_var); + auto push_new_axis = [&](const std::string& layout_dim, int axis) { + if ((old_r_dims.count(layout_dim) && !params->exclude) || + (!old_r_dims.count(layout_dim) && params->exclude)) { + new_r_axes.push_back(tvm::Integer(axis)); + return true; + } + return false; + }; + for (size_t axis_index = 0; axis_index < layout->axes.size(); ++axis_index) { + const auto& layout_axis = LayoutAxis::Get(layout->axes[axis_index]); const std::string& layout_dim = layout_axis.name(); - // Collect only the primal axis. 
if (layout_axis.IsPrimal()) { - if (old_r_dims.count(layout_dim) && !params->exclude) { - new_r_axes.push_back(tvm::Integer(axis_index)); - } - if (!old_r_dims.count(layout_dim) && params->exclude) { - new_r_axes.push_back(tvm::Integer(axis_index)); - } + push_new_axis(layout_dim, axis_index); + inferred_in_string += layout_dim; if (!old_r_dims.count(layout_dim) || params->keepdims) { inferred_out_string += layout_dim; } - inferred_in_string += layout_dim; - axis_index++; + } else { + // For example, if the original layout is NCHW, the new layout is NCHW8c, and the original + // reduce axes is [1], the new reduce axes become [1, 4]. + auto primal_dim = layout_axis.ToPrimal().name(); + auto packed_dim = std::to_string(layout.FactorOf(layout_axis)) + layout_dim; + inferred_in_string += packed_dim; + if (push_new_axis(primal_dim, axis_index)) { + if (params->exclude) { + // The primal axis is not reduced, so keep the input packed dim. + inferred_out_string += packed_dim; + } else { + // If the primal axis is part of reduce axes in the original layout, the inner dim + // becomes 1 after reduction. 
+ inferred_out_string += "1" + layout_dim; + } + } else { + inferred_out_string += packed_dim; + } } } diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 3781107eeee1..90a0e3150573 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -2599,24 +2599,19 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( params->strides = new_strides; layout = new_layout; } - } else { + } else if (old_layout_name.size() < + new_layout_name.size()) { // prohibit transforms such as NCHW4c -> NCHW if (params->axes) { auto axes = params->axes.value(); Array new_axes; - for (size_t i = 0; i < axes.size(); ++i) { auto old_idx = axes[i]; auto new_idx = new_layout.IndexOf(layout[old_idx]); new_axes.push_back(new_idx); const LayoutAxis& axis = layout[old_idx]; - if (!axis.IsPrimal()) { - // original layout that contains splitted axes is not supported - return out_default; - } - + ICHECK(axis.IsPrimal()); auto factor = new_layout.FactorOf(axis); - if (factor == -1) { new_begin.push_back(begin[i]); new_end.push_back(end[i]); @@ -2636,10 +2631,7 @@ InferCorrectLayoutOutput StridedSliceInferCorrectLayout( } else { for (size_t i = 0; i < begin.size(); i++) { const LayoutAxis& axis = layout[i]; - if (!axis.IsPrimal()) { - // original layout that contains splitted axes is not supported - return out_default; - } + ICHECK(axis.IsPrimal()); auto factor = new_layout.FactorOf(axis); if (factor == -1) { new_begin.push_back(IntImm(begin[i]->dtype, begin[i])); @@ -3260,8 +3252,10 @@ bool GatherRel(const Array& types, int num_inputs, const Attrs& attrs, oshape.reserve(ndim_data); for (size_t i = 0; i < ndim_data; ++i) { if (i == static_cast(axis)) { - const int64_t* indice_shape_i = tir::as_const_int(indices->shape[i]); - ICHECK_GE(*indice_shape_i, 1); + if (indices->shape[i].as()) { + const int64_t* indice_shape_i = tir::as_const_int(indices->shape[i]); + ICHECK_GE(*indice_shape_i, 1); + } } else { 
ICHECK(reporter->AssertEQ(indices->shape[i], data->shape[i])); } diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index 5782f1f6b4d1..ecdd36ddb791 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -275,6 +275,7 @@ Expr DepthwiseConv2DSecondTerm(const Expr& padded_data, const Expr& kernel_zero_ Array padding({0, 0}); reduced_t2 = AvgPool2D(scaled_hw_t2, param->kernel_size, param->strides, param->dilation, padding, param->data_layout, + "", // out_layout false, // ceil_mode false); // count_include_pad } else { @@ -284,6 +285,7 @@ Expr DepthwiseConv2DSecondTerm(const Expr& padded_data, const Expr& kernel_zero_ Array padding({0, 0}); reduced_t2 = AvgPool2D(reduced_t2, param->kernel_size, param->strides, param->dilation, padding, param->data_layout, + "", // out_layout false, // ceil_mode false); // count_include_pad } @@ -463,6 +465,7 @@ Expr Conv2DSecondTerm(const Expr& padded_data, const Expr& kernel_zero_point, Multiply(reduced_c_t2, MakeConstantScalar(DataType::Int(32), kernel_h * kernel_w)); reduced_t2 = AvgPool2D(reduced_c_t2, param->kernel_size, param->strides, param->dilation, padding, param->data_layout, + "", // out_layout false, // ceil_mode false); // count_include_pad } else { @@ -471,6 +474,7 @@ Expr Conv2DSecondTerm(const Expr& padded_data, const Expr& kernel_zero_point, if (stride1 * stride2 != 1) { reduced_t2 = AvgPool2D(reduced_c_t2, param->kernel_size, param->strides, param->dilation, padding, param->data_layout, + "", // out_layout false, // ceil_mode false); // count_include_pad } diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h index 692ef3c9f557..03b8ee6937a7 100644 --- a/src/relay/transforms/pattern_utils.h +++ b/src/relay/transforms/pattern_utils.h @@ -676,9 +676,10 @@ static inline Expr Reshape(Expr data, Array newshape) { static inline Expr AvgPool2D(Expr data, Array pool_size, Array strides, Array dilation, Array padding, - std::string 
layout, bool ceil_mode, bool count_include_pad) { - return MakeAvgPool(data, pool_size, strides, dilation, padding, layout, ceil_mode, - count_include_pad, "nn.avg_pool2d"); + std::string layout, std::string out_layout, bool ceil_mode, + bool count_include_pad) { + return MakeAvgPool(data, pool_size, strides, dilation, padding, layout, + out_layout, ceil_mode, count_include_pad, "nn.avg_pool2d"); } static inline Expr Pad(Expr data, Array> pad_width, Expr pad_value, diff --git a/src/runtime/hexagon/launcher/CMakeLists.txt b/src/runtime/hexagon/launcher/CMakeLists.txt deleted file mode 100644 index d3a2f4f8161d..000000000000 --- a/src/runtime/hexagon/launcher/CMakeLists.txt +++ /dev/null @@ -1,156 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -cmake_minimum_required(VERSION 3.2) -project(HexagonLauncher C CXX) - -if(NOT "${FASTRPC_LIBS}" STREQUAL "SKEL" AND - NOT "${FASTRPC_LIBS}" STREQUAL "STUB") - message(SEND_ERROR "Please set FASTRPC_LIBS to either SKEL or STUB") -endif() - -if(NOT DEFINED USE_HEXAGON_SDK) - message(SEND_ERROR "Please set USE_HEXAGON_SDK to the location of Hexagon SDK") -endif() -if (NOT DEFINED USE_HEXAGON_ARCH) - message(SEND_ERROR "Please set USE_HEXAGON_ARCH to the Hexagon architecture version") -endif() - -include(../../../../cmake/modules/HexagonSDK.cmake) - -find_hexagon_sdk_root("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}") - -include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_REMOTE_ROOT}) - -set(QAIC_EXE "${HEXAGON_QAIC_EXE}") -foreach(INCDIR IN LISTS HEXAGON_SDK_INCLUDES HEXAGON_REMOTE_ROOT) - list(APPEND QAIC_FLAGS "-I${INCDIR}") -endforeach() - -set(LAUNCHER_SRC "${CMAKE_CURRENT_SOURCE_DIR}") -set(CMAKE_SKIP_RPATH TRUE) - -# Qaic for the domain header. -# -# Don't add paths to these filenames, or otherwise cmake may spontaneously -# add -o option to the qaic invocation (with an undesirable path). -set(LAUNCHER_RPC_IDL "launcher_rpc.idl") -set(LAUNCHER_RPC_H "launcher_rpc.h") -set(LAUNCHER_RPC_SKEL_C "launcher_rpc_skel.c") -set(LAUNCHER_RPC_STUB_C "launcher_rpc_stub.c") - -add_custom_command( - OUTPUT ${LAUNCHER_RPC_SKEL_C} ${LAUNCHER_RPC_STUB_C} - "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" - COMMAND ${QAIC_EXE} ${QAIC_FLAGS} - "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" - COMMAND ${CMAKE_COMMAND} -E rename "${LAUNCHER_RPC_H}" - "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" - MAIN_DEPENDENCY "${LAUNCHER_SRC}/${LAUNCHER_RPC_IDL}" -) - - -if("${FASTRPC_LIBS}" STREQUAL "SKEL") - # Skel libraries. 
- # - if (NOT DEFINED TVM_RUNTIME_HEXAGON) - message(SEND_ERROR "Please set TVM_RUNTIME_HEXAGON=/path/to/libtvm_runtime.a") - endif() - - include_directories(SYSTEM ${HEXAGON_QURT_INCLUDES}) - include_directories( - "${LAUNCHER_SRC}" - "${LAUNCHER_SRC}/../../../../include" - "${LAUNCHER_SRC}/../../../../3rdparty/dlpack/include" - "${LAUNCHER_SRC}/../../../../3rdparty/dmlc-core/include" - ) - link_directories(${HEXAGON_QURT_LIBS}) - - add_definitions(-D_MACH_I32=int) - add_definitions(-DDMLC_CXX11_THREAD_LOCAL=0) - add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) - - # Extra compile flags (both C and C++). - set(EXTRA_COMP_FLAGS - "-O3" - "-m${USE_HEXAGON_ARCH}" - ) - string(REGEX REPLACE ";" " " EXTRA_COMP_FLAGS_STR "${EXTRA_COMP_FLAGS}") - set(CMAKE_C_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${EXTRA_COMP_FLAGS_STR} ${CMAKE_CXX_FLAGS}") - - set(EXTRA_LINK_FLAGS - "-lposix" - "-lqurt" - "-Wl,--export-dynamic" - "-Wl,--whole-archive ${TVM_RUNTIME_HEXAGON} -Wl,--no-whole-archive" - "-Wl,--defsym=HEAP_SIZE=0x40000000" - ) - string(REGEX REPLACE ";" " " EXTRA_LINK_FLAGS_STR "${EXTRA_LINK_FLAGS}") - - set(SKEL_SRCS - "launcher_core.cc" - "launcher_hexagon.cc" - ) - add_library(launcher_rpc_skel SHARED - "${LAUNCHER_SRC}/${LAUNCHER_RPC_H}" - "${LAUNCHER_RPC_SKEL_C}" - "${SKEL_SRCS}" - ) - - # Extra linker flags for linking shared libraries. - set_target_properties(launcher_rpc_skel PROPERTIES - LINK_FLAGS ${EXTRA_LINK_FLAGS_STR} - ) -else() - # Stub libraries. 
- # - if (NOT DEFINED TVM_RUNTIME_ANDROID) - message(SEND_ERROR "Please set TVM_RUNTIME_ANDROID=/path/to/libtvm_runtime.so") - endif() - - include_directories(SYSTEM - "${HEXAGON_SDK_INCLUDES}" - "${HEXAGON_RPCMEM_ROOT}/inc" - ) - include_directories( - "${LAUNCHER_SRC}" - "${LAUNCHER_SRC}/../../../../include" - "${LAUNCHER_SRC}/../../../../3rdparty/dlpack/include" - "${LAUNCHER_SRC}/../../../../3rdparty/dmlc-core/include" - ) - link_directories(${HEXAGON_REMOTE_ROOT}) - - add_definitions(-DDMLC_USE_LOGGING_LIBRARY=) - - set(STUB_SRCS - "launcher_android.cc" - "launcher_core.cc" - "launcher_main.cc" - "launcher_util.cc" - ) - - add_executable(launcher_android - "${STUB_SRCS}" - "${LAUNCHER_RPC_STUB_C}" - ) - target_link_libraries(launcher_android cdsprpc log) - - set_target_properties(launcher_android PROPERTIES - LINK_FLAGS "${TVM_RUNTIME_ANDROID}" - ) -endif() diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index 41f867057282..3820ce942af0 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -21,31 +21,129 @@ * \file pipeline_executor.cc */ #include "pipeline_executor.h" - namespace tvm { namespace runtime { +/*! + * \brief Give frontends an access to packed functions. + * \param name The name of the function. + * \param sptr_to_self The pointer to the module node. + * \return The corresponding packed function. + */ +PackedFunc PipelineExecutor::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (name == "get_num_outputs") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumOutputs(); }); + } else { + LOG(FATAL) << "Unknown packed function: " << name; + return PackedFunc(); + } + return nullptr; +} -void PipelineRuntime::Init(const Array& modules, - const std::string& pipeline_json) { - return; +/*! + * \brief Use the mod_config information to create a graph runtime list. 
+ * \param mod_config The config information that generates by the export library function call. + */ +std::vector PipelineExecutor::CreateGraphModules(const ModuleConfig& mod_config) { + const PackedFunc* graph_executor_create = Registry::Get("tvm.graph_executor.create"); + std::vector ret; + ret.resize(mod_config.size()); + for (auto config : mod_config) { + // Load library. + auto lib = Module::LoadFromFile(config.second.lib_name.c_str()); + + // Read json. + std::ifstream ifJson(config.second.json_name.c_str()); + if (ifJson.fail()) { + LOG(FATAL) << "json file not found: " << config.second.json_name; + } + const std::string json((std::istreambuf_iterator(ifJson)), + std::istreambuf_iterator()); + + // Create a graph executor. + std::istringstream istr(config.second.dev); + std::string str; + int device_type = 1, device_id = 0; + while (getline(istr, str, ';')) { + std::istringstream istr_dev(str); + std::string str_temp; + if (getline(istr_dev, str_temp)) { + device_type = stoi(str_temp); + } + if (getline(istr_dev, str_temp)) { + device_id = stoi(str_temp); + } + } + Module graph_module = (*graph_executor_create)(json, lib, device_type, device_id); + + // Load parameters. + TVMByteArray params_arr; + const char* params_file_name = config.second.params_name.c_str(); + std::ifstream if_param(params_file_name); + if (if_param.fail()) { + LOG(FATAL) << "params file not found: " << params_file_name; + } + const std::string params((std::istreambuf_iterator(if_param)), + std::istreambuf_iterator()); + params_arr.data = params.c_str(); + params_arr.size = params.length(); + auto load_params = graph_module.GetFunction("load_params"); + load_params(params_arr); + + // Put a graph executor module into the vector. + ret[config.first] = graph_module; + } + return ret; } -/* GetFunction can not be pure abstract function, implement an empty function for now. +/*! 
+ * \brief Initialize the pipeline executor with a list of modules to be pipelined + * and config in JSON format. + * \param modules The module list used for building the pipeline. + * \param pipeline_json The configuration of modules dependencies. */ -PackedFunc PipelineRuntime::GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) { - return nullptr; +void PipelineExecutor::Init(const std::vector& modules, const std::string& pipeline_json) { + ICHECK(!modules.empty()) << "The graph executor module list is empty."; + // Use JSONReader to load pipeline configuration. + std::istringstream is(pipeline_json); + dmlc::JSONReader reader(&is); + PipelineConfig& pipeline_config = this->LoadPipelineConfig(&reader); + ICHECK(!pipeline_config.Empty()) << "The pipeline config information is empty."; + // Initialize the pipeline function class used for pipeline thread pool management + // and schedule etc. This function returns the number of output. + num_outputs_ = pipeline_scheduler_.PipelineInit(modules, pipeline_config); + return; } -Module PipelineRuntimeCreate(const Array& m, - const std::string& pipeline_json) { - auto exec = make_object(); - exec->Init(m, pipeline_json); +Module PipelineExecutorCreate(const Array& m, const std::string& pipeline_json) { + ICHECK(!m.empty()) << "The module list is empty."; + auto exec = make_object(); + std::vector graph_modules; + for (auto mod : m) { + graph_modules.push_back(mod); + } + exec->Init(graph_modules, pipeline_json); + return Module(exec); +} + +Module PipelineExecutorLoad(const std::string& load_json, const std::string& pipeline_json) { + auto exec = make_object(); + std::istringstream is(load_json); + dmlc::JSONReader reader(&is); + ModuleConfig& mod_config = exec->LoadModuleConfig(&reader); + ICHECK(!mod_config.empty()) << "The module config is empty."; + std::vector modules = exec->CreateGraphModules(mod_config); + exec->Init(modules, pipeline_json); return Module(exec); } 
TVM_REGISTER_GLOBAL("tvm.pipeline_executor.create").set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = PipelineRuntimeCreate(args[0], args[1]); + *rv = PipelineExecutorCreate(args[0], args[1]); +}); + +TVM_REGISTER_GLOBAL("tvm.pipeline_executor.load").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = PipelineExecutorLoad(args[0], args[1]); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index c7625c62b724..a883ba25ec08 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -23,9 +23,16 @@ */ #ifndef TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ #define TVM_RUNTIME_PIPELINE_PIPELINE_EXECUTOR_H_ + #include +#include +#include +#include #include +#include + +#include "pipeline_scheduler.h" namespace tvm { namespace runtime { /*! @@ -36,18 +43,23 @@ namespace runtime { * * This executor can be accessed by various language via TVM runtime PackedFunc API. */ -class TVM_DLL PipelineRuntime : public ModuleNode { +class TVM_DLL PipelineExecutor : public ModuleNode { public: /*! * \Return the type key of the executor. */ - const char* type_key() const final { return "PipelineRuntime"; } + const char* type_key() const final { return "PipelineExecutor"; } /*! - * \brief Initialize the pipeline executor with module array and json text. + * \brief Initialize the pipeline executor with module array and JSON text. * \param modules The module list used for building pipeline. * \param pipeline_json The configuration of modules dependencies. */ - void Init(const Array& modules, const std::string& pipeline_json); + void Init(const std::vector& modules, const std::string& pipeline_json); + /*! + * \brief Use the information of mod_config to create a list of graph executor. + * \param mod_config The configuration information generated by the library export function call. + */ + std::vector CreateGraphModules(const ModuleConfig& mod_config); /*! 
* \brief Give frontends an access to packed functions. * \param name The name of the function. @@ -55,6 +67,86 @@ class TVM_DLL PipelineRuntime : public ModuleNode { * \return The corresponding packed function. */ virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + + /*! + * \brief Get the number of outputs. + * + * \return The number of outputs. + */ + int NumOutputs() const { return num_outputs_; } + + /*!\brief Load the module files information.*/ + ModuleConfig& LoadModuleConfig(dmlc::JSONReader* reader) { + reader->BeginArray(); + while (reader->NextArrayItem()) { + std::string key; + reader->BeginObject(); + int mod_idx = -1; + std::string lib_name; + std::string json_name; + std::string params_name; + std::string dev; + while (reader->NextObjectItem(&key)) { + if (key == "mod_idx") { + reader->Read(&mod_idx); + } else if (key == "lib_name") { + reader->Read(&lib_name); + } else if (key == "json_name") { + reader->Read(&json_name); + } else if (key == "params_name") { + reader->Read(¶ms_name); + } else if (key == "dev") { + reader->Read(&dev); + } else { + LOG(FATAL) << "do not support key " << key; + } + } + ICHECK(mod_idx >= 0) << "Invalid mod_idx value " << mod_idx; + // Load the lib, json, and params information. 
+ ICHECK(!lib_name.empty()) << "lib_name is empty."; + ICHECK(!json_name.empty()) << "json_name is empty."; + ICHECK(!params_name.empty()) << "params_name is empty."; + mod_config_[mod_idx] = GraphModuleLoadInfo(lib_name, json_name, params_name, dev); + } + return mod_config_; + } + + private: + /*!\brief The class used to execute and schedule the pipeline logic.*/ + PipelineScheduler pipeline_scheduler_; + /*!\brief The dependency information of each graph runtime module of the pipeline.*/ + PipelineConfig pipeline_config_; + /*!\brief The module information used to create the graph runtimes.*/ + ModuleConfig mod_config_; + /*!\brief How many outputs are in this pipeline executor.*/ + size_t num_outputs_ = 0; + /*!\brief Json loader.*/ + PipelineConfig& LoadPipelineConfig(dmlc::JSONReader* reader) { + reader->BeginArray(); + while (reader->NextArrayItem()) { + std::string key; + reader->BeginObject(); + int mod_idx = -1; + OutputMap output; + std::string dev; + while (reader->NextObjectItem(&key)) { + if (key == "mod_idx") { + reader->Read(&mod_idx); + } else if (key == "dev") { + reader->Read(&dev); + } else if (key == "output") { + reader->Read(&output); + } else { + LOG(FATAL) << "do not support key " << key; + } + } + ICHECK(mod_idx >= 0) << "Invalid mod_idx value " << mod_idx; + // Check if the output is successfully read. + ICHECK(!output.Empty()) << "Invalid output binding result."; + pipeline_config_.Insert(mod_idx, output); + } + return pipeline_config_; + } }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/pipeline/pipeline_scheduler.cc b/src/runtime/pipeline/pipeline_scheduler.cc new file mode 100644 index 000000000000..82caf855a479 --- /dev/null +++ b/src/runtime/pipeline/pipeline_scheduler.cc @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "pipeline_scheduler.h" + +#include +#include +namespace tvm { +namespace runtime { +/*! + * \brief Initialize the pipeline. + * \param modules The list of graph executor modules. + * \param pipeline_conf The dependency information of each graph executor module. + */ +size_t PipelineScheduler::PipelineInit(const std::vector& modules, + const PipelineConfig& pipeline_config) { + graph_modules_ = modules; + int num_output = pipeline_config.GetGlobalOutputNum(); + return num_output; +} +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/pipeline/pipeline_scheduler.h b/src/runtime/pipeline/pipeline_scheduler.h new file mode 100644 index 000000000000..5ee127edffa3 --- /dev/null +++ b/src/runtime/pipeline/pipeline_scheduler.h @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_SCHEDULER_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_SCHEDULER_H_ +#include +#include +#include + +#include +#include +#include +#include + +#include "pipeline_struct.h" +namespace tvm { +namespace runtime { +/*! + * \brief The class that executes the pipeline logic,it is used to initialize the thread pool, + execute and schedule pipeline tasks, allocate and manage memory, etc. + */ +class PipelineScheduler { + public: + /*! + * \brief Initialize the pipeline. + * \param modules The list of graph executor module. + * \param pipeline_config The dependency information of each graph executor module. + */ + size_t PipelineInit(const std::vector& modules, const PipelineConfig& pipeline_config); + + private: + /*!\brief The list of graph executors.*/ + std::vector graph_modules_; +}; +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_SCHEDULER_H_ diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h new file mode 100644 index 000000000000..3cc9621702c1 --- /dev/null +++ b/src/runtime/pipeline/pipeline_struct.h @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ +#define TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ +#include +#include +#include + +#include +#include +#include +#include +/*! + * \brief All binding information of a output interface. + */ +struct OutputBindings { + /*!\brief Output interface binding information, 'int' is the index of the module that + * uses this output data as the input interface data, 'string' is the input interface name + * of the module. + */ + std::unordered_map bindings; + /*! The index value of the global interface to which the current output are bound.*/ + int global_output_index = std::numeric_limits::min(); + /*!\brief Whether this binding is bound to the PipelineExecutor output interface.*/ + bool IsGlobalOutput() const { return global_output_index >= 0; } + /*! + * \brief Create a module interface map from JSONReader. + * \param reader JSON reader. + */ + void Load(dmlc::JSONReader* reader) { + reader->BeginArray(); + while (reader->NextArrayItem()) { + std::string key; + reader->BeginObject(); + std::string input_name; + int mod_idx = std::numeric_limits::min(); + // Whether the output binding is global. + bool global_binding = false; + while (reader->NextObjectItem(&key)) { + if (key == "mod_idx") { + reader->Read(&mod_idx); + } else if (key == "input_name") { + reader->Read(&input_name); + } else if (key == "global_output_index") { + // There should be only one global binding. 
+ ICHECK(global_output_index < 0); + reader->Read(&global_output_index); + // When the key value is 'global_output_index', it means that this output is bound to + // a global interface. + global_binding = true; + } else { + LOG(FATAL) << "do not support key " << key; + } + } + // When this output is bound to a global interface, check if the global interface index + // start from 0. + if (global_binding) { + ICHECK(global_output_index >= 0); + } else { + // When this output is bound to a graph executor module interface, check if the module + // index start from 0. + ICHECK(mod_idx >= 0); + bindings[mod_idx] = input_name; + } + } + } +}; + +/*! + * \brief The binding information of all outputs of a module. + */ +struct OutputMap { + /*! \brief Output binding map, 'int' is output interface index.*/ + std::unordered_map output_binding_map; + OutputMap& operator=(const OutputMap& output) { + output_binding_map = output.output_binding_map; + return *this; + } + + /*!\brief This function is used to verify whether OutputMap is successfully loaded. + * \return Return true to indicate that this class has not been successfully loaded. + */ + bool Empty() { return output_binding_map.empty(); } + /*! \brief The pipeline outputs is the final outputs of pipeline, this function is used to + * get how many pipeline outputs are in this Outputmap + * \return Number of pipeline outputs. + */ + size_t GetGlobalOutputNum(void) const { + size_t num_output = 0; + for (auto bindings : output_binding_map) { + num_output += bindings.second.IsGlobalOutput() ? 1 : 0; + } + return num_output; + } + + /*! + * \brief Create a output binding map from JSONReader. + * \param reader Json reader. 
+ */ + void Load(dmlc::JSONReader* reader) { + reader->BeginArray(); + while (reader->NextArrayItem()) { + std::string key; + reader->BeginObject(); + int output_idx = -1; + OutputBindings binding; + while (reader->NextObjectItem(&key)) { + if (key == "output_idx") { + reader->Read(&output_idx); + } else if (key == "dependencies") { + reader->Read(&binding); + } else { + LOG(FATAL) << "do not support key " << key; + } + } + ICHECK(output_idx >= 0); + output_binding_map[output_idx] = binding; + } + } +}; +/*! + * \brief The binding or dependency information of each module output interface. + */ +struct PipelineConfig { + /*!\brief The key is the module index, this variable records all module pipeline configuration + * information. + */ + std::unordered_map config; + OutputMap& operator[](int key) { + ICHECK(config.find(key) != config.end()); + return config[key]; + } + + void Insert(int key, const OutputMap& map) { config[key] = map; } + + /*!\brief This function is used to verify whether config is loaded successfully. + * \return Return true to indicate that this class has not been successfully loaded. + */ + bool Empty() { return config.empty(); } + + /*! + * \brief Get the number of global outputs. + * \return The number of outputs the entire pipeline has. + */ + size_t GetGlobalOutputNum() const { + size_t num_output = 0; + for (auto mod_output : config) { + num_output += mod_output.second.GetGlobalOutputNum(); + } + return num_output; + } +}; +/*! + * \brief The information used to initialize the graph executor module, the information + * come from the export library function call. + */ +struct GraphModuleLoadInfo { + GraphModuleLoadInfo(const std::string& lib, const std::string& json, const std::string& params, + const std::string& device) + : lib_name(lib), json_name(json), params_name(params), dev(device) {} + GraphModuleLoadInfo() { ; } + std::string lib_name; + std::string json_name; + std::string params_name; + std::string dev; +}; +/*! 
The Module information of each module.The 'int' is module index. */ +using ModuleConfig = std::unordered_map; +#endif // TVM_RUNTIME_PIPELINE_PIPELINE_STRUCT_H_ diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index bd59be87f7d9..a1d06fc8cab8 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -160,6 +161,51 @@ void Profiler::Stop() { } } +std::vector ToShape(NDArray shape_tensor) { + std::vector shape; + auto rank = shape_tensor.Shape().size(); + auto dtype = shape_tensor.DataType(); + + // For 0-rank shapes we need to allocate a single scalar. + if (rank == 0) { + return shape; + } + + // Otherwise we should be rank-1, and we will extract the number of dimensions + // for the output vector. + ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, found " << rank; + int64_t ndim = shape_tensor.Shape().at(0); + shape.resize(ndim); + + const DLTensor* dl_tensor = shape_tensor.operator->(); + if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) { + int32_t* dims = reinterpret_cast(dl_tensor->data); + shape.assign(dims, dims + ndim); + } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) { + int64_t* dims = reinterpret_cast(dl_tensor->data); + shape.assign(dims, dims + ndim); + } else { + LOG(FATAL) << "invalid shape tensor datatype: " << dtype; + } + + return shape; +} + +String ShapeString(NDArray shape, DLDataType dtype) { return ShapeString(ToShape(shape), dtype); } + +String ShapeString(const std::vector& shape, DLDataType dtype) { + std::stringstream sizes; + sizes << dtype << "["; + for (size_t i = 0; i < shape.size(); i++) { + if (i != 0) { + sizes << ", "; + } + sizes << shape[i]; + } + sizes << "]"; + return String(sizes.str()); +} + String ShapeString(const std::vector& shapes) { std::stringstream sizes; for (const NDArray& ary : shapes) { @@ -181,7 +227,7 @@ String ShapeString(const std::vector& 
shapes) { String ReportNode::AsCSV() const { // get unique headers - std::unordered_set unique_headers; + std::set unique_headers; for (auto row : calls) { for (auto p : row) { @@ -311,6 +357,9 @@ String ReportNode::AsTable(bool sort, bool aggregate) const { if (frame.find("Argument Shapes") != frame.end()) { name += Downcast(frame["Argument Shapes"]); } + if (frame.find("Device") != frame.end()) { + name += Downcast(frame["Device"]); + } if (aggregates.find(name) == aggregates.end()) { aggregates[name] = {i}; @@ -404,7 +453,7 @@ String ReportNode::AsTable(bool sort, bool aggregate) const { } // Table formatting - std::unordered_set unique_headers; + std::set unique_headers; for (auto row : aggregated_calls) { for (auto p : row) { @@ -412,10 +461,11 @@ String ReportNode::AsTable(bool sort, bool aggregate) const { } } - std::vector headers = {"Name", "Duration (us)", - "Percent"}; // always include these headers + // always include these headers in this order + std::vector headers = {"Name", "Duration (us)", "Percent", + "Device", "Count", "Argument Shapes"}; for (auto header : unique_headers) { - if (header != "Name" && header != "Duration (us)" && header != "Percent") { + if (std::find(headers.begin(), headers.end(), header) == headers.end()) { headers.push_back(header); } } diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index d6575c35d10d..cd2d1332580b 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -25,6 +25,7 @@ #include "vm.h" #include +#include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +98,58 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { } } +void VirtualMachineDebug::OpStartHook(Instruction instr) { + if (prof_ && prof_.operator*().IsRunning()) { + if (instr.op == Opcode::LoadConst) { + Device dev = GetDevice(exec_->const_device_type[instr.const_index]); + prof_.operator*().StartCall("VM::LoadConst", dev, 
{}); + } else if (instr.op == Opcode::DeviceCopy) { + Device dst_dev; + dst_dev.device_type = static_cast(instr.dst_device_type); + dst_dev.device_id = 0; + prof_.operator*().StartCall("VM::DeviceCopy", dst_dev, {}); + } else if (instr.op == Opcode::ReshapeTensor) { + prof_.operator*().StartCall("VM::ReshapeTensor", devices_[1], {}); + } else if (instr.op == Opcode::AllocTensor) { + auto shape = std::vector(instr.alloc_tensor.ndim); + + for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { + shape[i] = instr.alloc_tensor.shape[i]; + } + auto storage_obj = ReadRegister(instr.alloc_tensor.storage); + auto storage = Downcast(storage_obj); + prof_.operator*().StartCall( + "VM::AllocTensor", storage->buffer.device, + {{"Argument Shapes", profiling::ShapeString(shape, instr.alloc_tensor.dtype)}}); + } else if (instr.op == Opcode::AllocTensorReg) { + auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); + auto storage = Downcast(storage_obj); + Device cpu_dev = GetDevice(static_cast(kDLCPU)); + auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); + NDArray shape_tensor = Downcast(shape_obj).CopyTo(cpu_dev); + prof_.operator*().StartCall( + "VM::AllocTensorReg", storage->buffer.device, + {{"Argument Shapes", + profiling::ShapeString(shape_tensor, instr.alloc_tensor_reg.dtype)}}); + } else if (instr.op == Opcode::AllocStorage) { + auto size = LoadScalarInt(instr.alloc_storage.allocation_size); + std::ostringstream shape; + shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]"; + prof_.operator*().StartCall("VM::AllocStorage", + {static_cast(instr.alloc_storage.device_type), 0}, + {{"VM::Argument Shapes", String(shape.str())}}); + } else { + prof_.operator*().StartCall("VM::UnknownOp", devices_[1], {}); + } + } +} + +void VirtualMachineDebug::OpStopHook() { + if (prof_ && prof_.operator*().IsRunning()) { + prof_.operator*().StopCall(); + } +} + void VirtualMachineDebug::InvokePacked(Index packed_index, const 
PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) { ICHECK(exec_); diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h index 1efefda52b97..4325fa8a7999 100644 --- a/src/runtime/vm/profiler/vm.h +++ b/src/runtime/vm/profiler/vm.h @@ -51,6 +51,8 @@ class VirtualMachineDebug : public VirtualMachine { private: void InvokePacked(Index packed_index, const PackedFunc& func, Index arg_count, Index output_size, const std::vector& args) final; + void OpStartHook(Instruction instr) final; + void OpStopHook() final; std::unordered_map packed_index_map_; dmlc::optional prof_; diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index c7a1baa1430d..addd5ca5d861 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -113,6 +113,9 @@ std::vector ToShape(NDArray shape_tensor) { return shape; } +void VirtualMachine::OpStartHook(Instruction instr) {} +void VirtualMachine::OpStopHook() {} + PackedFunc VirtualMachine::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "invoke") { @@ -400,11 +403,9 @@ inline void VirtualMachine::WriteRegister(Index r, const ObjectRef& val) { frames_.back().register_file[r] = val; } -inline ObjectRef VirtualMachine::ReadRegister(Index r) const { - return frames_.back().register_file[r]; -} +ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().register_file[r]; } -inline int64_t VirtualMachine::LoadScalarInt(Index r) const { +int64_t VirtualMachine::LoadScalarInt(Index r) const { int64_t result = 0; const auto& obj = ReadRegister(r); NDArray array = Downcast(CopyTo(obj, {kDLCPU, 0})); @@ -458,6 +459,11 @@ void VirtualMachine::RunLoop() { throw std::runtime_error("VM encountered fatal error"); } case Opcode::LoadConst: { + bool is_not_cached = const_pool_.size() <= static_cast(instr.const_index) || + !const_pool_[instr.const_index].defined(); + if (is_not_cached) { + OpStartHook(instr); + } auto constant_obj = 
exec_->constants[instr.const_index]; // We cache the allocated object in the constant pool. To measure, the // first iteration will set the pool up. The other iterations will @@ -471,6 +477,9 @@ void VirtualMachine::RunLoop() { const_pool_[instr.const_index] = CopyTo(constant_obj, dev); } WriteRegister(instr.dst, const_pool_[instr.const_index]); + if (is_not_cached) { + OpStopHook(); + } pc_++; goto main_loop; } @@ -560,6 +569,7 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::AllocTensor: { + OpStartHook(instr); auto shape = std::vector(instr.alloc_tensor.ndim); for (uint32_t i = 0; i < instr.alloc_tensor.ndim; ++i) { @@ -572,10 +582,12 @@ void VirtualMachine::RunLoop() { auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype); WriteRegister(instr.dst, obj); + OpStopHook(); pc_++; goto main_loop; } case Opcode::AllocTensorReg: { + OpStartHook(instr); Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); @@ -586,6 +598,7 @@ void VirtualMachine::RunLoop() { auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype); WriteRegister(instr.dst, obj); + OpStopHook(); pc_++; goto main_loop; } @@ -609,6 +622,7 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::AllocStorage: { + OpStartHook(instr); auto size = LoadScalarInt(instr.alloc_storage.allocation_size); auto alignment = instr.alloc_storage.alignment; @@ -625,6 +639,7 @@ void VirtualMachine::RunLoop() { storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); + OpStopHook(); pc_++; goto main_loop; } @@ -656,6 +671,7 @@ void VirtualMachine::RunLoop() { } } case Opcode::ReshapeTensor: { + OpStartHook(instr); Device cpu_dev = GetDevice(static_cast(kDLCPU)); auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); NDArray tensor_arr 
= Downcast(tensor_obj); @@ -671,10 +687,12 @@ void VirtualMachine::RunLoop() { // Reshape the input tensor auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype); WriteRegister(instr.dst, out_tensor); + OpStopHook(); pc_++; goto main_loop; } case Opcode::DeviceCopy: { + OpStartHook(instr); auto tensor_src = ReadRegister(instr.src); NDArray src_data = Downcast(tensor_src); Device src_dev = src_data->device; @@ -686,6 +704,7 @@ void VirtualMachine::RunLoop() { NDArray dst_data = src_data.CopyTo(dst_dev); WriteRegister(instr.dst, dst_data); + OpStopHook(); pc_++; goto main_loop; } diff --git a/src/target/codegen.cc b/src/target/codegen.cc index 5a4aa39f01b4..41221ad8a33e 100644 --- a/src/target/codegen.cc +++ b/src/target/codegen.cc @@ -47,6 +47,11 @@ runtime::Module Build(IRModule mod, Target target) { mod = tir::transform::SkipAssert()(mod); } + auto target_attr_map = tvm::TargetKind::GetAttrMap("TIRToRuntime"); + if (target_attr_map.count(target->kind)) { + return target_attr_map[target->kind](mod, target); + } + // the build function. std::string build_f_name = "target.build." 
+ target->kind->name; const PackedFunc* bf = runtime::Registry::Get(build_f_name); diff --git a/src/target/llvm/codegen_hexagon.cc b/src/target/llvm/codegen_hexagon.cc index bd22532d998c..5414366c1cd7 100644 --- a/src/target/llvm/codegen_hexagon.cc +++ b/src/target/llvm/codegen_hexagon.cc @@ -644,7 +644,7 @@ CodeGenLLVM::TypedPointer CodeGenHexagon::CreateStructRefPtr(DataType t, llvm::V } else { ICHECK(t.is_handle()); buf = builder_->CreatePointerCast(buf, t_tvm_value_->getPointerTo()); - buf = builder_->CreateInBoundsGEP(t_void_p_, buf, index); + buf = builder_->CreateInBoundsGEP(t_tvm_value_, buf, index); return TypedPointer(t_void_p_, builder_->CreatePointerCast(buf, t_void_p_->getPointerTo())); } } diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 12fbf2c3e42c..6c64f6798e47 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -77,6 +77,8 @@ void CodeGenLLVM::Init(const std::string& module_name, llvm::TargetMachine* tm, this->InitTarget(tm); } +void CodeGenLLVM::SetFastMathFlag(llvm::FastMathFlags fmf) { builder_->setFastMathFlags(fmf); } + void CodeGenLLVM::InitTarget(llvm::TargetMachine* tm) { module_->setTargetTriple(tm->getTargetTriple().str()); module_->setDataLayout(tm->createDataLayout()); @@ -343,7 +345,26 @@ void CodeGenLLVM::Optimize() { // place optimization pass llvm::PassManagerBuilder builder; - builder.OptLevel = 3; + + // Use the same opt-level as specified in TargetMachine for running passes + llvm::CodeGenOpt::Level opt_level = target_machine_->getOptLevel(); + + switch (opt_level) { + case llvm::CodeGenOpt::Level::None: + builder.OptLevel = 0; + break; + case llvm::CodeGenOpt::Level::Less: + builder.OptLevel = 1; + break; + + case llvm::CodeGenOpt::Level::Default: + builder.OptLevel = 2; + break; + + default: + // CodeGenOpt::Level::Aggressive + builder.OptLevel = 3; + } #if TVM_LLVM_VERSION >= 50 builder.Inliner = llvm::createFunctionInliningPass(builder.OptLevel, 0, false); 
@@ -410,7 +431,7 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const { } else { return etype; } -} +} // namespace codegen llvm::Type* CodeGenLLVM::GetLLVMType(const Type& type) const { if (auto* ptr = type.as()) { @@ -626,6 +647,20 @@ llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) { } llvm::Value* CodeGenLLVM::CreateVecConcat(std::vector vecs) { + // To allow creating vectors from scalars, convert any scalars in "vecs" to single-lane + // LLVM vector types. + for (size_t i = 0, e = vecs.size(); i != e; ++i) { + llvm::Value* v = vecs[i]; + if (!v->getType()->isVectorTy()) { +#if TVM_LLVM_VERSION >= 110 + llvm::Type* vec_ty = llvm::FixedVectorType::get(v->getType(), 1); +#else + llvm::Type* vec_ty = llvm::VectorType::get(v->getType(), 1); +#endif + vecs[i] = builder_->CreateInsertElement(llvm::UndefValue::get(vec_ty), v, ConstInt32(0)); + } + } + // concat vector, tree shape reduction int total_lanes = 0; diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 177b53056354..4a9df65951c0 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -78,6 +78,13 @@ class CodeGenLLVM : public ExprFunctor, */ virtual void Init(const std::string& module_name, llvm::TargetMachine* tm, llvm::LLVMContext* ctx, bool system_lib, bool dynamic_lookup, bool target_c_runtime); + + /*! + * \brief Turn on fast math flags for floating point operations. + * \param fmf FastMathFlags to use for code generation. + */ + void SetFastMathFlag(llvm::FastMathFlags fmf); + /*! * \brief Compile and add function f to the current module. * \param f The function to be added. 
diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index be80a8bc767e..06b2be2d9fb6 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -106,6 +106,8 @@ void ParseLLVMTargetOptions(const Target& target, std::string* triple, std::stri #if TVM_LLVM_VERSION < 50 opt.LessPreciseFPMADOption = true; #endif + // In clang, these are fed from LangOpts which describe language specific features + // TODO(AndrewZhaoLuo): figure out how these relate to fast math flags opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; opt.UnsafeFPMath = false; opt.NoInfsFPMath = false; @@ -139,8 +141,22 @@ std::unique_ptr GetLLVMTargetMachine(const Target& target, ICHECK(allow_null) << err << " target_triple=" << target_triple; return nullptr; } - llvm::TargetMachine* tm = - llvm_target->createTargetMachine(target_triple, mcpu, mattr, opt, llvm::Reloc::PIC_); + + Integer llvm_opt_level = target->GetAttr("opt-level").value_or(Integer(3)); + llvm::CodeGenOpt::Level llvm_opt; + if (llvm_opt_level <= 0) { + llvm_opt = llvm::CodeGenOpt::None; + } else if (llvm_opt_level == 1) { + llvm_opt = llvm::CodeGenOpt::Less; + } else if (llvm_opt_level == 2) { + llvm_opt = llvm::CodeGenOpt::Default; + } else { + // llvm_opt_level >= 3 + llvm_opt = llvm::CodeGenOpt::Aggressive; + } + + llvm::TargetMachine* tm = llvm_target->createTargetMachine( + target_triple, mcpu, mattr, opt, llvm::Reloc::PIC_, llvm::CodeModel::Small, llvm_opt); return std::unique_ptr(tm); } diff --git a/src/target/llvm/llvm_common.h b/src/target/llvm/llvm_common.h index b967c7ad44e0..fcc44fb8f95c 100644 --- a/src/target/llvm/llvm_common.h +++ b/src/target/llvm/llvm_common.h @@ -72,7 +72,11 @@ #include #include #include +#if TVM_LLVM_VERSION >= 140 +#include +#else #include +#endif #include #include #include diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 0e4bca4396f5..657778df0e93 100644 --- a/src/target/llvm/llvm_module.cc +++ 
b/src/target/llvm/llvm_module.cc @@ -258,8 +258,53 @@ class LLVMModuleNode final : public runtime::ModuleNode { // makes sense when we start to use multiple modules. cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime); - cg->AddFunctionsOrdered(funcs.begin(), funcs.end()); + // See https://llvm.org/docs/LangRef.html#fast-math-flags for details + Bool fast_math_all = target->GetAttr("fast-math").value_or(Bool(false)); + Bool fast_math_nnan = target->GetAttr("fast-math-nnan").value_or(Bool(false)); + Bool fast_math_ninf = target->GetAttr("fast-math-ninf").value_or(Bool(false)); + Bool fast_math_nsz = target->GetAttr("fast-math-nsz").value_or(Bool(false)); + Bool fast_math_arcp = target->GetAttr("fast-math-arcp").value_or(Bool(false)); + + llvm::FastMathFlags fmf; + if (fast_math_all) { +#if TVM_LLVM_VERSION >= 60 + fmf.setFast(); +#else + fmf.setUnsafeAlgebra(); +#endif + } + + if (fast_math_nnan) { + fmf.setNoNaNs(); + } + if (fast_math_ninf) { + fmf.setNoInfs(); + } + if (fast_math_nsz) { + fmf.setNoSignedZeros(); + } + if (fast_math_arcp) { + fmf.setAllowReciprocal(); + } + +#if TVM_LLVM_VERSION >= 60 + Bool fast_math_contract = target->GetAttr("fast-math-contract").value_or(Bool(false)); + Bool fast_math_afn = target->GetAttr("fast-math-afn").value_or(Bool(false)); + Bool fast_math_reassoc = target->GetAttr("fast-math-reassoc").value_or(Bool(false)); + if (fast_math_contract) { + fmf.setAllowContract(); + } + if (fast_math_afn) { + fmf.setApproxFunc(); + } + if (fast_math_reassoc) { + fmf.setAllowReassoc(); + } +#endif + cg->SetFastMathFlag(fmf); + + cg->AddFunctionsOrdered(funcs.begin(), funcs.end()); if (entry_func.length() != 0) { cg->AddMainFunction(entry_func); } diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h index 10a437a547c1..4ff1c6ef61ed 100644 --- a/src/target/source/codegen_c_host.h +++ b/src/target/source/codegen_c_host.h @@ -35,7 +35,7 @@ namespace tvm { namespace codegen { -class 
CodeGenCHost final : public CodeGenC { +class CodeGenCHost : public CodeGenC { public: CodeGenCHost(); void Init(bool output_ssa, bool emit_asserts, std::string target_str); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index d719386d204b..4403af26d1a8 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -49,6 +49,14 @@ Array TargetKindRegEntry::ListTargetKinds() { return TargetKindRegistry::Global()->ListAllNames(); } +Map TargetKindRegEntry::ListTargetKindOptions(const TargetKind& target_kind) { + Map options; + for (const auto& kv : target_kind->key2vtype_) { + options.Set(kv.first, kv.second.type_key); + } + return options; +} + TargetKindRegEntry& TargetKindRegEntry::RegisterOrGet(const String& target_kind_name) { return TargetKindRegistry::Global()->RegisterOrGet(target_kind_name); } @@ -222,6 +230,15 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("link-params", Bool(false)) .add_attr_option("unpacked-api") .add_attr_option("interface-api") + // Fast math flags, see https://llvm.org/docs/LangRef.html#fast-math-flags + .add_attr_option("fast-math") // implies all the below + .add_attr_option("fast-math-nnan") + .add_attr_option("fast-math-ninf") + .add_attr_option("fast-math-nsz") + .add_attr_option("fast-math-arcp") + .add_attr_option("fast-math-contract") + .add_attr_option("fast-math-reassoc") + .add_attr_option("opt-level") .set_default_keys({"cpu"}); TVM_REGISTER_TARGET_KIND("c", kDLCPU) @@ -359,5 +376,7 @@ TVM_REGISTER_TARGET_KIND("composite", kDLCPU).add_attr_option>("de /********** Registry **********/ TVM_REGISTER_GLOBAL("target.ListTargetKinds").set_body_typed(TargetKindRegEntry::ListTargetKinds); +TVM_REGISTER_GLOBAL("target.ListTargetKindOptions") + .set_body_typed(TargetKindRegEntry::ListTargetKindOptions); } // namespace tvm diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index a47556bac101..aa164b03a2a7 100644 --- a/src/te/operation/create_primfunc.cc 
+++ b/src/te/operation/create_primfunc.cc @@ -102,12 +102,6 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te:: f_push_block_vars(compute_op->axis); f_push_block_vars(compute_op->reduce_axis); - // If we have a rank 0 tensor then we manifest it as a rank 1 buffer with a single element. - if (compute_op->axis.size() == 0) { - iter_vars.push_back(IterVar(Range::FromMinExtent(0, 1), Var(), IterVarType::kDataPar)); - bindings.push_back(Var()); - } - // Step 2. Declare buffer and update op2buffers Buffer buffer = decl_buffer(tensor->shape, tensor->dtype, tensor->GetNameHint(), "global"); info->tensor2buffers[tensor] = buffer; diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc index 447fc501d03b..9e2d3d0e725f 100644 --- a/src/te/operation/tensorize.cc +++ b/src/te/operation/tensorize.cc @@ -337,7 +337,8 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, } ICHECK(expr_equal(lhs, rhs)) << "Failed to match the compute with TensorIntrin " << intrin->name << "'s declaration " - << " provided= " << lhs << ", intrin= " << rhs; + << " provided= " << lhs << ", intrin= " << rhs + << ", running this stage: " << stage; } } diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc index f265a8ae2b1b..7e3d3d107507 100644 --- a/src/tir/ir/script/script_complete.cc +++ b/src/tir/ir/script/script_complete.cc @@ -44,21 +44,11 @@ class ScriptCompleter : public StmtMutator { Map* buffer_var_map_; Stmt VisitStmt_(const BlockRealizeNode* op) override { contains_block = true; - Stmt body = StmtMutator::VisitStmt_(op); - if (!op->iter_values.empty() && !op->iter_values[0].dtype().is_int()) { - auto block_with_binding = CopyOnWrite(Downcast(body).get()); - std::vector bindings; - for (size_t i = 0; i < op->iter_values.size(); ++i) { - bindings.push_back(Var("i" + std::to_string(i))); - } - block_with_binding->iter_values = bindings; - body = BlockRealize(block_with_binding); - for (int 
i = op->iter_values.size() - 1; i >= 0; --i) { - body = For(Downcast(bindings[i]), op->block->iter_vars[i]->dom->min, - op->block->iter_vars[i]->dom->extent, {}, body); - } + for (const PrimExpr& value : op->iter_values) { + CHECK(value.dtype().is_int()) + << "BlockRealize iter_value expected a IntImm, but got " << value.dtype(); } - return body; + return StmtMutator::VisitStmt_(op); } Stmt VisitStmt_(const BlockNode* op) override { @@ -122,7 +112,7 @@ PrimFunc ScriptComplete(PrimFunc func, const Array& root_allocates) { // generate surrounding loops automatically Stmt res = script_completer(func->body); // generate root block automatically - if (script_completer.contains_block && !contain_root) { + if ((script_completer.contains_block || root_allocates.size()) && !contain_root) { res = Block({}, {}, {}, "root", res, NullOpt, root_allocates); res = BlockRealize({}, Bool(true), Downcast(res)); } diff --git a/src/tir/transforms/convert_for_loops_serial.cc b/src/tir/transforms/convert_for_loops_serial.cc new file mode 100644 index 000000000000..d01ae8a45113 --- /dev/null +++ b/src/tir/transforms/convert_for_loops_serial.cc @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file tir/transforms/convert_for_loops_serial.cc + * \brief Convert all for loops to serial for lesser memory consumption + */ +#include +#include +#include +#include +#include + +namespace tvm { +namespace tir { + +class ForLoopSerialConverter : public StmtExprMutator { + public: + ForLoopSerialConverter() = default; + Stmt operator()(const PrimFunc& func); + + private: + Stmt VisitStmt_(const ForNode* op) override; +}; + +Stmt ForLoopSerialConverter::VisitStmt_(const ForNode* op) { + if (op->kind == ForKind::kParallel) { + return For(op->loop_var, op->min, op->extent, ForKind::kSerial, op->body, op->thread_binding, + op->annotations, op->span); + } + return StmtExprMutator::VisitStmt_(op); +} + +Stmt ForLoopSerialConverter::operator()(const PrimFunc& func) { + return this->VisitStmt(func->body); +} + +PrimFunc ConvertForLoopsToSerial(PrimFunc func) { + PrimFuncNode* fptr = func.CopyOnWrite(); + fptr->body = ForLoopSerialConverter()(func); + return func; +} + +namespace transform { + +Pass ConvertForLoopsToSerial() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return ConvertForLoopsToSerial(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.ConvertForLoopsToSerial", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.ConvertForLoopsToSerial") + .set_body_typed(ConvertForLoopsToSerial); + +} // namespace transform + +} // namespace tir +} // namespace tvm diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc index 6a3ce596c2fe..ccc660509ca1 100644 --- a/src/tir/transforms/storage_flatten.cc +++ b/src/tir/transforms/storage_flatten.cc @@ -60,6 +60,19 @@ using runtime::ThreadScope; */ class BufferShapeLegalize : public StmtExprMutator { public: + static transform::Pass Pass() { + auto pass_func = [](PrimFunc func, IRModule m, transform::PassContext ctx) { + IRVisitorWithAnalyzer bound_analyzer; + + bound_analyzer(func->body); + + auto fptr = func.CopyOnWrite(); + fptr->body = 
BufferShapeLegalize(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.BufferShapeLegalize", {}); + } + explicit BufferShapeLegalize(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) : bound_analyzer_(bound_analyzer) { @@ -383,6 +396,19 @@ class BufferShapeLegalize : public StmtExprMutator { */ class BufferStrideLegalize : public StmtExprMutator { public: + static transform::Pass Pass() { + auto pass_func = [](PrimFunc func, IRModule m, transform::PassContext ctx) { + IRVisitorWithAnalyzer bound_analyzer; + + bound_analyzer(func->body); + + auto fptr = func.CopyOnWrite(); + fptr->body = BufferStrideLegalize(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.BufferStrideLegalize", {}); + } + explicit BufferStrideLegalize(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) : bound_analyzer_(bound_analyzer) { @@ -565,6 +591,15 @@ class BufferStrideLegalize : public StmtExprMutator { */ class ThreadScopePropagate : public StmtExprMutator { public: + static transform::Pass Pass() { + auto pass_func = [](PrimFunc func, IRModule m, transform::PassContext ctx) { + auto fptr = func.CopyOnWrite(); + fptr->body = ThreadScopePropagate(fptr->buffer_map)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.ThreadScopePropagate", {}); + } + explicit ThreadScopePropagate(const Map& extern_buffer_map) { // External buffers shouldn't be overwritten, even if they have a // BufferRealizeNode. 
@@ -718,6 +753,19 @@ class ThreadScopePropagate : public StmtExprMutator { */ class BufferBindUnwrapper : public StmtExprMutator { public: + static transform::Pass Pass() { + auto pass_func = [](PrimFunc func, IRModule m, transform::PassContext ctx) { + IRVisitorWithAnalyzer bound_analyzer; + + bound_analyzer(func->body); + + auto fptr = func.CopyOnWrite(); + fptr->body = BufferBindUnwrapper(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.BufferBindUnwrapper", {}); + } + explicit BufferBindUnwrapper(const Map& extern_buffer_map, IRVisitorWithAnalyzer* bound_analyzer) : bound_analyzer_(bound_analyzer) { @@ -1030,6 +1078,20 @@ class BufferBindUnwrapper : public StmtExprMutator { class StorageFlattener : public StmtExprMutator { public: + static transform::Pass Pass(int cache_line_size, bool create_bound_attributes) { + auto pass_func = [=](PrimFunc func, IRModule m, transform::PassContext ctx) { + IRVisitorWithAnalyzer bound_analyzer; + + bound_analyzer(func->body); + + auto fptr = func.CopyOnWrite(); + fptr->body = StorageFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, + &bound_analyzer)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.StorageFlattener", {}); + } + explicit StorageFlattener(const Map& extern_buffer_map, int cache_line_size, bool create_bound_attributes, IRVisitorWithAnalyzer* bound_analyzer) : bound_analyzer_(bound_analyzer), create_bound_attributes_(create_bound_attributes) { @@ -1355,6 +1417,19 @@ class StorageFlattener : public StmtExprMutator { */ class AssertSimplifier : public StmtMutator { public: + static transform::Pass Pass() { + auto pass_func = [=](PrimFunc func, IRModule m, transform::PassContext ctx) { + IRVisitorWithAnalyzer bound_analyzer; + + bound_analyzer(func->body); + + auto fptr = func.CopyOnWrite(); + fptr->body = 
AssertSimplifier(&bound_analyzer)(std::move(fptr->body)); + return func; + }; + return transform::CreatePrimFuncPass(pass_func, 0, "tir.AssertSimplifier", {}); + } + explicit AssertSimplifier(IRVisitorWithAnalyzer* bound_analyzer) : bound_analyzer_(bound_analyzer) {} @@ -1409,30 +1484,25 @@ class AssertSimplifier : public StmtMutator { // We do support a few relaxed case, such as binding a // region with shape [1, 1, n, m] to buffer with shape [n, m] PrimFunc StorageFlatten(PrimFunc func, int cache_line_size, bool create_bound_attributes) { - // Only apply this pass to TIR from TE schedules + // Only apply this pass to TIR from TE schedules. Because this is a + // per-function attribute, we can't just check it once for the + // entire module and apply the Sequential transform. Optional from_legacy_te_schedule = func->GetAttr("from_legacy_te_schedule", Bool(false)); if (from_legacy_te_schedule.value()) { - auto fptr = func.CopyOnWrite(); - - IRVisitorWithAnalyzer bound_analyzer; - bound_analyzer(fptr->body); - - fptr->body = BufferShapeLegalize(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); - - auto stride_legalize = BufferStrideLegalize(fptr->buffer_map, &bound_analyzer); - fptr->body = stride_legalize(std::move(fptr->body)); - fptr->buffer_map = stride_legalize.UpdatedExternBufferMap(); - - fptr->body = ThreadScopePropagate(fptr->buffer_map)(std::move(fptr->body)); - - fptr->body = BufferBindUnwrapper(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); - - fptr->body = StorageFlattener(fptr->buffer_map, cache_line_size, create_bound_attributes, - &bound_analyzer)(std::move(fptr->body)); - - fptr->body = AssertSimplifier(&bound_analyzer)(std::move(fptr->body)); - - return func; + auto seq = transform::Sequential( + { + BufferShapeLegalize::Pass(), + BufferStrideLegalize::Pass(), + ThreadScopePropagate::Pass(), + BufferBindUnwrapper::Pass(), + StorageFlattener::Pass(cache_line_size, create_bound_attributes), + AssertSimplifier::Pass(), + }, + 
"tir.StorageFlatten_impl"); + GlobalVar dummy_func_name("dummy_func"); + IRModule mod(Map({{dummy_func_name, func}})); + mod = seq(mod); + return Downcast(mod->Lookup(dummy_func_name)); } else { return func; } diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index 2e8ba11c0262..6106eb2225e1 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -152,8 +152,18 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->GetAttr("link-params"), false); } -TEST(TargetKindRegistryListTargetKinds, Basic) { +TEST(TargetKindRegistry, ListTargetKinds) { Array names = TargetKindRegEntry::ListTargetKinds(); ICHECK_EQ(names.empty(), false); ICHECK_EQ(std::count(std::begin(names), std::end(names), "llvm"), 1); } + +TEST(TargetKindRegistry, ListTargetOptions) { + TargetKind llvm = TargetKind::Get("llvm").value(); + Map attrs = TargetKindRegEntry::ListTargetKindOptions(llvm); + ICHECK_EQ(attrs.empty(), false); + + ICHECK_EQ(attrs["mattr"], "Array"); + ICHECK_EQ(attrs["mcpu"], "runtime.String"); + ICHECK_EQ(attrs["system-lib"], "IntImm"); +} diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index 9deaa758639e..b174f9a78866 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -123,6 +123,7 @@ def _get_expected_pooling_codegen( "num_inputs": "1", "num_outputs": "1", "layout": [["NHWC"]], + "out_layout": [[""]], "shape": [[list(output_shape)]], "dtype": [[dtype]], "padding": [[str(p) for p in padding]], @@ -149,6 +150,7 @@ def _get_expected_global_pooling_codegen(shape, dtype, typef): "num_inputs": "1", "num_outputs": "1", "layout": [["NHWC"]], + "out_layout": [[""]], "shape": [[[1, 1, 1, shape[3]]]], "dtype": [[dtype]], }, diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index 8b0d3063a696..01a7ceb9ed56 100644 --- 
a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -29,7 +29,9 @@ import os import struct import numpy +import math from enum import IntEnum +import tensorflow as tf from ethosu.vela.register_command_stream_generator import CmdMode from ethosu.vela.register_command_stream_generator import cmd0 @@ -66,26 +68,6 @@ def __init__(self): self.npu_ops = set() -def parse_relay_tflite_model(tflite_model, input_tensor, input_shape, input_dtype): - mod_, params_ = relay.frontend.from_tflite( - tflite_model, - shape_dict={input_tensor: input_shape}, - dtype_dict={input_tensor: input_dtype}, - ) - return mod_, params_ - - -def parse_tflite_model(model_file): - try: - import tflite - - return tflite.Model.GetRootAsModel(model_file, 0) - except AttributeError: - import tflite.Model - - return tflite.Model.Model.GetRootAsModel(model_file, 0) - - def print_payload(payload): cmds = deserialize_command_stream(payload) for cmd_val in cmds: @@ -270,6 +252,58 @@ def flatten_numpy_data(data): return reshaped_data +class InputGenerator: + def __init__(self, random_state): + self._random_state = random_state + + def generate(self, size, dtype): + if dtype == numpy.float32: + print("random float32") + return self._random_state.uniform(-1, 1, size).astype(dtype) + else: + print("random (u)int min=%d max=%d", numpy.iinfo(dtype).min, numpy.iinfo(dtype).max) + low = numpy.iinfo(dtype).min + high = numpy.iinfo(dtype).max + 1 + return self._random_state.randint(low, high, size, dtype) + + +def generate_ref_data_tflite(model): + """ + This method generates reference data by running the specified model on tflite with random input data. + The random input data and generated output data are returned. 
+ """ + expected_output_data = {} + interpreter = tf.lite.Interpreter(model_content=model) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + # Initialize random generators with a fixed seed to get deterministic results + seed = 0 + random_state = numpy.random.RandomState(seed) + + inputgen = InputGenerator(random_state) + + # Generate input data + input_data = { + input_detail["name"]: inputgen.generate( + input_detail["shape"], + input_detail["dtype"], + ) + for input_detail in input_details + } + for index, value in enumerate(input_data.values()): + interpreter.set_tensor(index, value) + interpreter.invoke() + + expected_output_data = [ + interpreter.get_tensor(output_detail["index"]) for output_detail in output_details + ] + + return input_data, expected_output_data + + def generate_weights_data(shape, dtype): size = 1 for dim in shape: @@ -278,7 +312,7 @@ def generate_weights_data(shape, dtype): def get_convolutional_args(call, include_buffers=False, remove_constants=False): - """A method to extract the arguments from conv2d or depthwise2d extern call.""" + """A method to extract the arguments from conv2d or depthwise_conv2d extern call.""" args = call.args conv_args = [] remove_indices = [0] @@ -299,6 +333,44 @@ def get_convolutional_args(call, include_buffers=False, remove_constants=False): return conv_args +def compute_ofm_shape(ifm_shape, padding, kernel_shape, strides, dilation=[1, 1]): + assert len(strides) == 2 + assert len(dilation) == 2 + assert len(kernel_shape) == 2 + if padding.lower() == "valid": + h = math.ceil((ifm_shape[1] - (kernel_shape[0] - 1) * dilation[0]) / strides[0]) + w = math.ceil((ifm_shape[2] - (kernel_shape[1] - 1) * dilation[1]) / strides[1]) + if padding.lower() == "same": + h = math.ceil(ifm_shape[1] / strides[0]) + w = math.ceil(ifm_shape[2] / strides[1]) + ofm_shape = [ifm_shape[0], h, w, ifm_shape[3]] + return ofm_shape + + +def 
compute_padding_shape(ifm_shape, ofm_shape, padding, kernel_shape, strides, dilation=[1, 1]): + assert len(strides) == 2 + assert len(dilation) == 2 + assert len(kernel_shape) == 2 + if padding.lower() == "valid": + return [0, 0, 0, 0] + if padding.lower() == "same": + effective_kernel_shape = [ + dilation[0] * (kernel_shape[0] - 1) + 1, + dilation[1] * (kernel_shape[1] - 1) + 1, + ] + pad_along_height = max( + (ofm_shape[1] - 1) * strides[0] + effective_kernel_shape[0] - ifm_shape[1], 0 + ) + pad_along_width = max( + (ofm_shape[2] - 1) * strides[1] + effective_kernel_shape[1] - ifm_shape[2], 0 + ) + pad_top = pad_along_height // 2 + pad_bottom = pad_along_height - pad_top + pad_left = pad_along_width // 2 + pad_right = pad_along_width - pad_left + return [pad_top, pad_left, pad_bottom, pad_right] + + def make_ethosu_conv2d( ifm, ifm_channels, @@ -343,3 +415,48 @@ def make_ethosu_conv2d( ofm_layout=ofm_layout, ) return conv + + +def make_ethosu_depthwise_conv2d( + ifm, + channels, + kernel_shape, + padding, + strides, + dilation, + activation="NONE", + ifm_layout="NHWC", + ofm_layout="NHWC", + weight_dtype="int8", +): + # params + weight_shape = (channels, kernel_shape[0], kernel_shape[1], 1) + padding = get_pad_tuple(padding, kernel_shape) + + scale_bias_data = generate_weights_data((weight_shape[0], 10), "uint8") + scale_bias = relay.const(scale_bias_data, dtype="uint8") + weight_data = generate_weights_data(weight_shape, weight_dtype) + weight = relay.const(weight_data, dtype=weight_dtype) + depthwise = ethosu_ops.ethosu_depthwise_conv2d( + ifm, + weight, + scale_bias, + lut=relay.const([], dtype="int8"), + ifm_scale=0.6, + ifm_zero_point=11, + weight_zero_point=13, + ofm_scale=0.26, + ofm_zero_point=15, + kernel_shape=kernel_shape, + ofm_channels=channels, + strides=strides, + padding=padding, + dilation=dilation, + activation=activation, + clip_min=15 if activation == "CLIP" else 0, + clip_max=105 if activation == "CLIP" else 0, + upscale="NONE", + 
ifm_layout=ifm_layout, + ofm_layout=ofm_layout, + ) + return depthwise diff --git a/tests/python/contrib/test_ethosu/test_attr_passing.py b/tests/python/contrib/test_ethosu/test_attr_passing.py index a2fbe1888d2a..6b99a5c1e540 100644 --- a/tests/python/contrib/test_ethosu/test_attr_passing.py +++ b/tests/python/contrib/test_ethosu/test_attr_passing.py @@ -28,7 +28,9 @@ def test_compiler_attr(): } with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethosu.options": config}): with tvm.target.Target("c -device=micro_dev"): - assert util.get_accelerator_config() == config["accelerator_config"] + compiler_attrs = tvm.get_global_func("relay.ext.ethosu.get_compiler_attrs")() + accel_config_str = compiler_attrs.accelerator_config + assert accel_config_str == config["accelerator_config"] def test_compiler_attr_default(): @@ -37,7 +39,9 @@ def test_compiler_attr_default(): } with tvm.transform.PassContext(opt_level=3): with tvm.target.Target("c -device=micro_dev"): - assert util.get_accelerator_config() == default_config["accelerator_config"] + compiler_attrs = tvm.get_global_func("relay.ext.ethosu.get_compiler_attrs")() + accel_config_str = compiler_attrs.accelerator_config + assert accel_config_str == default_config["accelerator_config"] if __name__ == "__main__": diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 1944de5f94c0..4949d6814ab2 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -18,14 +18,12 @@ import pytest pytest.importorskip("ethosu.vela") -import os import numpy as np -import pathlib +import tflite.Model import tvm -import tvm.micro as micro +import tensorflow as tf from tvm import relay -from tvm.relay.backend.contrib import ethosu from tvm.relay.backend.contrib.ethosu import util from tvm.relay.op.contrib.ethosu import partition_for_ethosu from tests.python.relay.aot.aot_test_utils import generate_ref_data @@ 
-168,5 +166,93 @@ def create_graph_activation(input_tensor_name, input_tensor_shape, input_tensor_ infra.verify_source(compiled_models, accel_type) +@pytest.mark.parametrize("accel_type", ACCEL_TYPES) +@pytest.mark.parametrize("ifm_shape", [(1, 55, 55, 3), (1, 23, 32, 7)]) +@pytest.mark.parametrize( + "kernel_shape, activation", + [((3, 3), "relu"), ((1, 2), None)], +) +@pytest.mark.parametrize("padding", ["SAME", "VALID"]) +@pytest.mark.parametrize("strides, dilation", [((1, 1), (2, 2)), ((3, 2), (1, 1))]) +def test_tflite_depthwise_conv2d( + accel_type, + ifm_shape, + kernel_shape, + padding, + strides, + dilation, + activation, +): + dtype = "int8" + + def create_tflite_graph(): + class Model(tf.Module): + @tf.function + def depthwise_conv2d(self, x): + weight_shape = [kernel_shape[0], kernel_shape[1], ifm_shape[3], 1] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + # The input strides to the TensorFlow API needs to be of shape 1x4 + tf_strides = [1, strides[0], strides[1], 1] + op = tf.nn.depthwise_conv2d( + x, weight, strides=tf_strides, padding=padding, dilations=dilation + ) + if activation: + op = tf.nn.relu(op) + return op + + model = Model() + concrete_func = model.depthwise_conv2d.get_concrete_function( + tf.TensorSpec(ifm_shape, dtype=tf.float32) + ) + + # Convert the model + def representative_dataset(): + for _ in range(100): + data = np.random.rand(*tuple(ifm_shape)) + yield [data.astype(np.float32)] + + converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return tflite_model + + tflite_graph = create_tflite_graph() + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 
0) + + relay_module, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": ifm_shape}, + dtype_dict={"input": dtype}, + ) + mod = partition_for_ethosu(relay_module, params) + + # Generate reference data + input_data, output_data = infra.generate_ref_data_tflite(tflite_graph) + + compiled_models = infra.build_source( + mod, + input_data, + output_data, + accel_type, + ) + + # Assumes only two runtime.Modules are created -- i.e. single offload module + imported_modules = compiled_models[0].executor_factory.lib.imported_modules + assert len(imported_modules) == 2 + ethosu_module = imported_modules[0] + + # Verify generated C source + get_cs = tvm._ffi.get_global_func("runtime.module.ethosu.getcs") + cmms = get_cs(ethosu_module) + cmms = bytes.fromhex(cmms) + + infra.print_payload(cmms) + infra.verify_source(compiled_models, accel_type) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 60ed352edcfd..5b60102162be 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -64,10 +64,10 @@ def main(placeholder: T.handle, ethosu_write: T.handle, placeholder_1: T.handle, def test_weight_stream_only(): - def _planner(te_graph, const_dict, sch): - weights = te_graph.inputs[1] - bias = te_graph.inputs[2] - out = te_graph.outputs[0] + def _planner(cached_func, const_dict, sch): + weights = cached_func.inputs[1] + bias = cached_func.inputs[2] + out = cached_func.outputs[0] conv_compute = Convolution2DCompute.from_output(out) co = conv_compute.split(sch, 3, 2) cache_weights = sch.cache_read(weights, "global", [conv_compute.conv2d]) @@ -208,10 +208,10 @@ def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle def test_mixed_read(): - def _planner(te_graph, const_dict, sch): - weight = te_graph.inputs[4] - scale_bias = 
te_graph.inputs[5] - out = te_graph.outputs[0] + def _planner(cached_func, const_dict, sch): + weight = cached_func.inputs[4] + scale_bias = cached_func.inputs[5] + out = cached_func.outputs[0] conv_compute = Convolution2DCompute.from_output(out) co = conv_compute.split(sch, 3, 2) cache_weight = sch.cache_read(weight, "global", [conv_compute.conv2d]) diff --git a/tests/python/contrib/test_ethosu/test_legalize.py b/tests/python/contrib/test_ethosu/test_legalize.py index 911a0e6eefc6..b9a588d4aec0 100644 --- a/tests/python/contrib/test_ethosu/test_legalize.py +++ b/tests/python/contrib/test_ethosu/test_legalize.py @@ -20,15 +20,33 @@ pytest.importorskip("ethosu.vela") import numpy as np +import tensorflow as tf +import tflite.Model import tvm from tvm import relay -from tvm.relay.backend.contrib import ethosu from tvm.relay.backend.contrib.ethosu import legalize, preprocess -from tvm.relay.dataflow_pattern import * -from tvm.relay.op.contrib.ethosu import * +from tvm.relay import dataflow_pattern +from tvm.relay.op.contrib import ethosu +from tvm.relay.build_module import bind_params_by_name from . import relay_ir_builder +from . 
import infra
+
+
+def partition_ethosu_by_table(mod, pattern_table):
+    """In case only the legalization part is supported for an operator, we don't
+    want to add the operator's pattern to the pattern table, so that the compiler
+    does not attempt to offload an operator without full stack support."""
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.MergeComposite(pattern_table)(mod)
+    mod = relay.transform.AnnotateTarget("ethosu")(mod)
+    mod = relay.transform.MergeCompilerRegions()(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = relay.transform.PartitionGraph()(mod)
+    mod = relay.transform.InferType()(mod)
+    mod = preprocess.preprocess_ext_io()(mod)
+    return mod


def test_split_indices_legalize():
@@ -294,7 +312,7 @@ def verify_linear(ext_func, conv2d_params):
     ]
     for test_case in test_cases:
         mod, conv_params = test_case[0](*test_case[1])
-        mod = partition_for_ethosu(mod)
+        mod = ethosu.partition_for_ethosu(mod)
         mod = legalize.LegalizeEthosUConv2D()(mod)
         verify_linear(mod["tvmgen_default_ethosu_main_0"], conv_params)
@@ -327,12 +345,123 @@ def create_graph_single_unsupported_ifm_layout(
     for test_case in test_cases:
         mod, conv_params = test_case[0](*test_case[1])
-        mod = partition_for_ethosu(mod)
+        mod = ethosu.partition_for_ethosu(mod)
         with pytest.raises(
             tvm._ffi.base.TVMError, match="EthosUCodegenError: Unsupported Layout NCHW"
         ):
             mod = legalize.LegalizeEthosUConv2D()(mod)
+
+@pytest.mark.parametrize("ifm_shape", [(1, 299, 299, 3), (1, 123, 17, 7)])
+@pytest.mark.parametrize("kernel_shape", [(7, 3), (22, 5)])
+@pytest.mark.parametrize("padding", ["SAME", "VALID"])
+@pytest.mark.parametrize("strides, dilation", [((1, 1), (2, 1)), ((3, 2), (1, 1))])
+@pytest.mark.parametrize("activation", ["RELU", None])
+def test_tflite_depthwise_conv_2d_legalize(
+    ifm_shape, kernel_shape, padding, strides, dilation, activation
+):
+    dtype = "int8"
+
+    def create_tflite_graph():
+        class Model(tf.Module):
+            @tf.function
+            def depthwise_conv2d(self, x):
+                weight_shape 
= [kernel_shape[0], kernel_shape[1], ifm_shape[3], 1] + weight = tf.constant(np.random.uniform(size=weight_shape), dtype=tf.float32) + # The input strides to the TensorFlow API needs to be of shape 1x4 + tf_strides = [1, strides[0], strides[1], 1] + op = tf.nn.depthwise_conv2d( + x, weight, strides=tf_strides, padding=padding, dilations=dilation + ) + if activation: + op = tf.nn.relu(op) + return op + + model = Model() + concrete_func = model.depthwise_conv2d.get_concrete_function( + tf.TensorSpec(ifm_shape, dtype=tf.float32) + ) + + # Convert the model + def representative_dataset(): + for _ in range(100): + data = np.random.rand(*tuple(ifm_shape)) + yield [data.astype(np.float32)] + + converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return tflite_model + + def verify(ext_func): + op = ext_func.body + ofm_channels = op.attrs.ofm_channels + + # check IFM + ifm = op.args[0].checked_type + assert list(ifm.shape) == list(ifm_shape) + assert str(ifm.dtype) == dtype + assert ifm.shape[3] == ofm_channels + + # check OFM + ofm = op.checked_type + expected_ofm_shape = infra.compute_ofm_shape( + ifm_shape, padding, kernel_shape, strides, dilation + ) + assert list(ofm.shape) == list(expected_ofm_shape) + assert str(ofm.dtype) == dtype + assert ofm.shape[3] == ofm_channels + + # check weights + weights_ohwi = op.args[1].data.asnumpy() + assert str(weights_ohwi.dtype) == dtype + assert weights_ohwi.shape[0] == ofm_channels + assert weights_ohwi.shape[1] == kernel_shape[0] + assert weights_ohwi.shape[2] == kernel_shape[1] + assert weights_ohwi.shape[3] == 1 # only depth multiplier 1 is supported + + # Check that scale_bias matches 
weight tensor + assert list(op.args[2].checked_type.shape)[0] == ofm_channels + + expected_padding = infra.compute_padding_shape( + ifm_shape, expected_ofm_shape, padding, kernel_shape, strides, dilation + ) + assert list(op.attrs.padding) == list(expected_padding) + assert op.attrs.ofm_channels == ofm_channels + assert list(op.attrs.strides) == list(strides) + assert list(op.attrs.dilation) == list(dilation) + if activation == "RELU": + assert str(op.attrs.activation) == "CLIP" + + depthwise_pattern_table = [ + ( + ethosu.QnnDepthwiseConv2DParams.composite_name, + ethosu.qnn_depthwise_conv2d_pattern(), + lambda pat: ethosu.QnnDepthwiseConv2DParams(pat).is_valid(), + ) + ] + + tflite_graph = create_tflite_graph() + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0) + + mod, params = relay.frontend.from_tflite( + tflite_model, + shape_dict={"input": ifm_shape}, + dtype_dict={"input": dtype}, + ) + + mod["main"] = bind_params_by_name(mod["main"], params) + mod = partition_ethosu_by_table(mod, depthwise_pattern_table) + + mod["tvmgen_default_ethosu_main_0"] = dataflow_pattern.rewrite( + legalize.EthosuDepthwiseConv2DRewriter(), mod["tvmgen_default_ethosu_main_0"] + ) + verify(mod["tvmgen_default_ethosu_main_0"]) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py new file mode 100644 index 000000000000..b3ce74c4e84a --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_replace_depthwise_conv2d.py @@ -0,0 +1,178 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm import relay +from tvm.relay.testing import run_opt_pass +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir +from .infra import make_ethosu_depthwise_conv2d, get_convolutional_args + + +@pytest.mark.parametrize( + "trial", + [ + [(1, 8, 8, 3), 3, (3, 2), (0, 0), (1, 1), (1, 1), "CLIP", "NHWC", "NHWC"], + [(1, 8, 8, 3), 3, (1, 1), (2, 1), (1, 1), (1, 1), "TANH", "NHWC", "NHWC"], + [(1, 8, 8, 3), 3, (1, 1), (0, 0), (1, 1), (1, 1), "NONE", "NHWC", "NHWC"], + [(1, 1, 1, 1), 1, (1, 1), (0, 0), (1, 1), (1, 1), "CLIP", "NHWC", "NHWC"], + [(1, 7, 9, 4), 4, (3, 2), (1, 2), (2, 1), (1, 2), "SIGMOID", "NHWC", "NHWC"], + [(1, 8, 2, 8, 16), 18, (1, 1), (2, 1), (1, 1), (1, 1), "CLIP", "NHCWB16", "NHWC"], + [(1, 7, 9, 40), 40, (3, 2), (1, 2), (2, 1), (1, 2), "CLIP", "NHWC", "NHCWB16"], + [(1, 4, 12, 9, 16), 182, (2, 3), (6, 3), (2, 2), (1, 1), "CLIP", "NHCWB16", "NHCWB16"], + [(1, 7, 9, 4), 4, (3, 2), (1, 2), (2, 1), (2, 2), "CLIP", "NHWC", "NHWC"], + [(1, 7, 9, 41), 41, (3, 2), (1, 2), (2, 1), (2, 2), "CLIP", "NHWC", "NHCWB16"], + [ + (1, 13, 12, 19, 16), + 182, + (1, 3), + (5, 3), + (2, 1), + (2, 1), + "CLIP", + "NHCWB16", + "NHCWB16", + ], + ], +) +def test_depthwise_conv2d_single(trial): + def _get_func( + ifm_shape, + channels, + kernel_shape, + padding, + strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ): + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + depthwise = make_ethosu_depthwise_conv2d( + ifm, + channels, + kernel_shape, + padding, + 
strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ) + func = relay.Function(relay.analysis.free_vars(depthwise), depthwise) + func = run_opt_pass(func, relay.transform.InferType()) + return func + + func = _get_func(*trial) + mod, _ = lower_to_tir(func) + data = [] + + def _visit(stmt): + if isinstance(stmt, tvm.tir.Call): + data.append(get_convolutional_args(stmt, remove_constants=True)) + + tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _visit) + ( + ifm_shape, + channels, + kernel_shape, + padding, + strides, + dilation, + activation, + ifm_layout, + ofm_layout, + ) = trial + dilated_kernel_h = (kernel_shape[0] - 1) * dilation[0] + 1 + dilated_kernel_w = (kernel_shape[1] - 1) * dilation[1] + 1 + if ifm_layout == "NHWC": + ifm_stride_c = 1 + ifm_stride_w = ifm_shape[3] + ifm_stride_h = ifm_shape[2] * ifm_shape[3] + ofm_height = (ifm_shape[1] - dilated_kernel_h + padding[0] + padding[0]) // strides[0] + 1 + ofm_width = (ifm_shape[2] - dilated_kernel_w + padding[1] + padding[1]) // strides[1] + 1 + else: + ifm_stride_w = 16 + ifm_stride_c = 16 * ifm_shape[3] + ifm_stride_h = 16 * ifm_shape[2] * ifm_shape[3] + ofm_height = (ifm_shape[1] - dilated_kernel_h + padding[0] + padding[0]) // strides[0] + 1 + ofm_width = (ifm_shape[3] - dilated_kernel_w + padding[1] + padding[1]) // strides[1] + 1 + + if ofm_layout == "NHWC": + ofm_stride_c = 1 + ofm_stride_w = channels if ofm_width > 1 else 1 + ofm_stride_h = channels * ofm_width if ofm_height > 1 else 1 + else: + ofm_stride_w = 16 + ofm_stride_c = 16 * ofm_width + ofm_stride_h = 16 * ofm_width * ((channels - 1) // 16 + 1) + + answer = [ + "int8", + ifm_shape[1], + ifm_shape[2] if ifm_layout == "NHWC" else ifm_shape[3], + channels, + ifm_shape[1], + 0, + ifm_shape[2] if ifm_layout == "NHWC" else ifm_shape[3], + 0, + 0, + 0, + 0, + 0.6, + 11, + ifm_layout, + ifm_stride_h, + ifm_stride_w, + ifm_stride_c, + "int8", + ofm_height, + ofm_width, + channels, + ofm_height, + 0, + ofm_width, + 0, + 0, + 0, + 
0, + 0.26, + 15, + ofm_layout, + ofm_stride_h, + ofm_stride_w, + ofm_stride_c, + kernel_shape[1], + kernel_shape[0], + strides[1], + strides[0], + dilation[1], + dilation[0], + 13, + padding[0], + padding[1], + padding[0], + padding[1], + activation, + 15 if activation == "CLIP" else 0, + 105 if activation == "CLIP" else 0, + "NONE", + ] + assert data[0] == answer, data[0] diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 8077271ed496..b04059011e8e 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -81,10 +81,10 @@ def test_inline_no_ops(): func = relay.Function(relay.analysis.free_vars(relu2), relu2) func = run_opt_pass(func, relay.transform.InferType()) - te_graph = lower_to_te(func) - sch = te.create_schedule([te_graph.outputs[0].op]) - inline_no_ops(te_graph, sch) - reshape_tensor = te_graph.outputs[0].op.input_tensors[0] + cached_func = lower_to_te(func) + sch = te.create_schedule([cached_func.outputs[0].op]) + inline_no_ops(cached_func, sch) + reshape_tensor = cached_func.outputs[0].op.input_tensors[0] slice_tensor = reshape_tensor.op.input_tensors[0].op.input_tensors[0] assert sch[reshape_tensor].attach_type == AttachType.kInline assert sch[slice_tensor].attach_type == AttachType.kInline @@ -114,11 +114,11 @@ def test_copy_constants(): func = run_opt_pass(func, relay.transform.InferType()) func, const_dict = extract_constants(func) - te_graph = lower_to_te(func) + cached_func = lower_to_te(func) - sch = te.create_schedule([te_graph.outputs[0].op]) + sch = te.create_schedule([cached_func.outputs[0].op]) planner = copy_constants() - planner(te_graph, const_dict, sch) + planner(cached_func, const_dict, sch) assert len(sch.stages) == 21 assert ".global" in sch.stages[5].op.name assert ".global" in sch.stages[7].op.name diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py 
b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index b07f3a5016fa..8240b392a1cf 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -497,6 +497,81 @@ def populate_ethosu_conv2d_calls(stmt): assert w_zero_point == ref["w_zero_point"] +# fmt: off +"""A ethosu_depthwise_conv2d tir testcase for the translator""" +@tvm.script.ir_module +class SingleEthosuDepthwiseConv2D: + @T.prim_func + def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_depthwise_conv2d: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_4 = T.match_buffer(placeholder_1, [3, 3, 2, 1], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_2, [3, 10], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_3 = T.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1) + ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [1, 6, 7, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1) + # body + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 3, 8, 0, 8, T.load("int8", placeholder_3.data, 0), 0, 0, 0, T.float32(0.6), 11, "NHWC", 24, 3, 1, "int8", 6, 7, 3, 6, 0, 7, T.load("int8", ethosu_depthwise_conv2d_1.data, 0), 0, 0, 0, T.float32(0.26), 15, "NHWC", 21, 3, 1, 2, 3, 1, 1, 1, 1, T.load("int8", placeholder_4.data, 0), 18, 13, T.load("uint8", placeholder_5.data, 0), 30, 0, 0, 0, 0, "CLIP", 15, 105, "NONE", dtype="int8")) + __tvm_meta__ = None +# fmt: on + + +def test_translate_ethosu_depthwise_conv2d(): + def extract_ethosu_depthwise_conv2d_extern_call(mod): + # There should only be a single function + assert len(mod.functions.items()) == 1 + primfunc = mod.functions.items()[0][1] + + ethosu_depthwise_conv2d_calls = list() + + def 
populate_ethosu_depthwise_conv2d_calls(stmt): + if ( + isinstance(stmt, tvm.tir.Call) + and stmt.op.name == "tir.call_extern" + and stmt.args[0] == "ethosu_depthwise_conv2d" + ): + ethosu_depthwise_conv2d_calls.append(stmt) + + stmt_functor.post_order_visit(primfunc.body, populate_ethosu_depthwise_conv2d_calls) + return ethosu_depthwise_conv2d_calls[0] + + depthwise_conv2d_call = extract_ethosu_depthwise_conv2d_extern_call(SingleEthosuDepthwiseConv2D) + npu_op, w_zero_point = tir_to_cs_translator.translate_ethosu_depthwise_conv2d( + depthwise_conv2d_call + ) + + assert npu_op.ifm.data_type == vapi.NpuDataType.INT8 + assert npu_op.ifm.shape == vapi.NpuShape3D(8, 8, 3) + assert npu_op.ifm.tiles.height_0 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).height_0 + assert npu_op.ifm.tiles.height_1 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).height_1 + assert npu_op.ifm.tiles.width_0 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).width_0 + assert npu_op.ifm.quantization == pytest.approx(vapi.NpuQuantization(0.6, 11)) + assert npu_op.ifm.layout == vapi.NpuLayout.NHWC + assert npu_op.ifm.strides == vapi.NpuShape3D(24, 3, 1) + # Compare OFM + assert npu_op.ofm.data_type == vapi.NpuDataType.INT8 + assert npu_op.ofm.shape == vapi.NpuShape3D(6, 7, 3) + assert npu_op.ofm.tiles.height_0 == vapi.NpuTileBox(6, 0, 8, [0, 0, 0, 0]).height_0 + assert npu_op.ofm.tiles.height_1 == vapi.NpuTileBox(6, 0, 7, [0, 0, 0, 0]).height_1 + assert npu_op.ofm.tiles.width_0 == vapi.NpuTileBox(6, 0, 7, [0, 0, 0, 0]).width_0 + assert npu_op.ofm.quantization == pytest.approx(vapi.NpuQuantization(0.26, 15)) + assert npu_op.ofm.layout == vapi.NpuLayout.NHWC + assert npu_op.ofm.strides == vapi.NpuShape3D(21, 3, 1) + # Compare kernel and padding + assert ( + npu_op.kernel.__dict__ + == vapi.NpuKernel(w=2, h=3, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1).__dict__ + ) + assert npu_op.padding == vapi.NpuPadding(top=0, left=0, bottom=0, right=0) + # Compare activation + assert npu_op.activation.op_type == 
vapi.NpuActivationOp.NONE_OR_RELU + assert npu_op.activation.min == 0 + assert npu_op.activation.max == pytest.approx(23.4) + # Compare ifm upscaling + assert npu_op.ifm_upscale == vapi.NpuResamplingMode.NONE + # Compare weight quantization parameters + assert w_zero_point == 13 + + def test_translate_ethosu_copy(): def extract_ethosu_copy_extern_calls(mod): """This function will obtain all ethosu_conv2d diff --git a/tests/python/contrib/test_ethosu/test_type_inference.py b/tests/python/contrib/test_ethosu/test_type_inference.py new file mode 100644 index 000000000000..47fddad773b2 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_type_inference.py @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import pytest + +pytest.importorskip("ethosu.vela") + +from tvm import relay +from tvm.relay.testing import run_opt_pass +from .infra import make_ethosu_conv2d +from .infra import make_ethosu_depthwise_conv2d + + +@pytest.mark.parametrize( + ["ifm_shape", "ifm_layout"], [((1, 56, 72, 55), "NHWC"), ((1, 56, 4, 72, 16), "NHCWB16")] +) +@pytest.mark.parametrize( + "ofm_shape,ofm_layout", [((1, 54, 38, 122), "NHWC"), ((1, 54, 8, 38, 16), "NHCWB16")] +) +def test_ethosu_conv2d_type_inference( + ifm_shape, + ifm_layout, + ofm_shape, + ofm_layout, +): + ifm_channels = 55 + ofm_channels = 122 + kernel_shape = (3, 2) + padding = (0, 1, 2, 3) + strides = (1, 2) + dilation = (2, 1) + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + conv2d = make_ethosu_conv2d( + ifm, + ifm_channels, + ofm_channels, + kernel_shape, + padding, + strides, + dilation, + ifm_layout=ifm_layout, + ofm_layout=ofm_layout, + ) + f = relay.Function([ifm], conv2d) + f = run_opt_pass(f, relay.transform.InferType()) + assert tuple(f.body.checked_type.shape) == ofm_shape + + +@pytest.mark.parametrize( + "ifm_shape, ifm_layout", [((1, 46, 71, 55), "NHWC"), ((1, 46, 4, 71, 16), "NHCWB16")] +) +@pytest.mark.parametrize( + "ofm_shape, ofm_layout", [((1, 44, 37, 55), "NHWC"), ((1, 44, 4, 37, 16), "NHCWB16")] +) +def test_ethosu_depthwise_conv2d_type_inference( + ifm_shape, + ifm_layout, + ofm_shape, + ofm_layout, +): + channels = 55 + kernel_shape = (3, 2) + padding = (0, 1, 2, 3) + strides = (1, 2) + dilation = (2, 1) + ifm = relay.var("ifm", shape=ifm_shape, dtype="int8") + depthwise_conv2d = make_ethosu_depthwise_conv2d( + ifm, + channels, + kernel_shape, + padding, + strides, + dilation, + ifm_layout=ifm_layout, + ofm_layout=ofm_layout, + ) + f = relay.Function([ifm], depthwise_conv2d) + f = run_opt_pass(f, relay.transform.InferType()) + assert tuple(f.body.checked_type.shape) == ofm_shape + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git 
a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py index 02c305387d45..cf845db2b43b 100644 --- a/tests/python/contrib/test_ethosu/test_vela_api.py +++ b/tests/python/contrib/test_ethosu/test_vela_api.py @@ -354,18 +354,17 @@ def create_mock(test_vec): max = np.iinfo(ifm_dtype).max min = np.iinfo(ifm_dtype).min values = np.random.randint(min, max, test_vec["shape"], ifm_dtype) - compressed_weights = vela_api.compress_weights( + vela_api.compress_weights( weights=values, weights_zp=test_vec["zero_point"], weights_layout=test_vec["layout"], ifm_bitdepth=ifm_bitdepth, block_depth=test_vec["block_depth"], dilation=test_vec["dilation"], - accel_type=test_vec["accel"], + accel_config=test_vec["accel"], is_depthwise=test_vec["is_depthwise"], ) return mock_npu_encode_weights - return None for tv in test_vecs: mock_obj = create_mock(tv) diff --git a/tests/python/contrib/test_hexagon/README.md b/tests/python/contrib/test_hexagon/README.md new file mode 100644 index 000000000000..1d6a298d48d6 --- /dev/null +++ b/tests/python/contrib/test_hexagon/README.md @@ -0,0 +1,431 @@ + + + + + + + + + + + + + + + + + +Documents manual TE schedule to illustrate Hexagon operator slicing. 
+ +# High Level Notes + +* Using float32 (for now) so that tests will pass on CPU +* Using global storage scope (for now) which means "cache" reads and writes from global, to global +* TIR is pending changes from the work-in-progress layout RFC + (https://github.com/apache/tvm-rfcs/pull/39) +* TIR has been hand-edited for context and clarity + * Added C-style comments + * Changed variable names + * Added spacing and line breaks +* Naming conventions + * Using input (instead of activation) + * Using kernel (instead of weight, filter) + * Using `k` to denote channel-out and `c` or `rc` (reduction channel) to denote channel-in + * Using `rh` and `rw` (reduction height / width) to denote kernel height and width + +# Calling Convention + +TODO: Map this packed string to parameters +conv2d_packed_filter-1-1-0-float32-1-1-64-64-64-llvm + +# Baseline conv2d + +This is a baseline 1x1 conv2d schedule for Hexagon. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-1-1-0-float32-1-1-64-64-64-llvm]" + +## Parameters + +| Parameter | Value | +| --------- | ----------- | +| Batch | 1 | +| Kernel | 1x1 | +| Spatial | 64x64 | +| Input Ch | 64 | +| Output Ch | 64 | +| Stride | 1 | +| Padding | 0 | +| Layout | NHWC8h8w32c | + +## Assumptions + +* Microkernels will compute "full depth" in channel-out (k) dimension. 
+ * The compute schedule (see TIR below) + * Places the outer channel-out loop over `ko` inside the outer width loop over `wo` + * Encodes the assumption that Hexagon microkernels will compute "full depth" in the channel-out (k) dimension + +## To Do + +* Adjust compute schedule and add kernel cache read once Hexagon microkernel semantics are understood + +## Annotated TIR + +``` +primfn(input_handle: handle, kernel_handle: handle, output_handle: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "default_function", "tir.noalias": True, "target": meta[Target][0]} + buffers = {output_buffer: Buffer(output_pointer: Pointer(float32), float32, [1, 8, 8, 2, 8, 8, 32], []), // NHWC8h8w32c + kernel_buffer: Buffer(kernel_pointer: Pointer(float32), float32, [2, 2, 1, 1, 8, 32, 4], []), // OIHW8i32o4i + input_buffer: Buffer(input_pointer: Pointer(float32), float32, [1, 64, 64, 64], [])} // NHWC (pending layout RFC) + buffer_map = {input_handle: input_buffer, kernel_handle: kernel_buffer, output_handle: output_buffer} { + + allocate(input.cache: Pointer(global float32), float32, [32768]), storage_scope = global; + allocate(output.cache: Pointer(global float32), float32, [32768]), storage_scope = global; + + for (ho.outer: int32, 0, 8) { + // cache read + // NHWC -> NHWC8h8w32c (pending layout RFC) + for (wo: int32, 0, 8) { + for (co: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + input.cache[(((((wo*4096) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)input_pointer[((((((ho.outer*32768) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)] + } + } + } + } + } + + // compute + for (wo.c: int32, 0, 8) { + for (ko.c: int32, 0, 2) { + + // init output cache + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + output.cache[(((((wo.c*4096) + (ko.c*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + + // convolution + for 
(rc.outer: int32, 0, 2) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + output.cache[(((((wo.c*4096) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)output.cache[(((((wo.c*4096) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)input.cache[(((((wo.c*4096) + (rc.outer*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner)] * + (float32*)kernel_pointer[(((((ko.c*2048) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } // end rc.outer + } // end ko.c + } // end wo.c + + // cache write + for (wo: int32, 0, 8) { + for (ko: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + output_pointer[((((((ho.outer*32768) + (wo*4096)) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] = + (float32*)output.cache[(((((wo*4096) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] + } + } + } + } + } + } +} +``` + +# Split on Height - "Full Output Slice" + +Adds a new parameter `h_split` which creates a loop split on the height `h` dimension. The cache reads and writes are moved to the outer of the two loops created by that split - the loop over `ho.outer`. This increases cache usage by a factor equivalent to `h_split`. The compute is still "full width" and "full depth" in the channel-out dimension and now over multiple slices in the height `h` dimension. + +The key changes in TIR versus the baseline are ... 
+ +1) Increased cache allocations: + +``` + allocate(input.cache: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(output.cache: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +2) The loop split on the `h` dimension: + +``` + for (ho.outer: int32, 0, 4) { + for (ho.inner: int32, 0, 2) { +``` + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-1-1-0-float32-2-1-64-64-64-llvm]" + +## Parameters + +| Parameter | Value | +| --------- | ----------- | +| Batch | 1 | +| Kernel | 1x1 | +| Spatial | 64x64 | +| Input Ch | 64 | +| Output Ch | 64 | +| Stride | 1 | +| Padding | 0 | +| Layout | NHWC8h8w32c | +| h_split | 2 | + +## Assumptions + +Same as baseline + +## To Do + +Same as baseline + +## Annotated TIR + +``` +primfn(input_handle: handle, kernel_handle: handle, output_handle: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "default_function", "tir.noalias": True, "target": meta[Target][0]} + buffers = {output_buffer: Buffer(output_pointer: Pointer(float32), float32, [1, 8, 8, 2, 8, 8, 32], []), + kernel_buffer: Buffer(kernel_pointer: Pointer(float32), float32, [2, 2, 1, 1, 8, 32, 4], []), + input_buffer: Buffer(input_pointer: Pointer(float32), float32, [1, 64, 64, 64], [])} + buffer_map = {input_handle: input_buffer, kernel_handle: kernel_buffer, output_handle: output_buffer} { + + // increased cache usage due to h_split parameter + allocate(input.cache: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(output.cache: Pointer(global float32), float32, [65536]), storage_scope = global; + + // loop split ho.outer vs. 
ho.inner based on h_split parameter + for (ho.outer: int32, 0, 4) { + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (co: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + input.cache[((((((ho.inner*32768) + (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)input_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)] + } + } + } + } + } + } + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (ko.c: int32, 0, 2) { + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + for (rc.outer: int32, 0, 2) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)input.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (rc.outer*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner)] * + (float32*)kernel_pointer[(((((ko.c*2048) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + } + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (ko: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + output_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (wo*4096)) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] = + (float32*)output.cache[((((((ho.inner*32768) + (wo*4096)) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] + } + } + } + } + } + } + } +} +``` + +# 3x3 conv2d (no padding) + +Change from a 1x1 kernel to a 3x3 kernel. 
The implication of this change is that `h_split + 1` rather than just `h_split` "full width" slices of the input are required to compute the output. This is due to the fact that the 3x3 kernel will "fall off the bottom" of the input and thus the vertically adjacent "full width" slice must be prefetched into the input cache. + +The key changes in TIR versus the above are... + +1) Increased input cache size to hold the vertically adjacent slice + +``` + allocate(input.cache: Pointer(global float32), float32, [98304]), storage_scope = global; +``` + +2) Loop over `ho.inner` upper bound increased from `h_split` = 2 to `h_split + 1` = 3 + +``` + for (ho.outer: int32, 0, 4) { + for (ho.inner: int32, 0, 3) { + if (((ho.outer*2) + ho.inner) < 8) { +``` + +The `if` statement above indicates NOT to prefetch the vertically adjacent slice at the "bottom" of the input since it does not exist. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-3-1-0-float32-2-1-64-64-64-llvm]" + +## Parameters + +| Parameter | Value | +| --------- | ----------- | +| Batch | 1 | +| Kernel | 3x3 | +| Spatial | 64x64 | +| Input Ch | 64 | +| Output Ch | 64 | +| Stride | 1 | +| Padding | 0 | +| Layout | NHWC8h8w32c | +| h_split | 2 | + +## Assumptions + +Same as above + +## To Do + +Same as above, and ... + +There may be some opportunity to optimize cache reuse in this case. 
Consider the loops over `ho.outer` and `ho.inner` and the index calculation `ho.outer * 64k + ho.inner * 32k` into the input pointer: + +| ho.outer | ho.inner | ho.outer * 64k + ho.inner * 32k | +| -------- | -------- | ------------------------------------- | +| 0 | 0 | 0 | +| 0 | 1 | 32k | +| 0 | 2 | 64k (vertical adjacent slice loop 0) | +| 1 | 0 | 64k | +| 1 | 1 | 96k | +| 1 | 2 | 128k (vertical adjacent slice loop 1) | +| 2 | 0 | 128k | +| 2 | 1 | 160k | +| 2 | 2 | 192k (vertical adjacent slice loop 2) | +| 3 | 0 | 192k | +| 3 | 1 | 224k | +| 3 | 2 | (No vertical adjacent slice loop 3) | + +Note that the vertically adjacent slice in loop N (i.e. the loop where `ho.outer` = N) is reused in loop N + 1. + +## Annotated TIR + +``` +primfn(input_handle: handle, kernel_handle: handle, output_handle: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "default_function", "tir.noalias": True, "target": meta[Target][0]} + buffers = {output_buffer: Buffer(output_pointer: Pointer(float32), float32, [1, 8, 8, 2, 8, 8, 32], []), + kernel_buffer: Buffer(kernel_pointer: Pointer(float32), float32, [2, 2, 3, 3, 8, 32, 4], []), + input_buffer: Buffer(input_pointer: Pointer(float32), float32, [1, 64, 64, 64], [])} + buffer_map = {input_handle: input_buffer, kernel_handle: kernel_buffer, output_handle: output_buffer} { + + // increased input cache size to hold vertically adjacent slice + allocate(input.cache: Pointer(global float32), float32, [98304]), storage_scope = global; + allocate(output.cache: Pointer(global float32), float32, [65536]), storage_scope = global; + for (ho.outer: int32, 0, 4) { + + // iterate over h_split + 1 = 3 input slices + for (ho.inner: int32, 0, 3) { + + // don't prefetch the vertically adjacent slice at the "bottom" of the input + if (((ho.outer*2) + ho.inner) < 8) { + for (wo: int32, 0, 8) { + for (co: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + input.cache[((((((ho.inner*32768) 
+ (wo*4096)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)input_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (hi*4096)) + (wo*512)) + (wi*64)) + (co*32)) + ci)] + } + } + } + } + } + } + } + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (ko.c: int32, 0, 2) { + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + for (rc.outer: int32, 0, 2) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (rh: int32, 0, 3) { + for (rw: int32, 0, 3) { + for (ki.c: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)output.cache[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)input.cache[((((((((floordiv((hi.c + rh), 8)*32768) + (ho.c.inner*32768)) + (floordiv((wi.c + rw), 8)*4096)) + (wo.c*4096)) + (rc.outer*2048)) + (floormod((hi.c + rh), 8)*256)) + (floormod((wi.c + rw), 8)*32)) + rc.inner)] * + (float32*)kernel_pointer[(((((((ko.c*18432) + (rc.outer*9216)) + (rh*3072)) + (rw*1024)) + (floordiv(rc.inner, 4)*128)) + (ki.c*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + } + } + } + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (ko: int32, 0, 2) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + output_pointer[(((((((ho.outer*65536) + (ho.inner*32768)) + (wo*4096)) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] = + (float32*)output.cache[((((((ho.inner*32768) + (wo*4096)) + (ko*2048)) + (hi*256)) + (wi*32)) + ki)] + } + } + } + } + } + } + } +} +``` \ No newline at end of file diff --git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py index 
e0b7fb20ab8e..37a623b613f8 100644 --- a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py +++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py @@ -162,6 +162,7 @@ def conv2d_packed_filter( stride, padding, dtype, + h_split_factor, storage_scope="global", ): """ @@ -260,15 +261,45 @@ def compute(n, ho, wo, ko, hi, wi, ki): s[X_pad].compute_inline() s[X_packed].compute_inline() - # Perform scheduling - n, hid, wid, cid, hoff, woff, coff = s[Y].op.axis - slice = s[Y].fuse(wid, cid) + # cache read for the input / activation (X) Xl = s.cache_read(X_packed, storage_scope, [Y]) + + # cache write for the output (Y) Yl = s.cache_write(Y, storage_scope) - s[Yl].compute_at(s[Y], hid) - n, hid, slice, hoff, woff, coff = s[Yl].op.axis - s[Xl].compute_at(s[Yl], slice) + ######################## + # cache write schedule # + ######################## + + # loop schedule corresponding with nhwc8h8w32c layout + # using k to represent output channel + n, ho, wo, ko, hi, wi, ki = s[Y].op.axis + + # loop split h and compute cache write at outer loop split + # to increase cache usage by factor of h_split_factor + hoo, hoi = s[Y].split(ho, factor=h_split_factor) + s[Yl].compute_at(s[Y], hoo) + + #################### + # compute schedule # + #################### + + # loop schedule corresponding with nhwc8h8w32c layout + # using k to represent output channel + n, ho, wo, ko, hi, wi, ki = s[Yl].op.axis + + # reduction axes + # using rc to represent (reduction) input channel + rh, rw, rc = s[Yl].op.reduce_axis + + # split input channel by the block size + rco, rci = s[Yl].split(rc, factor=block_C) + + # loop split h and compute cache write at outer loop split + # to increase cache usage by factor of h_split_factor + hoo, hoi = s[Yl].split(ho, factor=h_split_factor) + s[Yl].reorder(n, hoo, hoi, wo, ko, rco, hi, wi, ki, rci) + s[Xl].compute_at(s[Yl], hoo) binds = {} if storage_scope and storage_scope != "global": @@ -287,6 +318,7 @@ def conv2d_packed_filter_nhwhwc( stride, 
padding, dtype, + h_split_factor, storage_scope="global", ): """ @@ -299,7 +331,7 @@ def conv2d_packed_filter_nhwhwc( assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) block_shape = get_block_shape() - block_H, block_W, _ = block_shape + block_H, block_W, block_C = block_shape shape = get_packed_activation_layout(shape_nhwc, block_shape, packed_C=False) logical_output_shape = get_conv2d_nhwc_shape( shape_nhwc, @@ -372,18 +404,62 @@ def compute(n, ho, wo, hi, wi, k): s[X_pad].compute_inline() s[X_packed].compute_inline() + # cache read for the input / activation (X) + Xl = s.cache_read(X_packed, storage_scope, [Y]) + + # cache write for the output (Y) + Yl = s.cache_write(Y, storage_scope) + + ######################## + # cache write schedule # + ######################## + + # loop schedule corresponding with nhw8h8wc layout + # using k to represent output channel n, ho, wo, hi, wi, k = s[Y].op.axis - rh, rw, rc = s[Y].op.reduce_axis - rco, rci = s[Y].split(rc, factor=32) - s[Y].reorder(n, rco, wo, ho, k, hi, wi) - Xl = s.cache_read(X_packed, storage_scope, [Y]) - s[Xl].compute_at(s[Y], rco) + # split output channel by the block size + ko, ki = s[Y].split(k, factor=block_C) + + # loop split h and compute cache write at outer loop split + # to increase cache usage by factor of h_split_factor + hoo, hoi = s[Y].split(ho, factor=h_split_factor) + s[Y].reorder(n, hoo, hoi, wo, ko, hi, wi, ki) + s[Yl].compute_at(s[Y], hoo) + + #################### + # compute schedule # + #################### + + # loop schedule corresponding with nhw8h8wc layout + # using k to represent output channel + n, ho, wo, hi, wi, k = s[Yl].op.axis + + # reduction axes + # using rc to represent (reduction) input channel + rh, rw, rc = s[Yl].op.reduce_axis + + # split output & input channel by the block size + ko, ki = s[Yl].split(k, factor=block_C) + rco, rci = s[Yl].split(rc, factor=block_C) + + # loop split h and compute cache write at outer loop split + # to increase cache usage by factor of 
h_split_factor + hoo, hoi = s[Yl].split(ho, factor=h_split_factor) + s[Yl].reorder(n, hoo, hoi, wo, ko, rco, hi, wi, ki, rci) + s[Xl].compute_at(s[Yl], hoo) + + ####################### + # cache read schedule # + ####################### + + # loop schedule corresponding with nhw8h8wc layout + # using k to represent output channel + n, ho, wo, hi, wi, c = s[Xl].op.axis - ko, ki = s[Y].split(k, factor=32) - s[Y].reorder(n, rco, wo, ho, ko, hi, wi, ki) - Fl = s.cache_read(filt_packed, storage_scope, [Y]) - s[Fl].compute_at(s[Y], ko) + # split intput channel by the block size + co, ci = s[Xl].split(c, factor=block_C) + s[Xl].reorder(n, ho, wo, co, hi, wi, ci) binds = {} if storage_scope and storage_scope != "global": @@ -397,13 +473,14 @@ def compute(n, ho, wo, hi, wi, k): class BaseConv2d: batch = tvm.testing.parameter(1) - in_size = tvm.testing.parameter(8, 56) + in_size = tvm.testing.parameter(8, 56, 64) in_channel = tvm.testing.parameter(64) out_channel = tvm.testing.parameter(64) - kernel = tvm.testing.parameter(3) + kernel = tvm.testing.parameter(1, 3) stride = tvm.testing.parameter(1) - pad = tvm.testing.parameter(1) + pad = tvm.testing.parameter(0, 1) dtype = tvm.testing.parameter("float32") + h_split_factor = tvm.testing.parameter(1, 2) class TestConv2dLogical(BaseConv2d): @@ -445,6 +522,7 @@ def test_conv2d( pad, dtype, target, + h_split_factor, ): inputs = [ np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), @@ -465,6 +543,7 @@ def test_conv2d( stride=(stride, stride), padding=(pad, pad, pad, pad), dtype=dtype, + h_split_factor=h_split_factor, ) return output, ref_output diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 2e4687fb7985..9d44d8f22f41 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -397,7 +397,7 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn( tvmc_package = tvmc.compiler.compile_model( tvmc_model, - 
target=f"cmsis-nn, c -runtime=c --system-lib --link-params -mcpu=cortex-m55 --executor=aot", + target=f"cmsis-nn, c -runtime=c --system-lib --link-params -mcpu=cortex-m55 -executor=aot", output_format="mlf", package_path=output_file_name, pass_context_configs=["tir.disable_vectorize=true"], @@ -455,7 +455,7 @@ def test_compile_tflite_module_with_external_codegen_ethosu( tvmc_package = tvmc.compiler.compile_model( tvmc_model, - target=f"ethos-u -accelerator_config={accel_type}, c -runtime=c --system-lib --link-params -mcpu=cortex-m55 --executor=aot", + target=f"ethos-u -accelerator_config={accel_type}, c -runtime=c --system-lib --link-params -mcpu=cortex-m55 -executor=aot", output_format="mlf", package_path=output_file_name, pass_context_configs=["tir.disable_vectorize=true"], @@ -471,7 +471,11 @@ def test_compile_tflite_module_with_external_codegen_ethosu( for name in mlf_package.getnames() if re.match(r"\./codegen/host/src/\D+\d+\.c", name) ] - assert len(c_source_files) == 17 + # The number of c_source_files depends on the number of fused subgraphs that + # get offloaded to the NPU, e.g. conv2d->depthwise_conv2d->conv2d gets offloaded + # as a single subgraph if both of these operators are supported by the NPU. + # Currently there are two source files for CPU execution and two offload graphs + assert len(c_source_files) == 4 @mock.patch("tvm.relay.build") diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index 569c42020817..4d2fb56c5d4e 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -14,11 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import os -import tarfile import pytest +import tvm from tvm.ir.module import IRModule from tvm.driver import tvmc @@ -229,3 +228,128 @@ def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant): model_format="pytorch", shape_dict={"input": [1, 3, 224, 224]}, ) + + +def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant): + # some CI environments wont offer TFLite, so skip in case it is not present + pytest.importorskip("tflite") + + tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + before = tvmc_model.mod + + expected_layout = "NCHW" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NHWC" + and node.attrs.dst_layout == "NCHW" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert any(layout_transform_calls), "Expected 'layout_transform NHWC->NCHW' not found" + + +def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50): + # some CI environments wont offer ONNX, so skip in case it is not present + pytest.importorskip("onnx") + + tvmc_model = tvmc.frontends.load_model(onnx_resnet50) + before = tvmc_model.mod + + expected_layout = "NHWC" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NCHW" + and node.attrs.dst_layout == "NHWC" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found" + + +def test_compile_paddle_module_nchw_to_nhwc(paddle_resnet50): + # some CI environments wont offer Paddle, so skip in 
case it is not present + pytest.importorskip("paddle") + + tvmc_model = tvmc.frontends.load_model(paddle_resnet50, "paddle") + before = tvmc_model.mod + + expected_layout = "NHWC" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NCHW" + and node.attrs.dst_layout == "NHWC" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found" + + +def test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_quant): + # some CI environments wont offer TFLite, so skip in case it is not present + pytest.importorskip("tflite") + + tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + before = tvmc_model.mod + + expected_layout = "NHWC" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NHWC" + and node.attrs.dst_layout == "NHWC" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" + + +def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50): + # some CI environments wont offer ONNX, so skip in case it is not present + pytest.importorskip("onnx") + + tvmc_model = tvmc.frontends.load_model(onnx_resnet50) + before = tvmc_model.mod + + expected_layout = "NCHW" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + 
layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NCHW" + and node.attrs.dst_layout == "NCHW" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" diff --git a/tests/python/driver/tvmc/test_mlf.py b/tests/python/driver/tvmc/test_mlf.py index 0426f5678153..11306bd58848 100644 --- a/tests/python/driver/tvmc/test_mlf.py +++ b/tests/python/driver/tvmc/test_mlf.py @@ -27,7 +27,7 @@ @pytest.mark.parametrize( - "target,pass_configs", [["llvm", []], ["c --executor=aot", ["tir.disable_vectorize=1"]]] + "target,pass_configs", [["llvm", []], ["c -executor=aot", ["tir.disable_vectorize=1"]]] ) def test_tvmc_cl_compile_run_mlf(tflite_mobilenet_v1_1_quant, tmpdir_factory, target, pass_configs): pytest.importorskip("tflite") @@ -114,7 +114,7 @@ def test_tvmc_import_package_mlf_aot(tflite_mobilenet_v1_1_quant, tflite_compile tflite_compiled_model_mlf = tflite_compile_model( tflite_mobilenet_v1_1_quant, - target="c --executor=aot", + target="c -executor=aot", output_format="mlf", pass_context_configs=["tir.disable_vectorize=1"], ) diff --git a/tests/python/driver/tvmc/test_pass_config.py b/tests/python/driver/tvmc/test_pass_config.py new file mode 100644 index 000000000000..d8ffd7d4d521 --- /dev/null +++ b/tests/python/driver/tvmc/test_pass_config.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +from tvm.contrib.target.vitis_ai import vitis_ai_available +from tvm.driver import tvmc + +from tvm.driver.tvmc.common import TVMCException + + +def test_config_invalid_format(): + with pytest.raises(TVMCException): + _ = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler.missing.value"]) + + +def test_config_missing_from_tvm(): + with pytest.raises(TVMCException): + _ = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler.missing.value=1234"]) + + +def test_config_unsupported_tvmc_config(): + with pytest.raises(TVMCException): + _ = tvmc.common.parse_configs(["tir.LoopPartition=value"]) + + +def test_config_empty(): + with pytest.raises(TVMCException): + _ = tvmc.common.parse_configs([""]) + + +def test_config_valid_config_bool(): + configs = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler=true"]) + + assert len(configs) == 1 + assert "relay.backend.use_auto_scheduler" in configs.keys() + assert configs["relay.backend.use_auto_scheduler"] == True + + +@pytest.mark.skipif( + not vitis_ai_available(), + reason="--target vitis-ai is not available. 
TVM built with 'USE_VITIS_AI OFF'", +) +def test_config_valid_multiple_configs(): + configs = tvmc.common.parse_configs( + [ + "relay.backend.use_auto_scheduler=false", + "tir.detect_global_barrier=10", + "relay.ext.vitis_ai.options.build_dir=mystring", + ] + ) + + assert len(configs) == 3 + assert "relay.backend.use_auto_scheduler" in configs.keys() + assert configs["relay.backend.use_auto_scheduler"] == False + assert "tir.detect_global_barrier" in configs.keys() + assert configs["tir.detect_global_barrier"] == 10 + assert "relay.ext.vitis_ai.options.build_dir" in configs.keys() + assert configs["relay.ext.vitis_ai.options.build_dir"] == "mystring" diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_pass_list.py similarity index 97% rename from tests/python/driver/tvmc/test_common.py rename to tests/python/driver/tvmc/test_pass_list.py index 5cac6a1378a5..de50b04f415a 100644 --- a/tests/python/driver/tvmc/test_common.py +++ b/tests/python/driver/tvmc/test_pass_list.py @@ -14,12 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + import argparse import pytest from tvm.driver import tvmc -def test_common_parse_pass_list_str(): +def test_parse_pass_list_str(): assert [""] == tvmc.common.parse_pass_list_str("") assert ["FoldScaleAxis", "FuseOps"] == tvmc.common.parse_pass_list_str("FoldScaleAxis,FuseOps") diff --git a/tests/python/driver/tvmc/test_shape_parser.py b/tests/python/driver/tvmc/test_shape_parser.py new file mode 100644 index 000000000000..f49d89ac7c0f --- /dev/null +++ b/tests/python/driver/tvmc/test_shape_parser.py @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse + +import pytest + +from tvm.driver import tvmc + + +def test_shape_parser(): + # Check that a valid input is parsed correctly + shape_string = "input:[10,10,10]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10]} + + +def test_alternate_syntax(): + shape_string = "input:0:[10,10,10] input2:[20,20,20,20]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input:0": [10, 10, 10], "input2": [20, 20, 20, 20]} + + +@pytest.mark.parametrize( + "shape_string", + [ + "input:[10,10,10] input2:[20,20,20,20]", + "input: [10, 10, 10] input2: [20, 20, 20, 20]", + "input:[10,10,10],input2:[20,20,20,20]", + ], +) +def test_alternate_syntaxes(shape_string): + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} + + +def test_negative_dimensions(): + # Check that negative dimensions parse to Any correctly. + shape_string = "input:[-1,3,224,224]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + # Convert to strings to allow comparison with Any. + assert str(shape_dict) == "{'input': [?, 3, 224, 224]}" + + +def test_multiple_valid_gpu_inputs(): + # Check that multiple valid gpu inputs are parsed correctly. 
+ shape_string = "gpu_0/data_0:[1, -1,224,224] gpu_1/data_1:[7, 7]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + expected = "{'gpu_0/data_0': [1, ?, 224, 224], 'gpu_1/data_1': [7, 7]}" + assert str(shape_dict) == expected + + +def test_invalid_pattern(): + shape_string = "input:[a,10]" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + + +def test_invalid_separators(): + shape_string = "input:5,10 input2:10,10" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + + +def test_invalid_colon(): + shape_string = "gpu_0/data_0:5,10 :test:10,10" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + + +@pytest.mark.parametrize( + "shape_string", + [ + "gpu_0/data_0:5,10 /:10,10", + "gpu_0/data_0:5,10 data/:10,10", + "gpu_0/data_0:5,10 /data:10,10", + "gpu_0/invalid/data_0:5,10 data_1:10,10", + ], +) +def test_invalid_slashes(shape_string): + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + + +def test_dot(): + # Check dot in input name + shape_string = "input.1:[10,10,10]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input.1": [10, 10, 10]} diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py new file mode 100644 index 000000000000..afb099f3add6 --- /dev/null +++ b/tests/python/driver/tvmc/test_target.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +from tvm.driver import tvmc + +from tvm.driver.tvmc.common import TVMCException + + +def test_target_from_cli__error_duplicate(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("llvm, llvm") + + +def test_target_invalid_more_than_two_tvm_targets(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("cuda, opencl, llvm") + + +def test_target_from_cli__error_target_not_found(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("invalidtarget") + + +def test_target_from_cli__error_no_tvm_target(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("ethos-n77") + + +def test_target_two_tvm_targets(): + tvm_target, extra_targets = tvmc.common.target_from_cli( + "opencl -device=mali, llvm -mtriple=aarch64-linux-gnu" + ) + + assert "opencl" in str(tvm_target) + assert "llvm" in str(tvm_target.host) + + # No extra targets + assert 0 == len(extra_targets) + + +def test_tokenize_target_with_opts(): + tokens = tvmc.common.tokenize_target("foo -opt1=value1 --flag, bar -opt2=value2") + expected_tokens = ["foo", "-opt1=value1", "--flag", ",", "bar", "-opt2=value2"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_plus_sign(): + tokens = tvmc.common.tokenize_target("foo -opt1=+value1 --flag, bar -opt2=test,+v") + expected_tokens = ["foo", "-opt1=+value1", "--flag", ",", "bar", "-opt2=test,+v"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def 
test_tokenize_target_with_commas(): + tokens = tvmc.common.tokenize_target("foo -opt1=v,a,l,u,e,1 --flag") + expected_tokens = ["foo", "-opt1=v,a,l,u,e,1", "--flag"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_commas_and_single_quotes(): + tokens = tvmc.common.tokenize_target("foo -opt1='v, a, l, u, e', bar") + expected_tokens = ["foo", "-opt1='v, a, l, u, e'", ",", "bar"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_commas_and_double_quotes(): + tokens = tvmc.common.tokenize_target('foo -opt1="v, a, l, u, e", bar') + expected_tokens = ["foo", '-opt1="v, a, l, u, e"', ",", "bar"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_dashes(): + tokens = tvmc.common.tokenize_target("foo-bar1 -opt-1=t-e-s-t, baz") + expected_tokens = ["foo-bar1", "-opt-1=t-e-s-t", ",", "baz"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_parse_single_target_with_opts(): + targets = tvmc.common.parse_target("llvm -device=arm_cpu --system-lib") + + assert len(targets) == 1 + assert "device" in targets[0]["opts"] + assert "system-lib" in targets[0]["opts"] + + +def test_parse_multiple_target(): + targets = tvmc.common.parse_target("compute-library, llvm -device=arm_cpu --system-lib") + + assert len(targets) == 2 + assert "compute-library" == targets[0]["name"] + assert "llvm" == targets[1]["name"] + + +def test_parse_multiple_target_with_opts(): + targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib") + + assert len(targets) == 2 + assert "ethos-n77" == targets[0]["name"] + assert "myopt" in targets[0]["opts"] + assert "value" == targets[0]["opts"]["myopt"] + assert "llvm" == targets[1]["name"] + + +def test_parse_quotes_and_separators_on_options(): + targets_no_quote = 
tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") + targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") + targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') + + assert len(targets_no_quote) == 1 + assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"] + + assert len(targets_single_quote) == 1 + assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"] + + assert len(targets_double_quote) == 1 + assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] diff --git a/tests/python/driver/tvmc/test_target_options.py b/tests/python/driver/tvmc/test_target_options.py new file mode 100644 index 000000000000..f6942299b751 --- /dev/null +++ b/tests/python/driver/tvmc/test_target_options.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse + +import pytest + +from tvm.driver import tvmc +from tvm.driver.tvmc.common import TVMCException +from tvm.driver.tvmc.target import generate_target_args, reconstruct_target_args + + +def test_target_to_argparse(): + parser = argparse.ArgumentParser() + generate_target_args(parser) + parsed, _ = parser.parse_known_args( + ["--target=llvm", "--target-llvm-mattr=+fp,+mve", "--target-llvm-mcpu=cortex-m3"] + ) + assert parsed.target == "llvm" + assert parsed.target_llvm_mcpu == "cortex-m3" + assert parsed.target_llvm_mattr == "+fp,+mve" + + +def test_mapping_target_args(): + parser = argparse.ArgumentParser() + generate_target_args(parser) + parsed, _ = parser.parse_known_args(["--target=llvm", "--target-llvm-mcpu=cortex-m3"]) + assert reconstruct_target_args(parsed) == {"llvm": {"mcpu": "cortex-m3"}} + + +def test_target_recombobulation_single(): + tvm_target, _ = tvmc.common.target_from_cli("llvm", {"llvm": {"mcpu": "cortex-m3"}}) + + assert str(tvm_target) == "llvm -keys=cpu -link-params=0 -mcpu=cortex-m3" + + +def test_target_recombobulation_many(): + tvm_target, _ = tvmc.common.target_from_cli( + "opencl -device=mali, llvm -mtriple=aarch64-linux-gnu", + {"llvm": {"mcpu": "cortex-m3"}, "opencl": {"max_num_threads": 404}}, + ) + + assert "-max_num_threads=404" in str(tvm_target) + assert "-device=mali" in str(tvm_target) + assert "-mtriple=aarch64-linux-gnu" in str(tvm_target.host) + assert "-mcpu=cortex-m3" in str(tvm_target.host) + + +def test_error_if_target_missing(): + with pytest.raises( + TVMCException, + match="Passed --target-opencl-max_num_threads but did not specify opencl target", + ): + tvmc.common.target_from_cli( + "llvm", + {"opencl": {"max_num_threads": 404}}, + ) diff --git a/tests/python/driver/tvmc/test_tracker.py b/tests/python/driver/tvmc/test_tracker.py new file mode 100644 index 000000000000..2ca0fae8f45e --- /dev/null +++ b/tests/python/driver/tvmc/test_tracker.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software 
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tvm.driver import tvmc + + +def test_tracker_host_port_from_cli__hostname_port(): + input_str = "1.2.3.4:9090" + expected_host = "1.2.3.4" + expected_port = 9090 + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert expected_host == actual_host + assert expected_port == actual_port + + +def test_tracker_host_port_from_cli__hostname_port__empty(): + input_str = "" + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert actual_host is None + assert actual_port is None + + +def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090(): + input_str = "1.2.3.4" + expected_host = "1.2.3.4" + expected_port = 9090 + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert expected_host == actual_host + assert expected_port == actual_port diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py deleted file mode 100644 index bdfdb48ce6a0..000000000000 --- a/tests/python/driver/tvmc/test_tvmc_common.py +++ /dev/null @@ -1,413 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import argparse - -import pytest - -import tvm -from tvm.contrib.target.vitis_ai import vitis_ai_available -from tvm.driver import tvmc - -from tvm.driver.tvmc.common import TVMCException - - -def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant): - # some CI environments wont offer TFLite, so skip in case it is not present - pytest.importorskip("tflite") - - tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) - before = tvmc_model.mod - - expected_layout = "NCHW" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NHWC" - and node.attrs.dst_layout == "NCHW" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert any(layout_transform_calls), "Expected 'layout_transform NHWC->NCHW' not found" - - -def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50): - # some CI environments wont offer ONNX, so skip in case it is not present - pytest.importorskip("onnx") - - tvmc_model = tvmc.frontends.load_model(onnx_resnet50) - before = tvmc_model.mod - - expected_layout = "NHWC" - after = 
tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NCHW" - and node.attrs.dst_layout == "NHWC" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found" - - -def test_compile_paddle_module_nchw_to_nhwc(paddle_resnet50): - # some CI environments wont offer Paddle, so skip in case it is not present - pytest.importorskip("paddle") - - tvmc_model = tvmc.frontends.load_model(paddle_resnet50, "paddle") - before = tvmc_model.mod - - expected_layout = "NHWC" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NCHW" - and node.attrs.dst_layout == "NHWC" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found" - - -def test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_quant): - # some CI environments wont offer TFLite, so skip in case it is not present - pytest.importorskip("tflite") - - tvmc_model = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) - before = tvmc_model.mod - - expected_layout = "NHWC" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NHWC" - and node.attrs.dst_layout == "NHWC" - ) - - 
tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" - - -def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50): - # some CI environments wont offer ONNX, so skip in case it is not present - pytest.importorskip("onnx") - - tvmc_model = tvmc.frontends.load_model(onnx_resnet50) - before = tvmc_model.mod - - expected_layout = "NCHW" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NCHW" - and node.attrs.dst_layout == "NCHW" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" - - -def test_tracker_host_port_from_cli__hostname_port(): - input_str = "1.2.3.4:9090" - expected_host = "1.2.3.4" - expected_port = 9090 - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert expected_host == actual_host - assert expected_port == actual_port - - -def test_tracker_host_port_from_cli__hostname_port__empty(): - input_str = "" - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert actual_host is None - assert actual_port is None - - -def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090(): - input_str = "1.2.3.4" - expected_host = "1.2.3.4" - expected_port = 9090 - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert expected_host == actual_host - assert expected_port == actual_port - - -def test_shape_parser(): - # Check that a valid input is parsed correctly - shape_string = "input:[10,10,10]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - assert shape_dict == {"input": [10, 10, 10]} - 
# Check that multiple valid input shapes are parse correctly - shape_string = "input:[10,10,10] input2:[20,20,20,20]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} - # Check that multiple valid input shapes with colons are parse correctly - shape_string = "input:0:[10,10,10] input2:[20,20,20,20]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - assert shape_dict == {"input:0": [10, 10, 10], "input2": [20, 20, 20, 20]} - # Check that alternate syntax parses correctly - shape_string = "input: [10, 10, 10] input2: [20, 20, 20, 20]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} - shape_string = "input:[10,10,10],input2:[20,20,20,20]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} - # Check that negative dimensions parse to Any correctly. - shape_string = "input:[-1,3,224,224]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - # Convert to strings to allow comparison with Any. - assert str(shape_dict) == "{'input': [?, 3, 224, 224]}" - # Check that multiple valid gpu inputs are parsed correctly. - shape_string = "gpu_0/data_0:[1, -1,224,224] gpu_1/data_1:[7, 7]" - shape_dict = tvmc.common.parse_shape_string(shape_string) - expected = "{'gpu_0/data_0': [1, ?, 224, 224], 'gpu_1/data_1': [7, 7]}" - assert str(shape_dict) == expected - - # Check that invalid pattern raises expected error. - shape_string = "input:[a,10]" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with invalid separators raises error. - shape_string = "input:5,10 input2:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with a invalid slash raises error. 
- shape_string = "gpu_0/data_0:5,10 /:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with a invalid colon raises error. - shape_string = "gpu_0/data_0:5,10 :test:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with a invalid slash raises error. - shape_string = "gpu_0/data_0:5,10 data/:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with a invalid slash raises error. - shape_string = "gpu_0/data_0:5,10 /data:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - # Check that input with invalid slashes raises error. - shape_string = "gpu_0/invalid/data_0:5,10 data_1:10,10" - with pytest.raises(argparse.ArgumentTypeError): - tvmc.common.parse_shape_string(shape_string) - - -def test_target_from_cli__error_duplicate(): - with pytest.raises(TVMCException): - _ = tvmc.common.target_from_cli("llvm, llvm") - - -def test_target_invalid_more_than_two_tvm_targets(): - with pytest.raises(TVMCException): - _ = tvmc.common.target_from_cli("cuda, opencl, llvm") - - -def test_target_from_cli__error_target_not_found(): - with pytest.raises(TVMCException): - _ = tvmc.common.target_from_cli("invalidtarget") - - -def test_target_from_cli__error_no_tvm_target(): - with pytest.raises(TVMCException): - _ = tvmc.common.target_from_cli("ethos-n77") - - -def test_target_two_tvm_targets(): - tvm_target, extra_targets = tvmc.common.target_from_cli( - "opencl -device=mali, llvm -mtriple=aarch64-linux-gnu" - ) - - assert "opencl" in str(tvm_target) - assert "llvm" in str(tvm_target.host) - - # No extra targets - assert 0 == len(extra_targets) - - -def test_tokenize_target_with_opts(): - tokens = tvmc.common.tokenize_target("foo -opt1=value1 --flag, bar -opt2=value2") - expected_tokens = ["foo", "-opt1=value1", 
"--flag", ",", "bar", "-opt2=value2"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_tokenize_target_with_plus_sign(): - tokens = tvmc.common.tokenize_target("foo -opt1=+value1 --flag, bar -opt2=test,+v") - expected_tokens = ["foo", "-opt1=+value1", "--flag", ",", "bar", "-opt2=test,+v"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_tokenize_target_with_commas(): - tokens = tvmc.common.tokenize_target("foo -opt1=v,a,l,u,e,1 --flag") - expected_tokens = ["foo", "-opt1=v,a,l,u,e,1", "--flag"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_tokenize_target_with_commas_and_single_quotes(): - tokens = tvmc.common.tokenize_target("foo -opt1='v, a, l, u, e', bar") - expected_tokens = ["foo", "-opt1='v, a, l, u, e'", ",", "bar"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_tokenize_target_with_commas_and_double_quotes(): - tokens = tvmc.common.tokenize_target('foo -opt1="v, a, l, u, e", bar') - expected_tokens = ["foo", '-opt1="v, a, l, u, e"', ",", "bar"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_tokenize_target_with_dashes(): - tokens = tvmc.common.tokenize_target("foo-bar1 -opt-1=t-e-s-t, baz") - expected_tokens = ["foo-bar1", "-opt-1=t-e-s-t", ",", "baz"] - - assert len(tokens) == len(expected_tokens) - assert tokens == expected_tokens - - -def test_parse_single_target_with_opts(): - targets = tvmc.common.parse_target("llvm -device=arm_cpu --system-lib") - - assert len(targets) == 1 - assert "device" in targets[0]["opts"] - assert "system-lib" in targets[0]["opts"] - - -def test_parse_multiple_target(): - targets = tvmc.common.parse_target("compute-library, llvm -device=arm_cpu --system-lib") - - assert len(targets) == 2 - assert "compute-library" == targets[0]["name"] - assert "llvm" == targets[1]["name"] - - -def 
test_parse_multiple_target_with_opts(): - targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib") - - assert len(targets) == 2 - assert "ethos-n77" == targets[0]["name"] - assert "myopt" in targets[0]["opts"] - assert "value" == targets[0]["opts"]["myopt"] - assert "llvm" == targets[1]["name"] - - -def test_parse_quotes_and_separators_on_options(): - targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") - targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") - targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') - - assert len(targets_no_quote) == 1 - assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"] - - assert len(targets_single_quote) == 1 - assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"] - - assert len(targets_double_quote) == 1 - assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] - - -def test_config_invalid_format(): - with pytest.raises(TVMCException): - _ = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler.missing.value"]) - - -def test_config_missing_from_tvm(): - with pytest.raises(TVMCException): - _ = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler.missing.value=1234"]) - - -def test_config_unsupported_tvmc_config(): - with pytest.raises(TVMCException): - _ = tvmc.common.parse_configs(["tir.LoopPartition=value"]) - - -def test_config_empty(): - with pytest.raises(TVMCException): - _ = tvmc.common.parse_configs([""]) - - -def test_config_valid_config_bool(): - configs = tvmc.common.parse_configs(["relay.backend.use_auto_scheduler=true"]) - - assert len(configs) == 1 - assert "relay.backend.use_auto_scheduler" in configs.keys() - assert configs["relay.backend.use_auto_scheduler"] == True - - -@pytest.mark.skipif( - not vitis_ai_available(), - reason="--target vitis-ai is not available. 
TVM built with 'USE_VITIS_AI OFF'", -) -def test_config_valid_multiple_configs(): - configs = tvmc.common.parse_configs( - [ - "relay.backend.use_auto_scheduler=false", - "tir.detect_global_barrier=10", - "relay.ext.vitis_ai.options.build_dir=mystring", - ] - ) - - assert len(configs) == 3 - assert "relay.backend.use_auto_scheduler" in configs.keys() - assert configs["relay.backend.use_auto_scheduler"] == False - assert "tir.detect_global_barrier" in configs.keys() - assert configs["tir.detect_global_barrier"] == 10 - assert "relay.ext.vitis_ai.options.build_dir" in configs.keys() - assert configs["relay.ext.vitis_ai.options.build_dir"] == "mystring" diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index 26bf58cbf384..4dfe89fe40e5 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -417,6 +417,17 @@ def test_forward_reuse_layers(self, keras): keras_model = keras.models.Model(data, z) verify_keras_frontend(keras_model) + def test_forward_lstm(self, keras): + data = keras.layers.Input(shape=(10, 32)) + rnn_funcs = [ + keras.layers.LSTM(16), + keras.layers.LSTM(16, return_sequences=True), + ] + for rnn_func in rnn_funcs: + x = rnn_func(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + def test_forward_rnn(self, keras): data = keras.layers.Input(shape=(1, 32)) rnn_funcs = [ @@ -613,6 +624,7 @@ def test_forward_nested_layers(self, keras): sut.test_forward_multi_inputs(keras=k) sut.test_forward_multi_outputs(keras=k) sut.test_forward_reuse_layers(keras=k) + sut.test_forward_lstm(keras=k) sut.test_forward_rnn(keras=k) sut.test_forward_vgg16(keras=k) sut.test_forward_vgg16(keras=k, layout="NHWC") diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 69bb44e360ff..dd1c77330986 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ 
b/tests/python/frontend/onnx/test_forward.py @@ -3970,6 +3970,7 @@ def verify(ishape, oshape, scales, mode, coord_trans="asymmetric", alpha=0.5, ex make_constant_node("scales", onnx.TensorProto.FLOAT, (len(scales),), scales), ] input_names = ["X", "roi", "scales"] + if oshape != []: nodes.append( make_constant_node("sizes", onnx.TensorProto.INT64, (len(oshape),), oshape) @@ -4941,7 +4942,6 @@ def verify_eyelike(indata): "test_mvn", # This test fails llvm with a lowering error: "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", - "test_qlinearmatmul_2D", "test_qlinearmatmul_3D", "test_range_float_type_positive_delta_expanded", "test_range_int32_type_negative_delta_expanded", @@ -4955,15 +4955,7 @@ def verify_eyelike(indata): "test_reduce_sum_keepdims_random", "test_reduce_sum_negative_axes_keepdims_example", "test_reduce_sum_negative_axes_keepdims_random", - "test_resize_downsample_sizes_cubic", - "test_resize_downsample_sizes_linear_pytorch_half_pixel", - "test_resize_downsample_sizes_nearest", "test_resize_tf_crop_and_resize", - "test_resize_upsample_sizes_cubic", - "test_resize_upsample_sizes_nearest", - "test_resize_upsample_sizes_nearest_ceil_half_pixel", - "test_resize_upsample_sizes_nearest_floor_align_corners", - "test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", "test_rnn_seq_length", "test_round", "test_scan9_sum", diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index 1d64f947e68a..e3d1fc9daf2b 100644 --- a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -80,9 +80,8 @@ def verify_model(func, input_data, rtol=1e-5, atol=1e-5): baseline_outputs = (baseline_outputs.numpy(),) mod, params = relay.frontend.from_paddle(baseline_model, input_shape_dict) - parms_num = min(len(input_names), len(mod["main"].params)) compiled_names = [] - for arg in mod["main"].params[:parms_num]: + for arg in mod["main"].params: 
assert arg.name_hint in input_names or arg.name_hint in params if arg.name_hint in input_names: compiled_names.append(arg.name_hint) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 4a6f88417b9c..754976ca8c13 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -161,6 +161,7 @@ def run_tvm_graph( target="llvm", out_names=None, mode="graph_executor", + op_converter=relay.frontend.tflite.OperatorConverter, ): """Generic function to compile on relay and execute on tvm""" # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 @@ -185,7 +186,7 @@ def run_tvm_graph( dtype_dict[e] = input_data[i].dtype.name mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict, op_converter=op_converter ) if mode in ["debug", "vm"]: @@ -3996,6 +3997,72 @@ def test_detection_postprocess(): ) +####################################################################### +# Custom Converter +# ---------------- + + +def test_custom_op_converter(): + """Test case for user-defined operator converter in TFLite frontend""" + + class DummyOperatorConverter(relay.frontend.tflite.OperatorConverter): + """Operator Converter for converting TFLite ops to relay ops""" + + def __init__(self, model, subgraph, exp_tab): + super(DummyOperatorConverter, self).__init__(model, subgraph, exp_tab) + self.allow_custom_ops = True + + convert_map_overwrite = {"SUB": self.convert_sub_dummy} + + self.convert_map.update(convert_map_overwrite) + + def convert_sub_dummy(self, op): + """Convert TFLite SUB""" + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 2, "input tensors length should be 2" + + lhs_tensor = input_tensors[0] + rhs_tensor = input_tensors[1] + + lhs_expr = self.get_expr(lhs_tensor.tensor_idx) + rhs_expr = self.get_expr(rhs_tensor.tensor_idx) + + 
temp_expr = relay.op.negative(rhs_expr) + out = relay.op.add(lhs_expr, temp_expr) + + return out + + with tf.Graph().as_default(): + # Generate TFLite model for single addition + data = [ + np.arange(6.0, dtype=np.float32).reshape((2, 1, 1, 3)), + np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 1, 3)), + ] + in_data = [ + array_ops.placeholder(shape=data[0].shape, dtype="float32", name="in_0"), + array_ops.placeholder(shape=data[1].shape, dtype="float32", name="in_1"), + ] + out = math_ops.subtract(in_data[0], in_data[1]) + in_name = [x[1] for x in zip(in_data, ("in_0:0", "in_1:0"))] + input_tensors = [x for x in in_data] + output_tensors = [out] + in_node = [0] * len(in_name) + for i in range(len(in_name)): + in_node[i] = in_name[i].split(":")[0] if ":" in in_name[i] else in_name[i] + + with tf.Session() as sess: + converter = tf.lite.TFLiteConverter.from_session(sess, input_tensors, output_tensors) + tflite_model_buf = converter.convert() + in_data = [x[1] for x in zip(in_data, data)] + tvm_output_orig = run_tvm_graph(tflite_model_buf, in_data, in_node) + tvm_output_dummy = run_tvm_graph( + tflite_model_buf, in_data, in_node, op_converter=DummyOperatorConverter + ) + tvm.testing.assert_allclose( + np.squeeze(tvm_output_orig[0]), np.squeeze(tvm_output_dummy[0]), rtol=1e-5, atol=1e-5 + ) + + ####################################################################### # Mobilenet # --------- @@ -4621,6 +4688,9 @@ def test_prevent_tensorflow_dynamic_range(): # Detection_PostProcess test_detection_postprocess() + # Overwrite Converter + test_custom_op_converter() + # End to End test_forward_mobilenet_v1() test_forward_mobilenet_v2() diff --git a/tests/python/integration/test_lower.py b/tests/python/integration/test_lower.py index 690258c2fa3b..63733b05ab3f 100644 --- a/tests/python/integration/test_lower.py +++ b/tests/python/integration/test_lower.py @@ -33,9 +33,8 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: # body for blockIdx_x in 
T.thread_binding(0, 16, "blockIdx.x"): for blockIdx_y in T.thread_binding(0, 8, "blockIdx.y"): - with T.block([16, 8]) as [bx, by]: - T.bind(bx, blockIdx_x) - T.bind(by, blockIdx_y) + with T.block(): + bx, by = T.axis.remap("SS", [blockIdx_x, blockIdx_y]) shared_A = T.alloc_buffer([1024, 1024], "float16", scope="shared") shared_B = T.alloc_buffer([1024, 1024], "float16", scope="shared") wmma_A = T.alloc_buffer([1024, 1024], "float16", scope="wmma.matrix_a") @@ -44,9 +43,9 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: for ty in T.thread_binding(0, 2, "threadIdx.y"): for tz in T.thread_binding(0, 2, "threadIdx.z"): for i, j in T.grid(2, 4): - with T.block([64, 64]) as [vi, vj]: - T.bind(vi, bx * 4 + ty * 2 + i) - T.bind(vj, by * 8 + tz * 4 + j) + with T.block(): + vi = T.axis.S(64, bx * 4 + ty * 2 + i) + vj = T.axis.S(64, by * 8 + tz * 4 + j) T.reads([]) T.writes(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) C0 = T.match_buffer( @@ -74,23 +73,23 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: for tx in T.thread_binding(0, 32, "threadIdx.x"): for i0, j0 in T.grid(1, 4): for j1 in T.vectorized(0, 4): - with T.block([1024, 1024]) as [vi, vj]: - T.bind(vi, bx * 64 + ty * 32 + tx + i0) - T.bind(vj, ko * 32 + tz * 16 + j0 * 4 + j1) + with T.block(): + vi = T.axis.S(1024, bx * 64 + ty * 32 + tx + i0) + vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) shared_A[vi, vj + 8] = A[vi, vj] for i0, j0 in T.grid(2, 4): for j1 in T.vectorized(0, 4): - with T.block([1024, 1024]) as [vi, vj]: - T.bind(vi, by * 128 + ty * 64 + tx * 2 + i0) - T.bind(vj, ko * 32 + tz * 16 + j0 * 4 + j1) + with T.block(): + vi = T.axis.S(1024, by * 128 + ty * 64 + tx * 2 + i0) + vj = T.axis.S(1024, ko * 32 + tz * 16 + j0 * 4 + j1) shared_B[vi, vj + 8] = B[vi, vj] for ki in range(0, 2): for i in range(0, 2): - with T.block([64, 64]) as [vi, vk]: - T.bind(vi, bx * 4 + ty * 2 + i) - T.bind(vk, ko * 2 + ki) + with T.block(): + vi = T.axis.S(64, bx * 
4 + ty * 2 + i) + vk = T.axis.S(64, ko * 2 + ki) T.reads( shared_A[ vi * 16 : vi * 16 + 16, @@ -142,9 +141,9 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) ) for j in range(0, 4): - with T.block([64, 64]) as [vj, vk]: - T.bind(vj, by * 8 + tz * 4 + j) - T.bind(vk, ko * 2 + ki) + with T.block(): + vj = T.axis.S(64, by * 8 + tz * 4 + j) + vk = T.axis.S(64, ko * 2 + ki) T.reads( shared_B[ vj * 16 : vj * 16 + 16, @@ -196,14 +195,10 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) ) for i, j in T.grid(2, 4): - with T.block([64, 64, T.reduce_axis(0, 64)]) as [ - vi, - vj, - vk, - ]: - T.bind(vi, bx * 4 + ty * 2 + i) - T.bind(vj, by * 8 + tz * 4 + j) - T.bind(vk, ko * 2 + ki) + with T.block(): + vi = T.axis.S(64, bx * 4 + ty * 2 + i) + vj = T.axis.S(64, by * 8 + tz * 4 + j) + vk = T.axis.R(64, ko * 2 + ki) T.reads( [ wmma_A[ @@ -258,9 +253,9 @@ def tensorcore_gemm(a: T.handle, b: T.handle, c: T.handle) -> None: ) ) for i, j in T.grid(2, 4): - with T.block([64, 64]) as [vi, vj]: - T.bind(vi, bx * 4 + ty * 2 + i) - T.bind(vj, by * 8 + tz * 4 + j) + with T.block(): + vi = T.axis.S(64, bx * 4 + ty * 2 + i) + vj = T.axis.S(64, by * 8 + tz * 4 + j) T.reads(wmma_C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) s0 = T.var("int32") diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index ca097734a9eb..a40164ded941 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -15,10 +15,11 @@ # specific language governing permissions and limitations # under the License. 
import pytest +import numpy as np import tvm from tvm import te, topi -import numpy as np +from tvm.driver.build_module import schedule_to_module import tvm.testing import tvm.topi.testing @@ -532,10 +533,7 @@ def test_reduce_storage_reuse(): target = tvm.target.Target("cuda") def run_passes(sch, args): - bounds = tvm.te.schedule.InferBound(sch) - stmt = tvm.te.schedule.ScheduleOps(sch, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc(args, stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(sch, args) mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", target))(mod) return tvm.transform.Sequential( [ diff --git a/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py b/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py index 8cecbf97c001..484ec23b369a 100644 --- a/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py +++ b/tests/python/nightly/quantization/test_quantization_accuracy_for_vit.py @@ -14,15 +14,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import tvm import os import sys -from tvm import relay -from tvm.relay import quantize as qtz import logging + +import pytest + +pytest.importorskip("onnx") + import onnx + +import tvm +from tvm import relay +from tvm.relay import quantize as qtz import tvm.testing -import mxnet as mx from test_quantization_accuracy import Config, get_val_data, eval_acc logging.basicConfig(level=logging.INFO) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index decddc1ef0a4..8788faf45866 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -2064,5 +2064,27 @@ def verify_scatter_nd(data_np, indices_np, updates_np, ref_res): verify_scatter_nd(data, indices, updates, out) +@tvm.testing.uses_gpu +def test_gather(): + def verify_gather(data_shape, indices_shape, data_shape_np, indices_shape_np, axis): + x = relay.var("x", relay.TensorType(data_shape, "float32")) + y = relay.var("y", relay.TensorType(indices_shape, "int32")) + z = relay.gather(x, axis, y) + + mod = tvm.IRModule() + mod["main"] = relay.Function([x, y], z) + + data_np = np.random.uniform(size=data_shape_np).astype("float32") + indices_np = np.random.randint(low=0, high=2, size=indices_shape_np, dtype="int32") + + ref_res = tvm.topi.testing.gather_python(data_np, axis, indices_np) + check_result([data_np, indices_np], mod, [ref_res]) + + verify_gather((relay.Any(),), (relay.Any(),), (10,), (10,), 0) + verify_gather((2, 2), (2, relay.Any()), (2, 2), (2, 3), 1) + verify_gather((relay.Any(), 2), (2, relay.Any()), (2, 2), (2, 3), 1) + verify_gather((relay.Any(), relay.Any()), (relay.Any(), relay.Any()), (2, 3), (1, 3), 0) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 3310b6b2ed69..ab36f79c6ea7 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -507,12 +507,12 @@ def expected(): 
bias = relay.layout_transform(bias, src_layout="NHWC", dst_layout="NCHW") bias = relay.layout_transform(bias, src_layout="NCHW", dst_layout="NCHW16c") add = relay.add(y, bias) - y = relay.layout_transform(add, src_layout="NCHW16c", dst_layout="NCHW") - mean = relay.mean(y, axis=1, exclude=True) - var = relay.variance(y, axis=1, exclude=True) + mean = relay.mean(add, axis=[1, 4], exclude=True) + var = relay.variance(add, axis=[1, 4], exclude=True) denom = relay.const(1.0) / relay.sqrt(var + relay.const(1e-05)) gamma = relay.var("gamma", shape=(16,)) - denom = denom * gamma + denom_c16c = denom * relay.layout_transform(gamma, src_layout="C", dst_layout="C16c") + denom = relay.layout_transform(denom_c16c, src_layout="C16c", dst_layout="C") denom_expand1 = relay.expand_dims(denom, axis=1, num_newaxis=2) denom_expand2 = relay.expand_dims(denom_expand1, axis=0) denom_nchwc16 = relay.layout_transform( @@ -520,7 +520,10 @@ def expected(): ) out = add * denom_nchwc16 beta = relay.var("beta", shape=(16,)) - numerator = (-mean) * denom + beta + numerator_c16c = (-mean) * denom_c16c + relay.layout_transform( + beta, src_layout="C", dst_layout="C16c" + ) + numerator = relay.layout_transform(numerator_c16c, src_layout="C16c", dst_layout="C") numerator_expand1 = relay.expand_dims(numerator, axis=1, num_newaxis=2) numerator_expand2 = relay.expand_dims(numerator_expand1, axis=0) numerator_nchwc16 = relay.layout_transform( @@ -1096,8 +1099,8 @@ def expected_nchw(): y = relay.nn.conv2d( y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c" ) - ret = relay.layout_transform(y, "NCHW16c", "NCHW") - ret = relay.sum(ret, axis=[1], keepdims=True) + ret = relay.sum(y, axis=[1, 4], keepdims=True) + ret = relay.layout_transform(ret, "NCHW1c", "NCHW") y = relay.Function(analysis.free_vars(ret), ret) return y @@ -1126,9 +1129,8 @@ def expected_nhwc(): y = relay.nn.conv2d( y, weight1, channels=32, kernel_size=(3, 3), padding=(1, 1), data_layout="NCHW16c" ) - ret = 
relay.layout_transform(y, "NCHW16c", "NCHW") - ret = relay.sum(ret, axis=[1], keepdims=True) - ret = relay.layout_transform(ret, "NCHW", "NHWC") + ret = relay.sum(y, axis=[1, 4], keepdims=True) + ret = relay.layout_transform(ret, "NCHW1c", "NHWC") y = relay.Function(analysis.free_vars(ret), ret) return y @@ -1397,28 +1399,54 @@ def expected(): assert tvm.ir.structural_equal(a, b) +def test_conv2d_strided_slice_packed_to_unpacked(): + """We do not support propagating through packed to unpacked layout""" + x_shape = (1, 1, 1, 1, 4) + w_shape = (9, 1, 3, 3, 4, 4) + + def before(): + x = relay.var("x", shape=x_shape) + weight = relay.var("weight", shape=w_shape) + y = relay.nn.conv2d( + x, + weight, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW4c", + kernel_layout="OIHW4i4o", + ) + y = relay.strided_slice(y, begin=[0, 0], end=[1, -1], strides=[1, 8]) + return relay.Function([x, weight], y) + + def expected(): + x = relay.var("x", shape=x_shape) + weight = relay.var("weight", shape=w_shape) + x_nchw = relay.layout_transform(x, src_layout="NCHW4c", dst_layout="NCHW") + weight_oihw = relay.layout_transform(weight, src_layout="OIHW4i4o", dst_layout="OIHW") + y = relay.nn.conv2d( + x_nchw, + weight_oihw, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + y = relay.layout_transform(y, src_layout="NCHW", dst_layout="NCHW4c") + y = relay.strided_slice(y, begin=[0, 0], end=[1, -1], strides=[1, 8]) + return relay.Function([x, weight], y) + + def alter_conv2d(attrs, inputs, tinfos, out_type): + data, weight = inputs + new_attrs = dict(attrs) + new_attrs["data_layout"] = "NCHW" + new_attrs["kernel_layout"] = "OIHW" + return relay.nn.conv2d(data, weight, **new_attrs) + + with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d): + a = run_opt_pass(before(), transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b) + + if __name__ == "__main__": - 
test_alter_op() - test_alter_return_none() - test_alter_layout() - test_alter_layout_dual_path() - test_alter_layout_lrn() - test_alter_layout_resnet() - test_alter_layout_broadcast_op() - test_alter_layout_broadcast_scalar_op() - test_alter_layout_scalar() - test_alter_layout_concatenate() - test_alter_layout_nchw_upsamping_op() - test_alter_layout_strided_slice() - test_alter_layout_depthwise_conv2d() - test_alter_layout_prelu() - test_alter_layout_pad() - test_alter_layout_pool() - test_alter_layout_sum() - test_alter_layout_nhwc_arm() - test_alter_layout_nhwc_int8_aarch64() - test_alter_op_with_global_var() - test_alter_op_dense() - test_alter_layout_strided_slice_axes_nhwc() - test_not_inplace_modify() - test_alter_op_dense_packed_data() + pytest.main([__file__]) diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 9b4d154360b2..528470ff2ec7 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -2039,5 +2039,337 @@ def expected(): assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) +def test_conv_max_pool_uses_specified_convert_layout(): + relay.op.get("nn.max_pool2d").reset_attr("FTVMConvertOpLayout") + + @tvm.ir.register_op_attr("nn.max_pool2d", "FTVMConvertOpLayout") + def convert_maxpool2d(attrs, inputs, tinfos, desired_layouts): + # stick by convertng layout and out_layout to use NHWC and NHWC, + # respectively, as specified in the transforms.ConvertLayout() function's arguments later + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_layouts[0]) + new_attrs["out_layout"] = str(desired_layouts[0]) + return relay.nn.max_pool2d(*inputs, **new_attrs) + + def before(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight = relay.var("weight", shape=(64, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + 
y = relay.nn.relu(y) + y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NCHW") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight = relay.var("weight", shape=(64, 64, 3, 3)) + x = relay.layout_transform(x, "NCHW", "NHWC") + weight = relay.layout_transform(weight, "OIHW", "OHWI") + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="OHWI", + ) + y = relay.nn.relu(y) + y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC", out_layout="NHWC") + y = relay.layout_transform(y, "NHWC", "NCHW") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + a = before() + a = run_opt_pass( + a, transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"], "nn.max_pool2d": ["NHWC"]}) + ) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\n\n Expected = \n" + str(b) + + +def test_conv_bias_pool_uses_specified_convert_layout(): + relay.op.get("nn.max_pool2d").reset_attr("FTVMConvertOpLayout") + + @tvm.ir.register_op_attr("nn.max_pool2d", "FTVMConvertOpLayout") + def convert_maxpool2d(attrs, inputs, tinfos, desired_layouts): + # stick by convertng layout and out_layout to use NHWC and NHWC, + # respectively, as specified in the transforms.ConvertLayout() function's arguments later + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_layouts[0]) + new_attrs["out_layout"] = str(desired_layouts[0]) + return relay.nn.max_pool2d(*inputs, **new_attrs) + + def before(): + x = relay.var("x", shape=(1, 56, 56, 64)) + bias = relay.var("bias", shape=(64,)) + weight = relay.var("weight", shape=(3, 3, 64, 64)) + y = relay.nn.conv2d( + x, + weight, + channels=64, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.nn.bias_add(y, bias, axis=3) 
+ # a useless tuple, which will be eliminated + y = relay.Tuple([y])[0] + y = relay.nn.relu(y) + y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC") + y = relay.cast(y, "int32") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(1, 56, 56, 64)) + bias = relay.var("bias", shape=(64,)) + weight = relay.var("weight", shape=(3, 3, 64, 64)) + x = relay.layout_transform(x, "NHWC", "NCHW") + weight = relay.layout_transform(weight, "HWIO", "OIHW") + y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1)) + + bias = relay.expand_dims(bias, axis=0, num_newaxis=3) + bias = relay.layout_transform(bias, "NHWC", "NCHW") + y = relay.add(y, bias) + # a useless tuple, which will be eliminated + y = relay.Tuple([y])[0] + y = relay.nn.relu(y) + y = relay.layout_transform(y, "NCHW", "NHWC") + y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout="NHWC", out_layout="NHWC") + y = relay.cast(y, "int32") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + a = before() + a = run_opt_pass( + a, + transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"], "nn.max_pool2d": ["NHWC"]}), + ) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\n\n Expected = \n" + str(b) + + +def test_deformable_conv_bias_pool_uses_specified_convert_layout(): + relay.op.get("nn.max_pool2d").reset_attr("FTVMConvertOpLayout") + + @tvm.ir.register_op_attr("nn.max_pool2d", "FTVMConvertOpLayout") + def convert_maxpool2d(attrs, inputs, tinfos, desired_layouts): + # stick by convertng layout and out_layout to use NHWC and NHWC, + # respectively, as specified in the transforms.ConvertLayout() function's arguments later + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_layouts[0]) + new_attrs["out_layout"] = str(desired_layouts[0]) + return relay.nn.max_pool2d(*inputs, **new_attrs) + + 
def before(N, CI, H, W, CO, KH, KW, layout): + if layout == "NCHW": + data_shape = (N, CI, H, W) + weight_shape = (CO, CI, KH, KW) + kernel_layout = "OIHW" + else: + data_shape = (N, H, W, CI) + weight_shape = (KH, KW, CI, CO) + kernel_layout = "HWIO" + bias_shape = (CO,) + + data = relay.var("data", shape=data_shape, dtype="float32") + offset = relay.var("offset") + weight = relay.var("weight", shape=weight_shape, dtype="float32") + bias = relay.var("bias", shape=bias_shape, dtype="float32") + + y = relay.nn.deformable_conv2d( + data, + offset, + weight, + kernel_size=(KH, KW), + channels=CO, + data_layout=layout, + kernel_layout=kernel_layout, + ) + y = relay.nn.bias_add(y, bias, axis=-1 if layout == "NHWC" else 1) + y = relay.nn.relu(y) + y = relay.nn.max_pool2d(y, pool_size=(2, 2), layout=layout) + y = relay.cast(y, "int32") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(N, CI, H, W, CO, KH, KW, OH, OW, src_layout, dst_layout, max_pool_layout=None): + layout_map = {"src": {}, "dst": {}} + if src_layout == "NCHW": + nchw = layout_map["src"] + nhwc = layout_map["dst"] + else: + nchw = layout_map["dst"] + nhwc = layout_map["src"] + + nchw["data_layout"] = "NCHW" + nchw["data_shape"] = (N, CI, H, W) + nchw["offset_shape"] = (N, KH * KW * 2, OH, OW) + nchw["weight_shape"] = (CO, CI, KH, KW) + nchw["kernel_layout"] = "OIHW" + + nhwc["data_layout"] = "NHWC" + nhwc["data_shape"] = (N, H, W, CI) + nhwc["offset_shape"] = (N, OH, OW, KH * KW * 2) + nhwc["weight_shape"] = (KH, KW, CI, CO) + nhwc["kernel_layout"] = "HWIO" + + bias_shape = (CO,) + + data = relay.var("data", shape=layout_map["src"]["data_shape"], dtype="float32") + offset = relay.var("offset", shape=layout_map["src"]["offset_shape"], dtype="float32") + weight = relay.var("weight", shape=layout_map["src"]["weight_shape"], dtype="float32") + bias = relay.var("bias", shape=bias_shape, dtype="float32") + + data = relay.layout_transform( + data, 
layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"] + ) + offset = relay.layout_transform( + offset, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"] + ) + weight = relay.layout_transform( + weight, layout_map["src"]["kernel_layout"], layout_map["dst"]["kernel_layout"] + ) + y = relay.nn.deformable_conv2d( + data, + offset, + weight, + kernel_size=(KH, KW), + channels=CO, + data_layout=layout_map["dst"]["data_layout"], + kernel_layout=layout_map["dst"]["kernel_layout"], + ) + if layout_map["src"]["data_layout"] == "NHWC": + bias = relay.expand_dims(bias, axis=0, num_newaxis=3) + else: + bias = relay.expand_dims(bias, axis=1, num_newaxis=2) + bias = relay.expand_dims(bias, axis=0) + bias = relay.layout_transform( + bias, layout_map["src"]["data_layout"], layout_map["dst"]["data_layout"] + ) + y = relay.add(y, bias) + y = relay.nn.relu(y) + if max_pool_layout != layout_map["dst"]["data_layout"]: + y = relay.layout_transform(y, layout_map["dst"]["data_layout"], max_pool_layout) + y = relay.nn.max_pool2d( + y, pool_size=(2, 2), layout=max_pool_layout, out_layout=max_pool_layout + ) + y = relay.cast(y, "int32") + y = relay.nn.batch_flatten(y) + y = relay.Function(analysis.free_vars(y), y) + return y + + # NHWC -> NCHW + a = before(1, 3, 224, 224, 32, 3, 3, "NHWC") + a = run_opt_pass( + a, + transform.ConvertLayout( + {"nn.deformable_conv2d": ["NCHW", "default"], "nn.max_pool2d": ["NHWC"]} + ), + ) + # - in the before() func, its last argument "NHWC" is also the layout of max_pool + b = run_opt_pass( + # max_pool has its own layout argument + expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NHWC", "NCHW", max_pool_layout="NHWC"), + transform.InferType(), + ) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\n\n Expected = \n" + str(b) + + # NCHW -> NHWC + a = before(1, 3, 224, 224, 32, 3, 3, "NCHW") + a = run_opt_pass( + a, + transform.ConvertLayout( + {"nn.deformable_conv2d": ["NHWC", "default"], "nn.max_pool2d": ["NCHW"]} 
+ ), + ) + # - in the before() func, its last argument "NCHW" is also the layout of max_pool + b = run_opt_pass( + # max_pool has its own layout argument + expected(1, 3, 224, 224, 32, 3, 3, 222, 222, "NCHW", "NHWC", max_pool_layout="NCHW"), + transform.InferType(), + ) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\n\n Expected = \n" + str(b) + + +def test_resnet_pool_uses_specified_convert_layout(): + relay.op.get("nn.global_max_pool2d").reset_attr("FTVMConvertOpLayout") + + @tvm.ir.register_op_attr("nn.global_max_pool2d", "FTVMConvertOpLayout") + def convert_global_maxpool2d(attrs, inputs, tinfos, desired_layouts): + # stick by convertng layout and out_layout to use NHWC and NHWC, + # respectively, as specified in the transforms.ConvertLayout() function's arguments later + new_attrs = dict(attrs) + new_attrs["layout"] = str(desired_layouts[0]) + new_attrs["out_layout"] = str(desired_layouts[0]) + return relay.nn.global_max_pool2d(*inputs, **new_attrs) + + def before(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + weight2 = relay.var("weight2", shape=(1, 1, 64, 32)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.nn.relu(y) + y2 = relay.nn.conv2d( + x, weight2, channels=32, kernel_size=(1, 1), data_layout="NHWC", kernel_layout="HWIO" + ) + y2 = relay.nn.relu(y2) + y = y + y2 + y = relay.nn.global_max_pool2d(y, layout="NHWC") + return relay.Function(analysis.free_vars(y), y) + + def expected(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + weight2 = relay.var("weight2", shape=(1, 1, 64, 32)) + weight1 = relay.layout_transform(weight1, "HWIO", "OIHW") + weight2 = relay.layout_transform(weight2, "HWIO", "OIHW") + x = relay.layout_transform(x, "NHWC", "NCHW") + y = relay.nn.conv2d(x, weight1, channels=32, kernel_size=(3, 3), 
padding=(1, 1)) + y = relay.nn.relu(y) + y2 = relay.nn.conv2d(x, weight2, channels=32, kernel_size=(1, 1)) + y2 = relay.nn.relu(y2) + y = y + y2 + y = relay.layout_transform(y, "NCHW", "NHWC") + y = relay.nn.global_max_pool2d(y, layout="NHWC", out_layout="NHWC") + return relay.Function(analysis.free_vars(y), y) + + a = before() + a = run_opt_pass( + a, + transform.ConvertLayout( + {"nn.conv2d": ["NCHW", "default"], "nn.global_max_pool2d": ["NHWC"]} + ), + ) + b = run_opt_pass(expected(), transform.InferType()) + + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "\n\n Expected = \n" + str(b) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index d9411c92c375..4a9b7eacdf65 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -16,6 +16,7 @@ # under the License. import pytest +import os import numpy as np import tvm import tvm.testing @@ -76,11 +77,11 @@ def get_manual_conf(mods, target): # The third output is the final output, the second output is for mod3, the first output # is for mod2 input. 
pipe_config1 = { - "mod_idx": 1, + "mod_idx": 0, "output": [ - {"output_idx": 0, "dependent": [{"mod_idx": 2, "input_name": "data_0"}]}, - {"output_idx": 1, "dependent": [{"mod_idx": 3, "input_name": "data_0"}]}, - {"output_idx": 2, "dependent": [{"mod_idx": 0, "input_name": "0"}]}, + {"output_idx": 0, "dependencies": [{"mod_idx": 1, "input_name": "data_0"}]}, + {"output_idx": 1, "dependencies": [{"mod_idx": 2, "input_name": "data_0"}]}, + {"output_idx": 2, "dependencies": [{"global_output_index": 0}]}, ], } mod_config[mods[0]] = { @@ -94,9 +95,9 @@ def get_manual_conf(mods, target): } pipe_config2 = { - "mod_idx": 2, + "mod_idx": 1, "output": [ - {"output_idx": 0, "dependent": [{"mod_idx": 3, "input_name": "data_1"}]}, + {"output_idx": 0, "dependencies": [{"mod_idx": 2, "input_name": "data_1"}]}, ], } mod_config[mods[1]] = { @@ -110,8 +111,8 @@ def get_manual_conf(mods, target): } pipe_config3 = { - "mod_idx": 3, - "output": [{"output_idx": 0, "dependent": [{"mod_idx": 0, "input_name": "1"}]}], + "mod_idx": 2, + "output": [{"output_idx": 0, "dependencies": [{"global_output_index": 1}]}], } mod_config[mods[2]] = { "pipeline": pipe_config3, @@ -128,7 +129,7 @@ def get_manual_conf(mods, target): def test_pipe_config_check(): # This function is used to trigger runtime error by applying wrong logic connection. - # Get the three pipeline modules here. + # Get three pipeline modules here. (mod1, mod2, mod3), dshape = get_mannual_mod() # The input or output name is illegal and expects a runtime error. @@ -179,10 +180,12 @@ def test_pipeline(): pipe_config = pipeline_executor.PipelineConfig() - # The global input named "data_0" will be connected to a input named "data_0" of mod1. + # The pipeline input named "data_0" will be connected to a input named "data_0" + # of mod1. pipe_config["input"]["data_0"].connect(pipe_config[mod1]["input"]["data_0"]) - # The global Input named "data_1" will be connected to a input named "data_1" of mod2. 
+ # The pipeline Input named "data_1" will be connected to a input named "data_1" + # of mod2. pipe_config["input"]["data_1"].connect(pipe_config[mod2]["input"]["data_1"]) # The mod1 output[0] will be connected to a input named "data_0" of mod2. @@ -194,10 +197,10 @@ def test_pipeline(): # The mod2 output[2] will be connected to a input named "data_1" of mod3. pipe_config[mod2]["output"][0].connect(pipe_config[mod3]["input"]["data_1"]) - # The mod1 output[2] will be connected to global output[1]. + # The mod1 output[2] will be connected to pipeline output[0]. pipe_config[mod1]["output"][2].connect(pipe_config["output"]["0"]) - # The mod3 output[0] will be connected to global output[2]. + # The mod3 output[0] will be connected to pipeline output[1]. pipe_config[mod3]["output"][0].connect(pipe_config["output"]["1"]) # Print configueration (print(pipe_config)), the result looks like following. # @@ -231,9 +234,21 @@ def test_pipeline(): with tvm.transform.PassContext(opt_level=3): pipeline_mod_factory = pipeline_executor.build(pipe_config) + # Export the parameter configuration to a file. + directory_path = tvm.contrib.utils.tempdir().temp_dir + # If the directory does not exist, create it. + if not os.path.exists(directory_path): + os.makedirs(directory_path) + config_file_name = pipeline_mod_factory.export_library(directory_path) + + # Use the output of build to create and initialize PipelineModule. pipeline_module = pipeline_executor.PipelineModule(pipeline_mod_factory) assert pipeline_module + # Use the import function to create and initialize PipelineModule. 
+ pipeline_module_test = pipeline_executor.PipelineModule.load_library(config_file_name) + assert pipeline_module_test.num_outputs == 2 + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_target_hooks.py b/tests/python/relay/test_target_hooks.py index 4d7a7fcdc15b..5856dc1e1c69 100644 --- a/tests/python/relay/test_target_hooks.py +++ b/tests/python/relay/test_target_hooks.py @@ -49,5 +49,28 @@ def test_tir_external_generation(check_result): check_result(func, inputs, (8,), x_data - y_data) +@pytest.mark.parametrize("check_result", [check_aot_executor_result, check_graph_executor_result]) +def test_runtime_module_generation(check_result): + shape = (8,) + x_data = np.random.randint(255, size=shape).astype("float32") + y_data = np.random.randint(255, size=shape).astype("float32") + inputs = {"x": x_data, "y": y_data} + + x0 = relay.var("x0", shape=shape, dtype="float32") + y0 = relay.var("y0", shape=shape, dtype="float32") + z = x0 + y0 + func = relay.Function([x0, y0], z) + func = set_external_func_attr(func, "example_target_hook", "replace_add_with_subtract") + # Test hook to trigger TIRToRuntime code generation + func = func.with_attr("tir_to_runtime", True) + + x = relay.var("x", shape=(8,), dtype="float32") + y = relay.var("y", shape=(8,), dtype="float32") + call = relay.Call(func, [x, y]) + func = IRModule.from_expr(call) + + check_result(func, inputs, (8,), x_data * y_data) + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_autotvm_graph_tuner_utils.py b/tests/python/unittest/test_autotvm_graph_tuner_utils.py index 3f6d3980ee28..583bd366847c 100644 --- a/tests/python/unittest/test_autotvm_graph_tuner_utils.py +++ b/tests/python/unittest/test_autotvm_graph_tuner_utils.py @@ -20,6 +20,8 @@ # helps avoid topi arithmetic operator overloading issue: # https://github.com/apache/tvm/issues/3240 # TODO: restore the file name after this issue is resolved. 
+import pytest + import tvm from tvm import te @@ -34,6 +36,7 @@ bind_inputs, ) from tvm.autotvm.graph_tuner._base import OPT_OUT_OP +from tvm.autotvm.graph_tuner.utils.traverse_graph import _replace_device_with_tracing from tvm.relay.expr import Call, TupleGetItem, Tuple, Var @@ -57,7 +60,7 @@ def test_has_multiple_inputs(): target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} - expr2graph(net, target_ops, node_dict, node_list) + expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm")) input_names = ["data"] verify_has_multiple_inputs(node_list, 2, input_names, False) verify_has_multiple_inputs(node_list, 4, input_names, False) @@ -79,7 +82,7 @@ def _count_node(node): relay.analysis.post_order_visit(mod["main"], _count_node) - expr2graph(mod["main"], target_ops, node_dict, node_list) + expr2graph(mod["main"], target_ops, node_dict, node_list, tvm.target.Target("llvm")) assert len(node_list) == len(op_name_list) for i, item in enumerate(zip(op_name_list, node_list)): op_name, node = item @@ -103,7 +106,7 @@ def test_get_direct_ancestor(): target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} - expr2graph(net, target_ops, node_dict, node_list) + expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm")) visited_dict = {} input_names = ["data"] out = get_direct_ancestor(node_list, visited_dict, target_ops, 5, input_names) @@ -115,7 +118,7 @@ def test_get_direct_ancestor(): net = bind_inputs(net, {"data": (1, 16, 224, 224)}) node_list = [] node_dict = {} - expr2graph(net, target_ops, node_dict, node_list) + expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm")) out = get_direct_ancestor(node_list, visited_dict, target_ops, 3, input_names) assert out == [0], "Output mismatch: expecting [0] but got %s." 
% str(out) @@ -134,7 +137,7 @@ def test_get_in_nodes(): input_names = ["data"] node_list = [] node_dict = {} - expr2graph(net, target_ops, node_dict, node_list) + expr2graph(net, target_ops, node_dict, node_list, tvm.target.Target("llvm")) out = get_in_nodes(node_list, target_ops, input_names) expected_out = {3: [0], 4: [3, 0], 7: [4]} diff_set = set(out) ^ set(expected_out) @@ -155,6 +158,20 @@ def test_get_out_nodes(): ) +def test_target_device_replacement(): + assert _replace_device_with_tracing("cuda") == "cuda -device=tracing" + assert ( + _replace_device_with_tracing("cuda -device=some_device -libs=cudnn") + == "cuda -device=tracing -libs=cudnn" + ) + assert ( + _replace_device_with_tracing("llvm -device=arm_cpu -arg=xxx") + == "llvm -device=tracing -arg=xxx" + ) + assert _replace_device_with_tracing("llvm -device=arm_cpu") == "llvm -device=tracing" + assert _replace_device_with_tracing("llvm -device=abc, def") == "llvm -device=tracing" + + if __name__ == "__main__": test_has_multiple_inputs() test_expr2graph() diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 62e68ab01ce5..9450a937a155 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -35,6 +35,7 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.testing import conv2d_nchw_python +pytest.importorskip("tvm.micro.testing") from tvm.micro.testing import check_tune_log BUILD = True diff --git a/tests/python/unittest/test_lower_build.py b/tests/python/unittest/test_lower_build.py index 6502f0c67de6..fabf41705698 100644 --- a/tests/python/unittest/test_lower_build.py +++ b/tests/python/unittest/test_lower_build.py @@ -41,10 +41,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) - for k 
in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + for k in range(128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] diff --git a/tests/python/unittest/test_meta_schedule_arg_info.py b/tests/python/unittest/test_meta_schedule_arg_info.py index 7bedea9082d1..62dcb52f7415 100644 --- a/tests/python/unittest/test_meta_schedule_arg_info.py +++ b/tests/python/unittest/test_meta_schedule_arg_info.py @@ -28,10 +28,12 @@ def Matmul(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 256), "float32") B = T.match_buffer(b, (256, 512), "float32") C = T.match_buffer(c, (128, 512), "float32") - with T.block([128, 256, T.reduce_axis(0, 512)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(128, 256, 512): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] # fmt: on # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument diff --git a/tests/python/unittest/test_meta_schedule_builder.py b/tests/python/unittest/test_meta_schedule_builder.py index fa09a092c8c4..fb3fa135a9b8 100644 --- a/tests/python/unittest/test_meta_schedule_builder.py +++ b/tests/python/unittest/test_meta_schedule_builder.py @@ -47,10 +47,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no A = T.match_buffer(a, (1024, 1024), "float32") B = T.match_buffer(b, (1024, 1024), "float32") C = T.match_buffer(c, (1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): 
+ C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] @script.ir_module @@ -64,12 +66,16 @@ def matmul_relu( # pylint: disable=no-self-argument B = T.match_buffer(b, (1024, 1024), "float32") D = T.match_buffer(d, (1024, 1024), "float32") C = T.alloc_buffer((1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] - with T.block([1024, 1024], "relu") as [vi, vj]: - D[vi, vj] = T.max(C[vi, vj], 0.0) + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j in T.grid(1024, 1024): + with T.block("relu"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = T.max(C[vi, vj], 0.0) @script.ir_module @@ -82,10 +88,12 @@ def batch_matmul( # pylint: disable=no-self-argument A = T.match_buffer(a, [16, 128, 128]) B = T.match_buffer(b, [16, 128, 128]) C = T.match_buffer(c, [16, 128, 128]) - with T.block([16, 128, 128, T.reduce_axis(0, 128)], "update") as [vn, vi, vj, vk]: - with T.init(): - C[vn, vi, vj] = 0.0 - C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] + for n, i, j, k in T.grid(16, 128, 128, 128): + with T.block("update"): + vn, vi, vj, vk = T.axis.remap("SSSR", [n, i, j, k]) + with T.init(): + C[vn, vi, vj] = 0.0 + C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py index cb39c91eaca4..121ec2fd480b 100644 --- a/tests/python/unittest/test_meta_schedule_database.py +++ b/tests/python/unittest/test_meta_schedule_database.py @@ -41,10 +41,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (1024, 1024), "float32") B = 
T.match_buffer(b, (1024, 1024), "float32") C = T.match_buffer(c, (1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] @tvm.script.ir_module @@ -56,12 +58,16 @@ def main(a: T.handle, b: T.handle, d: T.handle) -> None: # pylint: disable=no-s B = T.match_buffer(b, (16, 16), "float32") D = T.match_buffer(d, (16, 16), "float32") C = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16, T.reduce_axis(0, 16)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] - with T.block([16, 16], "relu") as [vi, vj]: - D[vi, vj] = T.max(C[vi, vj], 0.0) + for i, j, k in T.grid(16, 16, 16): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j in T.grid(16, 16): + with T.block("relu"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = T.max(C[vi, vj], 0.0) # fmt: on diff --git a/tests/python/unittest/test_meta_schedule_runner.py b/tests/python/unittest/test_meta_schedule_runner.py index 9fb1e5ef19c1..46be12569c78 100644 --- a/tests/python/unittest/test_meta_schedule_runner.py +++ b/tests/python/unittest/test_meta_schedule_runner.py @@ -68,10 +68,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, (16, 16), "float32") B = T.match_buffer(b, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") - with T.block([16, 16, T.reduce_axis(0, 16)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(16, 16, 16): + with T.block("matmul"): + vi, vj, 
vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] @tvm.script.ir_module @@ -83,12 +85,16 @@ def main(a: T.handle, b: T.handle, d: T.handle) -> None: # pylint: disable=no-s B = T.match_buffer(b, (16, 16), "float32") D = T.match_buffer(d, (16, 16), "float32") C = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16, T.reduce_axis(0, 16)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] - with T.block([16, 16], "relu") as [vi, vj]: - D[vi, vj] = T.max(C[vi, vj], 0.0) + for i, j, k in T.grid(16, 16, 16): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j in T.grid(16, 16): + with T.block("relu"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = T.max(C[vi, vj], 0.0) @tvm.script.ir_module @@ -99,10 +105,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, [16, 32, 32]) B = T.match_buffer(b, [16, 32, 32]) C = T.match_buffer(c, [16, 32, 32]) - with T.block([16, 32, 32, T.reduce_axis(0, 32)], "update") as [vn, vi, vj, vk]: - with T.init(): - C[vn, vi, vj] = 0.0 - C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] + for n, i, j, k in T.grid(16, 32, 32, 32): + with T.block("update"): + vn, vi, vj, vk = T.axis.remap("SSSR", [n, i, j, k]) + with T.init(): + C[vn, vi, vj] = 0.0 + C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] @tvm.script.ir_module @@ -113,8 +121,10 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, [32], "float32") B = T.match_buffer(b, [32], "float32") C = T.match_buffer(c, [32], "float32") - with T.block([32], "add") as [vi]: - C[vi] = A[vi] + B[vi] + for i in range(32): + with T.block("add"): + vi = T.axis.S(32, i) + C[vi] = A[vi] + B[vi] # pylint: 
enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring diff --git a/tests/python/unittest/test_meta_schedule_search_strategy.py b/tests/python/unittest/test_meta_schedule_search_strategy.py index e12871391558..9b3ddfd7c789 100644 --- a/tests/python/unittest/test_meta_schedule_search_strategy.py +++ b/tests/python/unittest/test_meta_schedule_search_strategy.py @@ -45,10 +45,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (32, 32), "float32") B = T.match_buffer(b, (32, 32), "float32") C = T.match_buffer(c, (32, 32), "float32") - with T.block([32, 32, T.reduce_axis(0, 32)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(32, 32, 32): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] # fmt: on # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument diff --git a/tests/python/unittest/test_meta_schedule_space_generator.py b/tests/python/unittest/test_meta_schedule_space_generator.py index 39bb1acf065f..3f7749ca9e2c 100644 --- a/tests/python/unittest/test_meta_schedule_space_generator.py +++ b/tests/python/unittest/test_meta_schedule_space_generator.py @@ -40,10 +40,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (1024, 1024), "float32") B = T.match_buffer(b, (1024, 1024), "float32") C = T.match_buffer(c, (1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] # fmt: on # pylint: 
enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,no-self-argument diff --git a/tests/python/unittest/test_meta_schedule_task_scheduler.py b/tests/python/unittest/test_meta_schedule_task_scheduler.py index a30409696543..4854aeb5f5aa 100644 --- a/tests/python/unittest/test_meta_schedule_task_scheduler.py +++ b/tests/python/unittest/test_meta_schedule_task_scheduler.py @@ -48,10 +48,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, (1024, 1024), "float32") B = T.match_buffer(b, (1024, 1024), "float32") C = T.match_buffer(c, (1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] @tvm.script.ir_module @@ -63,12 +65,16 @@ def main(a: T.handle, b: T.handle, d: T.handle) -> None: # pylint: disable=no-s B = T.match_buffer(b, (1024, 1024), "float32") D = T.match_buffer(d, (1024, 1024), "float32") C = T.alloc_buffer((1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] - with T.block([1024, 1024], "relu") as [vi, vj]: - D[vi, vj] = T.max(C[vi, vj], 0.0) + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j in T.grid(1024, 1024): + with T.block("relu"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = T.max(C[vi, vj], 0.0) @tvm.script.ir_module @@ -79,10 +85,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, [16, 128, 128]) B = 
T.match_buffer(b, [16, 128, 128]) C = T.match_buffer(c, [16, 128, 128]) - with T.block([16, 128, 128, T.reduce_axis(0, 128)], "matmul") as [vn, vi, vj, vk]: - with T.init(): - C[vn, vi, vj] = 0.0 - C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] + for n, i, j, k in T.grid(16, 128, 128, 128): + with T.block("matmul"): + vn, vi, vj, vk = T.axis.remap("SSSR", [n, i, j, k]) + with T.init(): + C[vn, vi, vj] = 0.0 + C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk] # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks diff --git a/tests/python/unittest/test_meta_schedule_tune_context.py b/tests/python/unittest/test_meta_schedule_tune_context.py index 44bb949b925b..01a4379e5127 100644 --- a/tests/python/unittest/test_meta_schedule_tune_context.py +++ b/tests/python/unittest/test_meta_schedule_tune_context.py @@ -35,10 +35,12 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-s A = T.match_buffer(a, (1024, 1024), "float32") B = T.match_buffer(b, (1024, 1024), "float32") C = T.match_buffer(c, (1024, 1024), "float32") - with T.block([1024, 1024, T.reduce_axis(0, 1024)], "matmul") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] # pylint: enable=invalid-name,no-member,line-too-long,too-many-nested-blocks,missing-docstring diff --git a/tests/python/unittest/test_micro_project_api.py b/tests/python/unittest/test_micro_project_api.py index b5e2a57c122c..e319318656ef 100644 --- a/tests/python/unittest/test_micro_project_api.py +++ b/tests/python/unittest/test_micro_project_api.py @@ -25,6 +25,8 @@ import pytest import tvm + +pytest.importorskip("tvm.micro") from tvm.micro import project_api diff --git a/tests/python/unittest/test_runtime_profiling.py 
b/tests/python/unittest/test_runtime_profiling.py index ca6cb0181489..3e38a526855a 100644 --- a/tests/python/unittest/test_runtime_profiling.py +++ b/tests/python/unittest/test_runtime_profiling.py @@ -52,15 +52,20 @@ def read_csv(report): @pytest.mark.skipif(not profiler_vm.enabled(), reason="VM Profiler not enabled") @tvm.testing.parametrize_targets def test_vm(target, dev): - mod, params = mlp.get_workload(1) - - exe = relay.vm.compile(mod, target, params=params) + dtype = "float32" + x = relay.var("x", shape=(relay.Any(), relay.Any()), dtype=dtype) + y = relay.var("y", shape=(relay.Any(), relay.Any()), dtype=dtype) + mod = tvm.IRModule() + mod["main"] = relay.Function([x, y], relay.add(x, y)) + exe = relay.vm.compile(mod, target) vm = profiler_vm.VirtualMachineProfiler(exe, dev) - data = np.random.rand(1, 1, 28, 28).astype("float32") - report = vm.profile(data, func_name="main") - assert "fused_nn_softmax" in str(report) + data = np.random.rand(28, 28).astype("float32") + report = vm.profile(data, data, func_name="main") + assert "fused_add" in str(report) assert "Total" in str(report) + assert "AllocTensorReg" in str(report) + assert "AllocStorage" in str(report) csv = read_csv(report) assert "Hash" in csv.keys() diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index 22aea8d1fcea..6e1fc815d66d 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -29,6 +29,7 @@ from tvm import rpc from tvm.contrib import utils, cc from tvm.rpc.tracker import Tracker +from tvm.rpc.proxy import Proxy if __name__ == "__main__": @@ -538,3 +539,46 @@ def test_rpc_tracker_request(): proc2.join() server.terminate() tracker.terminate() + + +@tvm.testing.requires_rpc +def test_rpc_tracker_via_proxy(): + """ + tracker + / \ + Host -- Proxy -- RPC server + """ + + device_key = "test_device" + + tracker_server = Tracker(port=9000, port_end=9100) + proxy_server = Proxy( + 
host=tracker_server.host, + port=8888, + port_end=8988, + tracker_addr=(tracker_server.host, tracker_server.port), + ) + + server1 = rpc.Server( + host=proxy_server.host, + port=proxy_server.port, + key=device_key, + tracker_addr=(tracker_server.host, tracker_server.port), + is_proxy=True, + ) + server2 = rpc.Server( + host=proxy_server.host, + port=proxy_server.port, + key=device_key, + tracker_addr=(tracker_server.host, tracker_server.port), + is_proxy=True, + ) + + client = rpc.connect_tracker(tracker_server.host, tracker_server.port) + remote1 = client.request(device_key, session_timeout=30) # pylint: disable=unused-variable + remote2 = client.request(device_key, session_timeout=30) # pylint: disable=unused-variable + + server2.terminate() + server1.terminate() + proxy_server.terminate() + tracker_server.terminate() diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index 8c8d601672ac..5a1b33ae10b1 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -885,5 +885,21 @@ def check_llvm(use_file): check_llvm(use_file=False) +@tvm.testing.requires_llvm +def test_llvm_scalar_concat(): + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + z = tvm.tir.decl_buffer((1,), "int32x2") + s = tvm.tir.Shuffle([x, y], [0, 1]) + f = tvm.tir.PrimFunc([x, y, z], z.vstore(0, s)) + + mod = tvm.ir.IRModule.from_expr(f.with_attr("global_symbol", "codegen_scalar_concat")) + + # This will crash in LLVM codegen if CodeGenLLVM::CreateVecConcat doesn't convert + # scalars to single-lane LLVM vectors. 
+ with tvm.transform.PassContext(config={"tir.disable_assert": True}): + m = tvm.build(mod, [x, y, z], target="llvm") + + if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_te_create_primfunc.py b/tests/python/unittest/test_te_create_primfunc.py index 987898001a1b..6b5c26d08b7b 100644 --- a/tests/python/unittest/test_te_create_primfunc.py +++ b/tests/python/unittest/test_te_create_primfunc.py @@ -54,10 +54,12 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128, T.reduce_axis(0, 128)]) as [i, j, k]: - with T.init(): - C[i, j] = 0.0 - C[i, j] += A[i, k] * B[j, k] + for i0, j0, k0 in T.grid(128, 128, 128): + with T.block(): + i, j, k = T.axis.remap("SSR", [i0, j0, k0]) + with T.init(): + C[i, j] = 0.0 + C[i, j] += A[i, k] * B[j, k] def test_matmul(): @@ -77,10 +79,14 @@ def tir_element_wise(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128)) B = T.alloc_buffer((128, 128)) - with T.block([128, 128]) as [i, j]: - B[i, j] = A[i, j] * 2.0 - with T.block([128, 128]) as [i, j]: - C[i, j] = B[i, j] + 1.0 + for i0, j0 in T.grid(128, 128): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) + B[i, j] = A[i, j] * 2.0 + for i0, j0 in T.grid(128, 128): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) + C[i, j] = B[i, j] + 1.0 def test_element_wise(): @@ -125,19 +131,21 @@ def tir_conv2d(a: T.handle, w: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [16, 32, 14, 14]) Apad = T.alloc_buffer([16, 16, 16, 16]) - with T.block([16, 16, 16, 16], "Apad") as [nn, cc, yy, xx]: - Apad[nn, cc, yy, xx] = T.if_then_else( - yy >= 1 and yy - 1 < 14 and xx >= 1 and xx - 1 < 14, - A[nn, cc, yy - 1, xx - 1], - 0.0, - dtype="float32", - ) - with T.block( - [16, 32, 14, 14, T.reduce_axis(0, 16), T.reduce_axis(0, 3), T.reduce_axis(0, 3)], "B" - ) as [nn, ff, yy, xx, rc, ry, rx]: - with T.init(): - 
B[nn, ff, yy, xx] = 0.0 - B[nn, ff, yy, xx] += Apad[nn, rc, yy + ry, xx + rx] * W[rc, ry, rx, ff] + for n, c, y, x in T.grid(16, 16, 16, 16): + with T.block("Apad"): + nn, cc, yy, xx = T.axis.remap("SSSS", [n, c, y, x]) + Apad[nn, cc, yy, xx] = T.if_then_else( + yy >= 1 and yy - 1 < 14 and xx >= 1 and xx - 1 < 14, + A[nn, cc, yy - 1, xx - 1], + 0.0, + dtype="float32", + ) + for n, f, y, x, kc, ky, kx in T.grid(16, 32, 14, 14, 16, 3, 3): + with T.block("B"): + nn, ff, yy, xx, rc, ry, rx = T.axis.remap("SSSSRRR", [n, f, y, x, kc, ky, kx]) + with T.init(): + B[nn, ff, yy, xx] = 0.0 + B[nn, ff, yy, xx] += Apad[nn, rc, yy + ry, xx + rx] * W[rc, ry, rx, ff] def test_conv2d(): @@ -163,9 +171,11 @@ def tir_multi_output(a0: T.handle, a1: T.handle, b0: T.handle, b1: T.handle) -> B1 = T.match_buffer(b1, (m, n)) for i0, i1 in T.grid(m, n): - with T.block([m, n], "B.v0") as [i, j]: + with T.block("B.v0"): + i, j = T.axis.remap("SS", [i0, i1]) B0[i, j] = A0[i, j] + 2.0 - with T.block([m, n], "B.v1") as [i, j]: + with T.block("B.v1"): + i, j = T.axis.remap("SS", [i0, i1]) B1[i, j] = A1[i, j] * 3.0 @@ -193,7 +203,7 @@ def tir_extern(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) # body - with T.block([], "C"): + with T.block("C"): T.reads([A[0:128, 0:128], B[0:128, 0:128]]) T.writes([C[0:128, 0:128]]) T.evaluate( @@ -251,10 +261,12 @@ def tir_reordered_matmul(c: T.handle, a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128, T.reduce_axis(0, 128)]) as [i, j, k]: - with T.init(): - C[i, j] = 0.0 - C[i, j] += A[i, k] * B[j, k] + for i0, j0, k0 in T.grid(128, 128, 128): + with T.block(): + i, j, k = T.axis.remap("SSR", [i0, j0, k0]) + with T.init(): + C[i, j] = 0.0 + C[i, j] += A[i, k] * B[j, k] def test_arg_order(): diff --git a/tests/python/unittest/test_te_schedule_ops.py b/tests/python/unittest/test_te_schedule_ops.py index 
bc4bc4f56e19..ca3ab3aade98 100644 --- a/tests/python/unittest/test_te_schedule_ops.py +++ b/tests/python/unittest/test_te_schedule_ops.py @@ -14,9 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np + import tvm from tvm import te -import numpy as np +from tvm.driver.build_module import schedule_to_module def test_schedule0(): @@ -26,11 +28,8 @@ def test_schedule0(): A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") s = te.create_schedule(A1.op) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A1], stmt, None) - assert isinstance(func, tvm.tir.PrimFunc) + mod = schedule_to_module(s, [A, A1]) + assert isinstance(mod["main"], tvm.tir.PrimFunc) def test_schedule1(): @@ -42,12 +41,9 @@ def test_schedule1(): s = te.create_schedule(A1.op) xo, xi = s[A1].split(A1.op.axis[0], 8) s[A1].pragma(xo, "auto_unroll_max_step", 10) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A1], stmt, None) - assert isinstance(func, tvm.tir.PrimFunc) + mod = schedule_to_module(s, [A, A1]) + assert isinstance(mod["main"], tvm.tir.PrimFunc) def test_schedule2(): @@ -60,11 +56,9 @@ def test_schedule2(): s = te.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], 8) s[A1].compute_at(s[A2], xo) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None) - assert isinstance(func, tvm.tir.PrimFunc) + + mod = schedule_to_module(s, [A, A2]) + assert isinstance(mod["main"], tvm.tir.PrimFunc) def test_schedule_scan(): diff --git 
a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py index 1aae8cdd03e1..1a0dfd09a2df 100644 --- a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py +++ b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py @@ -25,17 +25,23 @@ def buffer_load_store_func(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128), "float32") C = T.alloc_buffer((128, 128), "float32") D = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128]) as [i, j]: - A[i, j] = T.float32(0) - with T.block([32, 32, T.reduce_axis(0, 32)]) as [i, j, k]: - with T.init(): + for ii, jj in T.grid(128, 128): + with T.block(): + i, j = T.axis.remap("SS", [ii, jj]) + A[i, j] = T.float32(0) + for i0, j0, k0 in T.grid(32, 32, 32): + with T.block(): + i, j, k = T.axis.remap("SSR", [i0, j0, k0]) + with T.init(): + for ii, jj in T.grid(4, 4): + B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] for ii, jj in T.grid(4, 4): - B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] - for ii, jj in T.grid(4, 4): - for kk in range(0, 4): - B[i * 4 + ii, j * 4 + jj] += C[i * 4 + ii, k * 4 + kk] - for kk in range(0, 4): - B[i * 4 + ii, j * 4 + jj] += D[j * 4 + jj, k * 4 + kk] * C[i * 4 + ii, k * 4 + kk] + for kk in range(0, 4): + B[i * 4 + ii, j * 4 + jj] += C[i * 4 + ii, k * 4 + kk] + for kk in range(0, 4): + B[i * 4 + ii, j * 4 + jj] += ( + D[j * 4 + jj, k * 4 + kk] * C[i * 4 + ii, k * 4 + kk] + ) @T.prim_func @@ -43,7 +49,7 @@ def buffer_opaque_access(b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [16, 16], "float32") C = T.match_buffer(c, [16, 16], "float32") - with T.block([]): + with T.block(): T.reads([]) T.writes(B[0:16, 0:16]) A = T.allocate([256], "float32", "global") @@ -56,9 +62,8 @@ def buffer_opaque_access(b: T.handle, c: T.handle) -> None: T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, T.float32(0), dtype="handle")) for i, j in T.grid(16, 16): - with 
T.block([16, 16]) as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] @@ -72,16 +77,20 @@ def lca_is_func_root(a: T.handle) -> None: def match_buffer_func(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.match_buffer(b, (128, 128), "float32") - with T.block([8, 8], "block") as [vi, vj]: - T.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16]) - T.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - B0 = T.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4)) - B1 = T.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8)) - with T.block([16, 16], "AAA") as [i, j]: - AA = T.match_buffer(A[i, j], ()) - AA[()] = 1.0 - T.evaluate(B0.data) - T.evaluate(B1.data) + for i, j in T.grid(8, 8): + with T.block("block"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16]) + T.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + B0 = T.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4)) + B1 = T.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8)) + for ii, jj in T.grid(16, 16): + with T.block("AAA"): + vii, vjj = T.axis.remap("SS", [ii, jj]) + AA = T.match_buffer(A[vii, vjj], ()) + AA[()] = 1.0 + T.evaluate(B0.data) + T.evaluate(B1.data) def test_buffer_load_store(): diff --git a/tests/python/unittest/test_tir_analysis_get_block_access_region.py b/tests/python/unittest/test_tir_analysis_get_block_access_region.py index e3a63c325434..4ea35c0a2d6c 100644 --- a/tests/python/unittest/test_tir_analysis_get_block_access_region.py +++ b/tests/python/unittest/test_tir_analysis_get_block_access_region.py @@ -27,57 +27,65 @@ def func() -> None: B = T.alloc_buffer((128, 128), "float32") C = T.alloc_buffer((128, 128), "float32") D = T.alloc_buffer((128, 128), "float32") - with T.block([]): + with 
T.block(): # Need add read/write region manually to avoid triggering block access region detector T.reads([B[0, 0], C[0:16, 0:16], A[4:12, 4:12]]) T.writes([A[0:12, 0:12]]) for i, j in T.grid(8, 8): A[i, j] = B[0, 0] + C[0, 0] - with T.block([2, 2]) as [vi, vj]: - T.reads([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8], C[12:16, 12:16]]) - T.writes([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8]]) - for i, j in T.grid(4, 4): - A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12] + for i, j in T.grid(2, 2): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8], C[12:16, 12:16]]) + T.writes([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8]]) + for i, j in T.grid(4, 4): + A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12] T.evaluate(D.data) @T.prim_func def match_buffer_func() -> None: - with T.block([], "root"): + with T.block("root"): A = T.alloc_buffer((128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") T.reads([]) T.writes([]) # Need add read/write region manually to avoid triggering block access region detector - with T.block([8, 8], "block") as [vi, vj]: - T.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16]) - T.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - AA = T.match_buffer(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], (16, 16)) - B0 = T.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4)) - B1 = T.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8)) - with T.block([16, 16], "AAA") as [i, j]: - T.reads([]) - T.writes(AA[i, j]) - AAA = T.match_buffer(AA[i, j], ()) - AAA[()] = 1.0 - T.evaluate(B0.data) - T.evaluate(B1.data) + for i, j in T.grid(8, 8): + with T.block("block"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16]) + T.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + AA = T.match_buffer(A[vi * 16 : vi * 16 
+ 16, vj * 16 : vj * 16 + 16], (16, 16)) + B0 = T.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4)) + B1 = T.match_buffer( + B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8) + ) + for ii, jj in T.grid(16, 16): + with T.block("AAA"): + vii, vjj = T.axis.remap("SS", [ii, jj]) + T.reads([]) + T.writes(AA[vii, vjj]) + AAA = T.match_buffer(AA[vii, vjj], ()) + AAA[()] = 1.0 + T.evaluate(B0.data) + T.evaluate(B1.data) @T.prim_func def opaque_block_func() -> None: - with T.block([], "root"): + with T.block("root"): A = T.alloc_buffer((16, 16), "float32") B = T.alloc_buffer((16, 16), "float32") T.reads([]) T.writes([]) # Need add read/write region manually to avoid triggering block access region detector for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes([B[i, 0:16]]) for j in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, j]) T.writes(B[i, j]) B[i, j] = A[i, j] + 1.0 @@ -88,8 +96,8 @@ def opaque_access_func() -> None: A = T.alloc_buffer([1024]) B = T.alloc_buffer([1024]) for i in T.serial(0, 8): - with T.block([8]) as [v]: - T.bind(v, i) + with T.block(): + v = T.axis.S(8, i) T.reads([A[v * 128 : v * 128 + 128]]) T.writes([B[v * 128 : v * 128 + 128]]) T.evaluate( diff --git a/tests/python/unittest/test_tir_lower_match_buffer.py b/tests/python/unittest/test_tir_lower_match_buffer.py index 7129275aebcd..5ca9cf0da3c9 100644 --- a/tests/python/unittest/test_tir_lower_match_buffer.py +++ b/tests/python/unittest/test_tir_lower_match_buffer.py @@ -39,7 +39,7 @@ def buffer_load_store(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16, 16)) C = T.match_buffer(c, (16, 16)) for i, j, k in T.grid(4, 16, 8): - with T.block([]): + with T.block(): T.reads(C[i * 4 : i * 4 + 4, k * 2 : k * 2 + 2]) T.writes(A[i * 4 : i * 4 + 4, j, k * 2 : k * 2 + 2]) sub_A = T.match_buffer( @@ -55,7 +55,7 @@ def transformed_buffer_load_store(a: T.handle, c: T.handle) -> None: A = 
T.match_buffer(a, (16, 16, 16)) C = T.match_buffer(c, (16, 16)) for i, j, k in T.grid(4, 16, 8): - with T.block([]): + with T.block(): T.reads(C[i * 4 : i * 4 + 4, k * 2 : k * 2 + 2]) T.writes(A[i * 4 : i * 4 + 4, j, k * 2 : k * 2 + 2]) for ii, kk in T.grid(4, 2): @@ -72,7 +72,7 @@ def opaque_access(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (32, 64, 128)) B = T.match_buffer(b, (64, 64, 64)) for i, j, k in T.grid(2, 64, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i * 16 : i * 16 + 16, j, k * 16 : k * 16 + 16]) sub_A = T.match_buffer( @@ -93,7 +93,7 @@ def opaque_access(a: T.handle, b: T.handle) -> None: ) ) for i, j, k in T.grid(64, 2, 8): - with T.block([]): + with T.block(): Bs_0 = T.var("int32") Bs_1 = T.var("int32") T.reads([]) @@ -122,7 +122,7 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (32, 64, 128)) B = T.match_buffer(b, (64, 64, 64)) for i, j, k in T.grid(2, 64, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i * 16 : i * 16 + 16, j, k * 16 : k * 16 + 16]) T.evaluate( @@ -137,7 +137,7 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None: ) ) for i, j, k in T.grid(64, 2, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes(B[i, j * 32 : j * 32 + 32, k * 8 : k * 8 + 8]) T.evaluate( @@ -157,7 +157,7 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None: def high_dim_opaque_access(a: T.handle) -> None: A = T.match_buffer(a, (16, 32, 64)) for i, j, k in T.grid(16, 2, 4): - with T.block([]): + with T.block(): As_0 = T.var("int32") As_1 = T.var("int32") T.reads([]) @@ -185,7 +185,7 @@ def high_dim_opaque_access(a: T.handle) -> None: def transformed_high_dim_opaque_access(a: T.handle) -> None: A = T.match_buffer(a, (16, 32, 64)) for i, j, k in T.grid(16, 2, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16]) T.evaluate( @@ -205,7 +205,7 @@ def 
transformed_high_dim_opaque_access(a: T.handle) -> None: def high_dim_opaque_access_with_source_strides(a: T.handle) -> None: A = T.match_buffer(a, (16, 32, 64), strides=[2576, 80, 1]) for i, j, k in T.grid(16, 2, 4): - with T.block([]): + with T.block(): As_0 = T.var("int32") As_1 = T.var("int32") T.reads([]) @@ -233,7 +233,7 @@ def high_dim_opaque_access_with_source_strides(a: T.handle) -> None: def transformed_high_dim_opaque_access_with_source_strides(a: T.handle) -> None: A = T.match_buffer(a, (16, 32, 64), strides=[2576, 80, 1]) for i, j, k in T.grid(16, 2, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16]) T.evaluate( @@ -254,7 +254,7 @@ def recursive_match(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (64, 64, 64)) B = T.match_buffer(b, (64, 64, 64)) for i, j, k in T.grid(64, 4, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes( [ @@ -276,7 +276,7 @@ def recursive_match(a: T.handle, b: T.handle) -> None: offset_factor=1, ) for jj, kk in T.grid(4, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes( [ @@ -317,7 +317,7 @@ def transformed_recursive_match(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (64, 64, 64)) B = T.match_buffer(b, (64, 64, 64)) for i, j, k in T.grid(64, 4, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes( [ @@ -326,7 +326,7 @@ def transformed_recursive_match(a: T.handle, b: T.handle) -> None: ] ) for jj, kk in T.grid(4, 4): - with T.block([]): + with T.block(): T.reads([]) T.writes( [ @@ -362,7 +362,7 @@ def symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32) -> None: A = T.match_buffer(a, (n * m, m)) B = T.match_buffer(b, (n * 2, m * 4)) for i in range(0, n): - with T.block([]): + with T.block(): T.reads([]) T.writes([A[i * m : i * m + n, 0:m], B[i * n : i * n + 2, 0 : m * 4]]) Bs_0 = T.var("int32") @@ -392,7 +392,7 @@ def transformed_symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32) A = 
T.match_buffer(a, (n * m, m)) B = T.match_buffer(b, (n * 2, m * 4)) for i in range(0, n): - with T.block([]): + with T.block(): T.reads([]) T.writes([A[i * m : i * m + n, 0:m], B[i * n : i * n + 2, 0 : m * 4]]) for ii, jj in T.grid(m, m): @@ -416,7 +416,7 @@ def rank0_buffer(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (8, 8)) B = T.match_buffer(b, (8, 8)) for i, j in T.grid(8, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes([A[i, j], B[i, j]]) sub_A = T.match_buffer(A[i, j], (), offset_factor=1) @@ -440,7 +440,7 @@ def transformed_rank0_buffer(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (8, 8)) B = T.match_buffer(b, (8, 8)) for i, j in T.grid(8, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes([A[i, j], B[i, j]]) A[i, j] = 1 @@ -461,7 +461,7 @@ def transformed_rank0_buffer(a: T.handle, b: T.handle) -> None: def fail_match_load(a: T.handle) -> None: A = T.match_buffer(a, (8, 8)) for i, j in T.grid(8, 8): - with T.block([]): + with T.block(): T.reads(A[i, j]) T.writes([]) sub_A = T.match_buffer(A[i, j], ()) @@ -472,7 +472,7 @@ def fail_match_load(a: T.handle) -> None: def fail_match_store(a: T.handle) -> None: A = T.match_buffer(a, (8, 8)) for i, j in T.grid(8, 8): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i, j]) sub_A = T.match_buffer(A[i, j], ()) @@ -483,7 +483,7 @@ def fail_match_store(a: T.handle) -> None: def fail_buffer_bind(a: T.handle) -> None: A = T.match_buffer(a, (8, 8)) for i, j in T.grid(8, 2): - with T.block([]): + with T.block(): stride = T.var("int32") sub_A = T.match_buffer( A[i, j * 4 : j * 4 + 4], (1, 4), strides=[stride, stride], offset_factor=1 @@ -496,7 +496,7 @@ def fail_buffer_bind(a: T.handle) -> None: def fail_match_func_param(a: T.handle, m: T.handle, n: T.handle) -> None: A = T.match_buffer(a, (8, 8)) for i, j in T.grid(8, 2): - with T.block([]): + with T.block(): sub_A = T.match_buffer(A[i, j * 4 : j * 4 + 4], (1, 4), strides=[m, n], offset_factor=1) for jj in 
range(0, 4): sub_A[i, j * 4 + jj] = 1 diff --git a/tests/python/unittest/test_tir_schedule_block_scope.py b/tests/python/unittest/test_tir_schedule_block_scope.py index 2182c7b9f449..ad789a010745 100644 --- a/tests/python/unittest/test_tir_schedule_block_scope.py +++ b/tests/python/unittest/test_tir_schedule_block_scope.py @@ -32,10 +32,14 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -44,10 +48,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) for k in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -58,9 +64,11 @@ def war_dependency(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "C") as [vi, vj]: + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py 
b/tests/python/unittest/test_tir_schedule_cache_read_write.py index ff5b61a135eb..853f44affe5d 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -33,10 +33,14 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -45,20 +49,23 @@ def access_under_scope(b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([8, 8], "scope") as [i, j]: - for x, y in T.grid(16, 16): - with T.block([128, 128], "A") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - A[vi, vj] = 1.0 - for x, y in T.grid(16, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - B[vi, vj] = A[vi, vj] + 1.0 - - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + for i0, j0 in T.grid(8, 8): + with T.block("scope"): + i, j = T.axis.remap("SS", [i0, j0]) + for x, y in T.grid(16, 16): + with T.block("A"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + A[vi, vj] = 1.0 + for x, y in T.grid(16, 16): + with T.block("B"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + B[vi, vj] = A[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -68,76 +75,82 @@ def opaque_access(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None: C = 
T.match_buffer(c, (128, 128), dtype="float16") D = T.match_buffer(d, (128, 128), dtype="float16") - with T.block([128, 128], "load_store") as [vi, vj]: - T.reads(A[vi, vj]) - T.writes(D[vi, vj]) - D.data[vi * 128 + vj] = T.load("float16", A.data, vi * 128 + vj) - with T.block([8, 8], "opaque") as [vi, vj]: - T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.evaluate( - T.tvm_load_matrix_sync( - B.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A.data, - vi * 2048 + vj * 16, + for i, j in T.grid(128, 128): + with T.block("load_store"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi, vj]) + T.writes(D[vi, vj]) + D.data[vi * 128 + vj] = T.load("float16", A.data, vi * 128 + vj) + for i, j in T.grid(8, 8): + with T.block("opaque"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.evaluate( + T.tvm_load_matrix_sync( + B.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A.data, + vi * 2048 + vj * 16, + 128, + 1, + dtype="handle", + ), 128, - 1, + "row_major", dtype="handle", - ), - 128, - "row_major", - dtype="handle", + ) + ) + for i, j in T.grid(8, 8): + with T.block("match_buffer"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + A0 = T.match_buffer( + A[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, + ) + C0 = T.match_buffer( + C[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, ) - ) - with T.block([8, 8], "match_buffer") as [vi, vj]: - T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C[vi * 16 : 
vi * 16 + 16, vj * 16 : vj * 16 + 16]) - A0 = T.match_buffer( - A[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - C0 = T.match_buffer( - C[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - T.evaluate( - T.tvm_load_matrix_sync( - C0.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset, - A0.strides[0], - 1, + T.evaluate( + T.tvm_load_matrix_sync( + C0.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A0.data, + A0.elem_offset, + A0.strides[0], + 1, + dtype="handle", + ), + 128, + "row_major", dtype="handle", - ), - 128, - "row_major", - dtype="handle", + ) ) - ) @T.prim_func @@ -147,15 +160,16 @@ def func_multi_consumer() -> None: C = T.alloc_buffer((128)) for i in T.grid(8): for j in T.grid(16): - with T.block([128], "A") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("A"): + vi = T.axis.S(128, i * 16 + j) A[vi] = 1.0 for j in T.grid(16): - with T.block([128], "B") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("B"): + vi = T.axis.S(128, i * 16 + j) B[vi] = A[vi] + 1.0 for i in T.grid(128): - with T.block([128], "C") as [vi]: + with T.block("C"): + vi = T.axis.S(128, i) C[vi] = A[vi] @@ -163,12 +177,18 @@ def func_multi_consumer() -> None: def func_multi_producer() -> None: A = T.alloc_buffer((128)) B = T.alloc_buffer((128)) - with T.block([128], "A0") as [vi]: - A[vi] = 1.0 - with T.block([128], "A1") as [vi]: - A[vi] = 2.0 - with T.block([128], "B") as [vi]: - B[vi] = A[vi] + for i in range(128): + with T.block("A0"): + vi = T.axis.S(128, i) + A[vi] = 1.0 + for i in range(128): + with T.block("A1"): + vi = T.axis.S(128, i) + A[vi] = 2.0 + for i in range(128): + with T.block("B"): + vi = T.axis.S(128, i) + B[vi] = A[vi] ########## Expected function after cache_read ########## @@ -181,14 
+201,22 @@ def cache_read_elementwise(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 128)) A_global = T.alloc_buffer((128, 128)) B_local = T.alloc_buffer((128, 128), scope="local") - with T.block([128, 128], "A_global") as [vi, vj]: - A_global[vi, vj] = A[vi, vj] - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A_global[vi, vj] * 2.0 - with T.block([128, 128], "B_local") as [vi, vj]: - B_local[vi, vj] = B[vi, vj] - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B_local[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("A_global"): + vi, vj = T.axis.remap("SS", [i, j]) + A_global[vi, vj] = A[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A_global[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B_local"): + vi, vj = T.axis.remap("SS", [i, j]) + B_local[vi, vj] = B[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B_local[vi, vj] + 1.0 @T.prim_func @@ -198,27 +226,33 @@ def cache_read_under_scope(b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128)) A_global = T.alloc_buffer((128, 128)) - with T.block([8, 8], "scope") as [i, j]: - A_local = T.alloc_buffer((128, 128), scope="local") - for x, y in T.grid(16, 16): - with T.block([128, 128], "A") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - A[vi, vj] = 1.0 - for x, y in T.grid(16, 16): - with T.block([128, 128], "A_local") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - A_local[vi, vj] = A[vi, vj] - for x, y in T.grid(16, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - B[vi, vj] = A_local[vi, vj] + 1.0 - with T.block([128, 128], "A_global") as [vi, vj]: - A_global[vi, vj] = A[vi, vj] - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A_global[vi, vj] * 2.0 + for i0, j0 in T.grid(8, 8): + with T.block("scope"): + i, j = 
T.axis.remap("SS", [i0, j0]) + A_local = T.alloc_buffer((128, 128), scope="local") + for x, y in T.grid(16, 16): + with T.block("A"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + A[vi, vj] = 1.0 + for x, y in T.grid(16, 16): + with T.block("A_local"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + A_local[vi, vj] = A[vi, vj] + for x, y in T.grid(16, 16): + with T.block("B"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + B[vi, vj] = A_local[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("A_global"): + vi, vj = T.axis.remap("SS", [i, j]) + A_global[vi, vj] = A[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A_global[vi, vj] * 2.0 @T.prim_func @@ -229,78 +263,86 @@ def cache_read_opaque_access(a: T.handle, b: T.handle, c: T.handle, d: T.handle) D = T.match_buffer(d, (128, 128), dtype="float16") A_global = T.alloc_buffer((128, 128), dtype="float16") - with T.block([128, 128], "A_global") as [vi, vj]: - A_global[vi, vj] = A[vi, vj] - with T.block([128, 128], "load_store") as [vi, vj]: - T.reads(A_global[vi, vj]) - T.writes(D[vi, vj]) - D.data[vi * 128 + vj] = T.load("float16", A_global.data, vi * 128 + vj) - with T.block([8, 8], "opaque") as [vi, vj]: - T.reads(A_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.evaluate( - T.tvm_load_matrix_sync( - B.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A_global.data, - vi * 2048 + vj * 16, + for i, j in T.grid(128, 128): + with T.block("A_global"): + vi, vj = T.axis.remap("SS", [i, j]) + A_global[vi, vj] = A[vi, vj] + for i, j in T.grid(128, 128): + with T.block("load_store"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A_global[vi, vj]) + T.writes(D[vi, vj]) + D.data[vi * 128 + vj] = T.load("float16", A_global.data, vi * 128 + vj) + for i, j in T.grid(8, 
8): + with T.block("opaque"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.evaluate( + T.tvm_load_matrix_sync( + B.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A_global.data, + vi * 2048 + vj * 16, + 128, + 1, + dtype="handle", + ), 128, - 1, + "row_major", dtype="handle", - ), - 128, - "row_major", - dtype="handle", + ) + ) + for i, j in T.grid(8, 8): + with T.block("match_buffer"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + A0 = T.match_buffer( + A_global[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, ) - ) - with T.block([8, 8], "match_buffer") as [vi, vj]: - T.reads(A_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - A0 = T.match_buffer( - A_global[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - C0 = T.match_buffer( - C[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - T.evaluate( - T.tvm_load_matrix_sync( - C0.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset, - A0.strides[0], - 1, + C0 = T.match_buffer( + C[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, + ) + T.evaluate( + T.tvm_load_matrix_sync( + C0.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A0.data, + A0.elem_offset, + A0.strides[0], + 1, + dtype="handle", + ), + 128, + "row_major", dtype="handle", - ), - 
128, - "row_major", - dtype="handle", + ) ) - ) @T.prim_func @@ -311,20 +353,21 @@ def cache_read_multi_consumer() -> None: A_global = T.alloc_buffer((128)) for i in T.grid(8): for j in T.grid(16): - with T.block([128], "A") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("A"): + vi = T.axis.S(128, i * 16 + j) A[vi] = 1.0 for j in T.grid(16): - with T.block([128], "A") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("A"): + vi = T.axis.S(128, i * 16 + j) A_global[vi] = A[vi] for j in T.grid(16): - with T.block([128], "B") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("B"): + vi = T.axis.S(128, i * 16 + j) B[vi] = A_global[vi] + 1.0 for i in T.grid(128): - with T.block([128], "C") as [vi]: + with T.block("C"): + vi = T.axis.S(128, i) C[vi] = A_global[vi] @@ -335,14 +378,22 @@ def continuous_cache_read(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 128)) B_shared = T.alloc_buffer((128, 128), scope="shared") B_local = T.alloc_buffer((128, 128), scope="local") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "B_shared") as [vi, vj]: - B_shared[vi, vj] = B[vi, vj] - with T.block([128, 128], "B_local") as [vi, vj]: - B_local[vi, vj] = B_shared[vi, vj] - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B_local[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B_shared"): + vi, vj = T.axis.remap("SS", [i, j]) + B_shared[vi, vj] = B[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B_local"): + vi, vj = T.axis.remap("SS", [i, j]) + B_local[vi, vj] = B_shared[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B_local[vi, vj] + 1.0 ########## Expected function after cache_write ########## @@ -355,14 +406,22 @@ def cache_write_elementwise(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 
128)) B_global = T.alloc_buffer((128, 128), scope="local") C_local = T.alloc_buffer((128, 128)) - with T.block([128, 128], "B_global") as [vi, vj]: - B_global[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = B_global[vi, vj] - with T.block([128, 128], "C_local") as [vi, vj]: - C_local[vi, vj] = B[vi, vj] + 1.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = C_local[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B_global"): + vi, vj = T.axis.remap("SS", [i, j]) + B_global[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = B_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C_local"): + vi, vj = T.axis.remap("SS", [i, j]) + C_local[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = C_local[vi, vj] @T.prim_func @@ -372,33 +431,39 @@ def cache_write_under_scope(b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128)) A_global = T.alloc_buffer((128, 128)) - with T.block([8, 8], "scope") as [i, j]: - A_local = T.alloc_buffer((128, 128), scope="local") - B_global = T.alloc_buffer((128, 128)) - for x, y in T.grid(16, 16): - with T.block([128, 128], "A_local") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - A_local[vi, vj] = 1.0 - for x, y in T.grid(16, 16): - with T.block([128, 128], "A") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - A_global[vi, vj] = A_local[vi, vj] - for x, y in T.grid(16, 16): - with T.block([128, 128], "B_global") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - B_global[vi, vj] = A_global[vi, vj] + 1.0 - for x, y in T.grid(16, 16): - with T.block([128, 128], "B_global") as [vi, vj]: - T.bind(vi, i * 16 + x) - T.bind(vj, j * 16 + y) - B[vi, vj] = B_global[vi, vj] - with T.block([128, 128], "A_global") as [vi, vj]: - A[vi, vj] = A_global[vi, vj] - with 
T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + for i0, j0 in T.grid(8, 8): + with T.block("scope"): + i, j = T.axis.remap("SS", [i0, j0]) + A_local = T.alloc_buffer((128, 128), scope="local") + B_global = T.alloc_buffer((128, 128)) + for x, y in T.grid(16, 16): + with T.block("A_local"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + A_local[vi, vj] = 1.0 + for x, y in T.grid(16, 16): + with T.block("A"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + A_global[vi, vj] = A_local[vi, vj] + for x, y in T.grid(16, 16): + with T.block("B_global"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + B_global[vi, vj] = A_global[vi, vj] + 1.0 + for x, y in T.grid(16, 16): + with T.block("B_global"): + vi = T.axis.S(128, i * 16 + x) + vj = T.axis.S(128, j * 16 + y) + B[vi, vj] = B_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("A_global"): + vi, vj = T.axis.remap("SS", [i, j]) + A[vi, vj] = A_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -411,83 +476,95 @@ def cache_write_opaque_access(a: T.handle, b: T.handle, c: T.handle, d: T.handle B_global = T.alloc_buffer((128, 128), dtype="float16") C_global = T.alloc_buffer((128, 128), dtype="float16") - with T.block([128, 128], "load_store") as [vi, vj]: - T.reads(A[vi, vj]) - T.writes(D_global[vi, vj]) - D_global.data[vi * 128 + vj] = T.load("float16", A.data, vi * 128 + vj) - with T.block([8, 8], "opaque") as [vi, vj]: - T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(B_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.evaluate( - T.tvm_load_matrix_sync( - B_global.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A.data, - vi * 2048 + vj * 16, + for i, j in T.grid(128, 128): + with T.block("load_store"): + vi, vj = T.axis.remap("SS", [i, j]) + 
T.reads(A[vi, vj]) + T.writes(D_global[vi, vj]) + D_global.data[vi * 128 + vj] = T.load("float16", A.data, vi * 128 + vj) + for i, j in T.grid(8, 8): + with T.block("opaque"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(B_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.evaluate( + T.tvm_load_matrix_sync( + B_global.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + T.type_annotation(dtype="float16"), + A.data, + vi * 2048 + vj * 16, + 128, + 1, + dtype="handle", + ), 128, - 1, + "row_major", dtype="handle", - ), - 128, - "row_major", - dtype="handle", + ) + ) + for i, j in T.grid(8, 8): + with T.block("match_buffer"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + T.writes(C_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) + A0 = T.match_buffer( + A[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, ) - ) - with T.block([8, 8], "match_buffer") as [vi, vj]: - T.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - T.writes(C_global[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16]) - A0 = T.match_buffer( - A[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - C0 = T.match_buffer( - C_global[ - vi * 16 : vi * 16 + 16, - vj * 16 : vj * 16 + 16, - ], - (16, 16), - "float16", - strides=[128, 1], - offset_factor=1, - ) - T.evaluate( - T.tvm_load_matrix_sync( - C0.data, - 16, - 16, - 16, - vi * 8 + vj, - T.tvm_access_ptr( - T.type_annotation(dtype="float16"), - A0.data, - A0.elem_offset, - A0.strides[0], - 1, + C0 = T.match_buffer( + C_global[ + vi * 16 : vi * 16 + 16, + vj * 16 : vj * 16 + 16, + ], + (16, 16), + "float16", + strides=[128, 1], + offset_factor=1, + ) + T.evaluate( + T.tvm_load_matrix_sync( + C0.data, + 16, + 16, + 16, + vi * 8 + vj, + T.tvm_access_ptr( + 
T.type_annotation(dtype="float16"), + A0.data, + A0.elem_offset, + A0.strides[0], + 1, + dtype="handle", + ), + 128, + "row_major", dtype="handle", - ), - 128, - "row_major", - dtype="handle", + ) ) - ) - with T.block([128, 128], "D") as [vi, vj]: - D[vi, vj] = D_global[vi, vj] - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = B_global[vi, vj] - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = C_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = D_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = B_global[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = C_global[vi, vj] @T.prim_func @@ -498,20 +575,21 @@ def cache_write_multi_consumer() -> None: A_global = T.alloc_buffer((128)) for i in T.grid(8): for j in T.grid(16): - with T.block([128], "A_global") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("A_global"): + vi = T.axis.S(128, i * 16 + j) A_global[vi] = 1.0 for j in T.grid(16): - with T.block([128], "A") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("A"): + vi = T.axis.S(128, i * 16 + j) A[vi] = A_global[vi] for j in T.grid(16): - with T.block([128], "B") as [vi]: - T.bind(vi, i * 16 + j) + with T.block("B"): + vi = T.axis.S(128, i * 16 + j) B[vi] = A[vi] + 1.0 for i in T.grid(128): - with T.block([128], "C") as [vi]: + with T.block("C"): + vi = T.axis.S(128, i) C[vi] = A[vi] @@ -522,14 +600,22 @@ def continuous_cache_write(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128)) B_shared = T.alloc_buffer((128, 128), scope="shared") B_local = T.alloc_buffer((128, 128), scope="local") - with T.block([128, 128], "B") as [vi, vj]: - B_local[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "B") as [vi, vj]: - B_shared[vi, vj] = B_local[vi, vj] - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = B_shared[vi, vj] - with 
T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B_local[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B_shared[vi, vj] = B_local[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = B_shared[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 ########## Testcases for cache_read ########## diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py index 5235664595ad..6e956e1ee688 100644 --- a/tests/python/unittest/test_tir_schedule_compute_at.py +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -32,10 +32,15 @@ def two_elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -45,12 +50,13 @@ def two_elementwise_after_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") for i in range(0, 128): for ax0, ax1 in T.grid(1, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i + ax0) - T.bind(vj, ax1) + with T.block("B"): + vi = T.axis.S(128, i + ax0) + vj = T.axis.S(128, ax1) B[vi, vj] = A[vi, vj] * 2.0 for j in range(0, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = 
T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 @@ -59,22 +65,26 @@ def blockized_1(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128], "float32") B = T.alloc_buffer([128, 128], "float32") C = T.match_buffer(c, [128, 128], "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([8, 8], "C_outer") as [vi_o, vj_o]: - T.reads([B[ - vi_o * 16 : vi_o * 16 + 16, - vj_o * 16 : vj_o * 16 + 16, - ]]) - T.writes([C[ - vi_o * 16 : vi_o * 16 + 16, - vj_o * 16 : vj_o * 16 + 16 - ]]) - for i_i, j_i in T.grid(16, 16): - with T.block([128, 128], "C_inner") as [vi, vj]: - T.bind(vi, vi_o * 16 + i_i) - T.bind(vj, vj_o * 16 + j_i) - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(8, 8): + with T.block("C_outer"): + vi_o, vj_o = T.axis.remap("SS", [i, j]) + T.reads([B[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16, + ]]) + T.writes([C[ + vi_o * 16 : vi_o * 16 + 16, + vj_o * 16 : vj_o * 16 + 16 + ]]) + for i_i, j_i in T.grid(16, 16): + with T.block("C_inner"): + vi = T.axis.S(128, vi_o * 16 + i_i) + vj = T.axis.S(128, vj_o * 16 + j_i) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -84,13 +94,12 @@ def blockized_after_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], "float32") for i0_0, i1_0 in T.grid(8, 8): for ax0, ax1 in T.grid(16, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i0_0 * 16 + ax0) - T.bind(vj, i1_0 * 16 + ax1) + with T.block("B"): + vi = T.axis.S(128, i0_0 * 16 + ax0) + vj = T.axis.S(128, i1_0 * 16 + ax1) B[vi, vj] = A[vi, vj] * 2.0 - with T.block([8, 8], "C_outer") as [vi_o, vj_o]: - T.bind(vi_o, i0_0) - T.bind(vj_o, i1_0) + with T.block("C_outer"): + vi_o, vj_o = T.axis.remap("SS", [i0_0, i1_0]) T.reads([B[ vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16, @@ -100,9 +109,9 @@ def blockized_after_compute_at(a: 
T.handle, c: T.handle) -> None: vj_o * 16 : vj_o * 16 + 16 ]]) for i0_1, i1_1 in T.grid(16, 16): - with T.block([128, 128], "C_inner") as [vi, vj]: - T.bind(vi, vi_o * 16 + i0_1) - T.bind(vj, vj_o * 16 + i1_1) + with T.block("C_inner"): + vi = T.axis.S(128, vi_o * 16 + i0_1) + vj = T.axis.S(128, vj_o * 16 + i1_1) C[vi, vj] = B[vi, vj] + 1.0 @@ -112,9 +121,8 @@ def blockized_2(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer([128, 128], "float32") C = T.match_buffer(c, [128, 128], "float32") for i_o, j_o in T.grid(8, 8): - with T.block([8, 8], "B_outer") as [vio, vjo]: - T.bind(vio, i_o) - T.bind(vjo, j_o) + with T.block("B_outer"): + vio, vjo = T.axis.remap("SS", [i_o, j_o]) T.reads([A[ vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16, @@ -124,14 +132,14 @@ def blockized_2(a: T.handle, c: T.handle) -> None: vjo * 16 : vjo * 16 + 16 ]]) for i_i, j_i in T.grid(16, 16): - with T.block([128, 128], "B_inner") as [vi, vj]: - T.bind(vi, vio * 16 + i_i) - T.bind(vj, vjo * 16 + j_i) + with T.block("B_inner"): + vi = T.axis.S(128, vio * 16 + i_i) + vj = T.axis.S(128, vjo * 16 + j_i) B[vi, vj] = A[vi, vj] * 2.0 for i_o, j_o, i_i, j_i in T.grid(4, 4, 32, 32): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i_o * 32 + i_i) - T.bind(vj, j_o * 32 + j_i) + with T.block("C"): + vi = T.axis.S(128, i_o * 32 + i_i) + vj = T.axis.S(128, j_o * 32 + j_i) C[vi, vj] = B[vi, vj] + 1.0 @@ -141,9 +149,8 @@ def blockized_2_after_reverse_compute_at(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer([128, 128], "float32") C = T.match_buffer(c, [128, 128], "float32") for i_o, j_o in T.grid(8, 8): - with T.block([8, 8], "B_outer") as [vio, vjo]: - T.bind(vio, i_o) - T.bind(vjo, j_o) + with T.block("B_outer"): + vio, vjo = T.axis.remap("SS", [i_o, j_o]) T.reads([A[ vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16, @@ -153,14 +160,14 @@ def blockized_2_after_reverse_compute_at(a: T.handle, c: T.handle) -> None: vjo * 16 : vjo * 16 + 16 ]]) for i_i, j_i in T.grid(16, 16): - with 
T.block([128, 128], "B_inner") as [vi, vj]: - T.bind(vi, vio * 16 + i_i) - T.bind(vj, vjo * 16 + j_i) + with T.block("B_inner"): + vi = T.axis.S(128, vio * 16 + i_i) + vj = T.axis.S(128, vjo * 16 + j_i) B[vi, vj] = A[vi, vj] * 2.0 for ax0, ax1 in T.grid(16, 16): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i_o * 16 + ax0) - T.bind(vj, j_o * 16 + ax1) + with T.block("C"): + vi = T.axis.S(128, i_o * 16 + ax0) + vj = T.axis.S(128, j_o * 16 + ax1) T.reads([B[vi, vj]]) T.writes([C[vi, vj]]) C[vi, vj] = B[vi, vj] + 1.0 @@ -173,9 +180,9 @@ def blockized_2_after_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], "float32") for i_o, j_o in T.grid(4, 4): for ax0, ax1 in T.grid(2, 2): - with T.block([8, 8], "blockized_B") as [vio, vjo]: - T.bind(vio, i_o * 2 + ax0) - T.bind(vjo, j_o * 2 + ax1) + with T.block("blockized_B"): + vio = T.axis.S(8, i_o * 2 + ax0) + vjo = T.axis.S(8, j_o * 2 + ax1) T.reads([A[ vio * 16 : vio * 16 + 16, vjo * 16 : vjo * 16 + 16, @@ -185,14 +192,14 @@ def blockized_2_after_compute_at(a: T.handle, c: T.handle) -> None: vjo * 16 : vjo * 16 + 16, ]]) for i_i, j_i in T.grid(16, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, vio * 16 + i_i) - T.bind(vj, vjo * 16 + j_i) + with T.block("B"): + vi = T.axis.S(128, vio * 16 + i_i) + vj = T.axis.S(128, vjo * 16 + j_i) B[vi, vj] = A[vi, vj] * 2.0 for i_i, j_i in T.grid(32, 32): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i_o * 32 + i_i) - T.bind(vj, j_o * 32 + j_i) + with T.block("C"): + vi = T.axis.S(128, i_o * 32 + i_i) + vj = T.axis.S(128, j_o * 32 + j_i) C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -205,18 +212,28 @@ def cuda_matmul_0(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], 
"A_shared") as [v0, v1]: - A_shared[v0, v1] = A[v0, v1] - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - A_shared_local[v0, v1] = A_shared[v0, v1] - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - B_shared_local[v0, v1] = B_shared[v0, v1] - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - with T.init(): - C_local[vi, vj] = 0.0 - C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] + for i, j in T.grid(2048, 2048): + with T.block("A_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared[v0, v1] = A[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared_local[v0, v1] = B_shared[v0, v1] + for i, j, k in T.grid(2048, 2048, 2048): + with T.block("C"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C_local[vi, vj] = 0.0 + C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for by in T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -224,9 +241,9 @@ def cuda_matmul_0(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for ty in T.thread_binding(0, 8, thread = "threadIdx.y"): for tx in T.thread_binding(0, 8, thread = "threadIdx.x"): for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [v0_4, v1_4]: - T.bind(v0_4, by * 64 + vy * 32 + ty * 4 + i) - T.bind(v1_4, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + v0_4 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + 
v1_4 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[v0_4, v1_4] = C_local[v0_4, v1_4] @@ -240,14 +257,22 @@ def cuda_matmul_0_after_compute_at(a: T.handle, b: T.handle, c: T.handle) -> Non A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], "A_shared") as [v0, v1]: - A_shared[v0, v1] = A[v0, v1] - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - A_shared_local[v0, v1] = A_shared[v0, v1] - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - B_shared_local[v0, v1] = B_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared[v0, v1] = A[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared_local[v0, v1] = B_shared[v0, v1] for by in T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -255,17 +280,17 @@ def cuda_matmul_0_after_compute_at(a: T.handle, b: T.handle, c: T.handle) -> Non for ty in T.thread_binding(0, 8, thread = "threadIdx.y"): for tx in T.thread_binding(0, 8, thread = "threadIdx.x"): for i, j, k in T.grid(4, 4, 2048): - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k) + with T.block("C"): + vi = T.axis.S(2048, by * 64 
+ vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k) with T.init(): C_local[vi, vj] = 0.0 C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [vi, vj]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[vi, vj] = C_local[vi, vj] @@ -279,14 +304,22 @@ def cuda_matmul_1(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], "A_shared") as [v0, v1]: - A_shared[v0, v1] = A[v0, v1] - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - A_shared_local[v0, v1] = A_shared[v0, v1] - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - B_shared_local[v0, v1] = B_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared[v0, v1] = A[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared_local[v0, v1] = A_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared_local[v0, v1] = B_shared[v0, v1] for by in T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -296,17 +329,17 
@@ def cuda_matmul_1(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for k_0 in T.serial(0, 256): for k_1 in T.unroll(0, 8): for _, i, j in T.grid(1, 4, 4): - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k_0 * 8 + k_1) + with T.block("C"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k_0 * 8 + k_1) with T.init(): C_local[vi, vj] = 0.0 C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [vi, vj]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[vi, vj] = C_local[vi, vj] @@ -320,12 +353,18 @@ def cuda_matmul_2(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], "A_shared") as [v0, v1]: - A_shared[v0, v1] = A[v0, v1] - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - B_shared_local[v0, v1] = B_shared[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("A_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared[v0, v1] = A[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared_local"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared_local[v0, v1] = B_shared[v0, v1] for by in 
T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -335,22 +374,22 @@ def cuda_matmul_2(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for k_0 in T.serial(0, 256): for k_1 in T.unroll(0, 8): for i, j in T.grid(1, 4): - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - T.bind(v0, k_0 * 8 + k_1 + i) - T.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + with T.block("A_shared_local"): + v0 = T.axis.S(2048, k_0 * 8 + k_1 + i) + v1 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + j) A_shared_local[v0, v1] = A_shared[v0, v1] for _, i, j in T.grid(1, 4, 4): - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k_0 * 8 + k_1) + with T.block("C"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k_0 * 8 + k_1) with T.init(): C_local[vi, vj] = T.float32(0) C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [v0, v1]: - T.bind(v0, by * 64 + vy * 32 + ty * 4 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + v0 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[v0, v1] = C_local[v0, v1] @@ -364,10 +403,14 @@ def cuda_matmul_3(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], "A_shared") as [v0, v1]: - A_shared[v0, v1] = A[v0, v1] - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] + for i, j 
in T.grid(2048, 2048): + with T.block("A_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + A_shared[v0, v1] = A[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] for by in T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -377,27 +420,27 @@ def cuda_matmul_3(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for k0 in T.serial(0, 256): for k1 in T.unroll(0, 8): for i, j in T.grid(1, 4): - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + with T.block("A_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + j) A_shared_local[v0, v1] = A_shared[v0, v1] for i, j in T.grid(1, 4): - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("B_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) B_shared_local[v0, v1] = B_shared[v0, v1] for _, i, j in T.grid(1, 4, 4): - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k0 * 8 + k1) + with T.block("C"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k0 * 8 + k1) with T.init(): C_local[vi, vj] = T.float32(0) C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [v0, v1]: - T.bind(v0, by * 64 + vy * 32 + ty * 4 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + v0 = T.axis.S(2048, by * 64 + vy * 32 + ty * 
4 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[v0, v1] = C_local[v0, v1] @@ -411,8 +454,10 @@ def cuda_matmul_4(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") - with T.block([2048, 2048], "B_shared") as [v0, v1]: - B_shared[v0, v1] = B[v0, v1] + for i, j in T.grid(2048, 2048): + with T.block("B_shared"): + v0, v1 = T.axis.remap("SS", [i, j]) + B_shared[v0, v1] = B[v0, v1] for by in T.thread_binding(0, 32, thread = "blockIdx.y"): for bx in T.thread_binding(0, 32, thread = "blockIdx.x"): for vy in T.thread_binding(0, 2, thread = "vthread.y"): @@ -421,33 +466,33 @@ def cuda_matmul_4(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for tx in T.thread_binding(0, 8, thread = "threadIdx.x"): for k0 in T.serial(0, 256): for i, j in T.grid(8, 64): - with T.block([2048, 2048], "A_shared") as [v0, v1]: - T.bind(v0, k0 * 8 + i) - T.bind(v1, by * 64 + j) + with T.block("A_shared"): + v0 = T.axis.S(2048, k0 * 8 + i) + v1 = T.axis.S(2048, by * 64 + j) A_shared[v0, v1] = A[v0, v1] for k1 in T.unroll(0, 8): for i, j in T.grid(1, 4): - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + with T.block("A_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + j) A_shared_local[v0, v1] = A_shared[v0, v1] for i, j in T.grid(1, 4): - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("B_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) B_shared_local[v0, v1] = B_shared[v0, v1] for _, i, j in T.grid(1, 4, 4): - with T.block([2048, 2048, T.reduce_axis(0, 
2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k0 * 8 + k1) + with T.block("C"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k0 * 8 + k1) with T.init(): C_local[vi, vj] = 0.0 C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [v0, v1]: - T.bind(v0, by * 64 + vy * 32 + ty * 4 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + v0 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[v0, v1] = C_local[v0, v1] @@ -469,38 +514,38 @@ def cuda_matmul_5(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: dis for tx in T.thread_binding(0, 8, thread = "threadIdx.x"): for k0 in T.serial(0, 256): for i, j in T.grid(8, 64): - with T.block([2048, 2048], "A_shared") as [v0, v1]: - T.bind(v0, k0 * 8 + i) - T.bind(v1, by * 64 + j) + with T.block("A_shared"): + v0 = T.axis.S(2048, k0 * 8 + i) + v1 = T.axis.S(2048, by * 64 + j) A_shared[v0, v1] = A[v0, v1] for i, j in T.grid(8, 64): - with T.block([2048, 2048], "B_shared") as [v0, v1]: - T.bind(v0, k0 * 8 + i) - T.bind(v1, bx * 64 + j) + with T.block("B_shared"): + v0 = T.axis.S(2048, k0 * 8 + i) + v1 = T.axis.S(2048, bx * 64 + j) B_shared[v0, v1] = B[v0, v1] for k1 in T.unroll(0, 8): for i, j in T.grid(1, 4): - with T.block([2048, 2048], "A_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, by * 64 + vy * 32 + ty * 4 + j) + with T.block("A_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + j) A_shared_local[v0, v1] = A_shared[v0, v1] for i, j in T.grid(1, 4): - with T.block([2048, 2048], "B_shared_local") as [v0, v1]: - T.bind(v0, k0 * 8 + k1 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with 
T.block("B_shared_local"): + v0 = T.axis.S(2048, k0 * 8 + k1 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) B_shared_local[v0, v1] = B_shared[v0, v1] for _, i, j in T.grid(1, 4, 4): - with T.block([2048, 2048, T.reduce_axis(0, 2048)], "C") as [vi, vj, vk]: - T.bind(vi, by * 64 + vy * 32 + ty * 4 + i) - T.bind(vj, bx * 64 + vx * 32 + tx * 4 + j) - T.bind(vk, k0 * 8 + k1) + with T.block("C"): + vi = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + vj = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) + vk = T.axis.R(2048, k0 * 8 + k1) with T.init(): C_local[vi, vj] = 0.0 C_local[vi, vj] = C_local[vi, vj] + A_shared_local[vk, vi] * B_shared_local[vk, vj] for i, j in T.grid(4, 4): - with T.block([2048, 2048], "C_local") as [v0, v1]: - T.bind(v0, by * 64 + vy * 32 + ty * 4 + i) - T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) + with T.block("C_local"): + v0 = T.axis.S(2048, by * 64 + vy * 32 + ty * 4 + i) + v1 = T.axis.S(2048, bx * 64 + vx * 32 + tx * 4 + j) C[v0, v1] = C_local[v0, v1] @@ -510,12 +555,14 @@ def tiled(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer([128, 128], "float32") C = T.match_buffer(c, [128, 128], "float32") for i_0, j_0, i_1, j_1 in T.grid(8, 8, 16, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i_0 * 16 + i_1) - T.bind(vj, j_0 * 16 + j_1) + with T.block("B"): + vi = T.axis.S(128, i_0 * 16 + i_1) + vj = T.axis.S(128, j_0 * 16 + j_1) B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -525,14 +572,14 @@ def tiled_after_reverse_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], "float32") for i_0, j_0, i_1 in T.grid(8, 8, 16): for j_1 in T.serial(0, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i_0 * 16 + i_1) - T.bind(vj, j_0 * 16 + j_1) + with T.block("B"): + vi = T.axis.S(128, i_0 * 16 + 
i_1) + vj = T.axis.S(128, j_0 * 16 + j_1) B[vi, vj] = A[vi, vj] * 2.0 for j_1 in T.serial(0, 16): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i_0 * 16 + i_1) - T.bind(vj, j_0 * 16 + j_1) + with T.block("C"): + vi = T.axis.S(128, i_0 * 16 + i_1) + vj = T.axis.S(128, j_0 * 16 + j_1) C[vi, vj] = B[vi, vj] + 1.0 @@ -544,17 +591,15 @@ def factorized(a: T.handle, b: T.handle) -> None: for j in T.thread_binding(0, 16, thread = "blockIdx.x"): for i_o in T.thread_binding(0, 4, thread = "threadIdx.x"): for i_i, k in T.grid(4, 16): - with T.block([16, 16, T.reduce_axis(0, 16)], "B_rf") as [vi, vj, vk]: - T.bind(vi, i_o * 4 + i_i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B_rf"): + vi = T.axis.S(16, i_o * 4 + i_i) + vj, vk = T.axis.remap("SR", [j, k]) with T.init(): B_rf_local[vi, vj] = 0.0 B_rf_local[vi, vj] = B_rf_local[vi, vj] + A[vj, vi, vk] for i, k in T.grid(16, 16): - with T.block([16, T.reduce_axis(0, 16)], "B") as [vi, vk]: - T.bind(vi, i) - T.bind(vk, k) + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + B_rf_local[vk, vi] @@ -568,17 +613,17 @@ def factorized_after_reverse_compute_at(a: T.handle, b: T.handle) -> None: for j in T.thread_binding(0, 16, thread = "blockIdx.x"): for i_o in T.thread_binding(0, 4, thread = "threadIdx.x"): for i_i, k in T.grid(4, 16): - with T.block([16, 16, T.reduce_axis(0, 16)], "B_rf") as [vi, vj, vk]: - T.bind(vi, i_o * 4 + i_i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B_rf"): + vi = T.axis.S(16, i_o * 4 + i_i) + vj = T.axis.S(16, j) + vk = T.axis.R(16, k) with T.init(): B_rf_local[vi, vj] = 0.0 B_rf_local[vi, vj] = B_rf_local[vi, vj] + A[vj, vi, vk] for k in T.serial(0, 4): - with T.block([16, T.reduce_axis(0, 16)], "B") as [vi, vk]: - T.bind(vi, j) - T.bind(vk, i_o * 4 + k) + with T.block("B"): + vi = T.axis.S(16, j) + vk = T.axis.R(16, i_o * 4 + k) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + B_rf_local[vk, vi] @@ -591,17 +636,19 @@ def 
fail_subtree_compact_dataflow(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") for i in range(0, 128): for j in range(0, 64): - with T.block([128, 128], "B_0") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B_0"): + vi = T.axis.S(128, i) + vj = T.axis.S(128, j) B[vi, vj] = A[vi, vj] * 2.0 for j in range(0, 64): - with T.block([128, 128], "B_1") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j + 64) + with T.block("B_1"): + vi = T.axis.S(128, i) + vj = T.axis.S(128, j + 64) B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -611,13 +658,16 @@ def fail_all_consumers_under_loop(a: T.handle, c: T.handle, d: T.handle) -> None C = T.match_buffer(c, (128, 128), "float32") D = T.match_buffer(d, (128, 128), "float32") for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "C") as [vi, vj]: + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "D") as [vi, vj]: + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) D[vi, vj] = B[vi, vj] + 1.0 @@ -628,13 +678,16 @@ def fail_all_producers_under_loop(a: T.handle, d: T.handle) -> None: C = T.alloc_buffer((128, 128), "float32") D = T.match_buffer(d, (128, 128), "float32") for i, j in T.grid(128, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "C") as [vi, vj]: + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = A[vi, vj] + 1.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "D") 
as [vi, vj]: + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) D[vi, vj] = B[vi, vj] + C[vi, vj] @@ -644,10 +697,12 @@ def read_out_of_bound(a: T.handle, c:T.handle) -> None: B = T.alloc_buffer([16], "float32") C = T.match_buffer(c, [16], "float32") for i in T.serial(0, 16): - with T.block([16], "B") as [v]: + with T.block("B"): + v = T.axis.S(16, i) B[v] = A[v] for j in T.serial(0, 16): - with T.block([16], "C") as [v]: + with T.block("C"): + v = T.axis.S(16, j) T.reads(B[v : v + 2]) C[v] = T.if_then_else(v < 15, T.max(B[v], B[v + 1]), B[v], dtype="float32") @@ -659,11 +714,11 @@ def read_out_of_bound_after_compute_at(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [16], "float32") for j in T.serial(0, 16): for i in T.serial(0, T.min(1, 15 - j) + 1): - with T.block([16], "B") as [v]: - T.bind(v, j + i) + with T.block("B"): + v = T.axis.S(16, j + i) B[v] = A[v] - with T.block([16], "C") as [v]: - T.bind(v, j) + with T.block("C"): + v = T.axis.S(16, j) T.reads([B[v : v + 2]]) C[v] = T.if_then_else(v < 15, T.max(B[v], B[v + 1]), B[v], dtype="float32") diff --git a/tests/python/unittest/test_tir_schedule_compute_inline.py b/tests/python/unittest/test_tir_schedule_compute_inline.py index f9049f6da732..617c75b75cd9 100644 --- a/tests/python/unittest/test_tir_schedule_compute_inline.py +++ b/tests/python/unittest/test_tir_schedule_compute_inline.py @@ -31,10 +31,14 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -43,12 +47,18 @@ def 
elementwise_multi_producer_consumer(a: T.handle, c: T.handle, d: T.handle) - B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) D = T.match_buffer(d, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 # B has two consumers - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 - with T.block([128, 128], "D") as [vi, vj]: - D[vi, vj] = B[vi, vj] + 2.0 + C[vi, vj] # D has two producers + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 # B has two consumers + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = B[vi, vj] + 2.0 + C[vi, vj] # D has two producers @T.prim_func @@ -56,10 +66,14 @@ def elementwise_multi_consumer_inlined(a: T.handle, c: T.handle, d: T.handle) -> A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) D = T.match_buffer(d, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + 1.0 - with T.block([128, 128], "D") as [vi, vj]: - D[vi, vj] = A[vi, vj] * 2.0 + 2.0 + C[vi, vj] + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + for i, j in T.grid(128, 128): + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = A[vi, vj] * 2.0 + 2.0 + C[vi, vj] @T.prim_func @@ -67,18 +81,24 @@ def elementwise_standalone(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + 
for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] + 1.0 @T.prim_func def elementwise_standalone_dce(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] + 1.0 @T.prim_func @@ -88,14 +108,12 @@ def elementwise_under_loop(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 128)) for i in T.serial(0, 128): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for j in T.serial(0, 128): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = B[vi, vj] + 1.0 @@ -103,8 +121,10 @@ def elementwise_under_loop(a: T.handle, c: T.handle) -> None: def elementwise_inlined(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 @T.prim_func @@ -113,11 +133,15 @@ def fail_multi_reader_writer(a: T.handle, d: T.handle) -> None: B = T.alloc_buffer((128, 128)) C = T.alloc_buffer((128, 128)) D = T.match_buffer(d, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - C[vi, vj] = A[vi, vj] + 2.0 - with T.block([128, 128], "C") as [vi, vj]: - D[vi, vj] = B[vi, vj] + C[vi, vj] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + C[vi, vj] = A[vi, vj] + 2.0 + for i, j in T.grid(128, 128): + with 
T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = B[vi, vj] + C[vi, vj] @T.prim_func @@ -125,18 +149,24 @@ def elementwise_multi_reverse_loads(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = (B[vi, vj] + 1.0) * (B[vi, vj] * 2.0) + 3.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = (B[vi, vj] + 1.0) * (B[vi, vj] * 2.0) + 3.0 @T.prim_func def elementwise_multi_reverse_loads_inlined(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - C[vi, vj] = (A[vi, vj] * 2.0 + 1.0) * (A[vi, vj] * 2.0 * 2.0) + 3.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = (A[vi, vj] * 2.0 + 1.0) * (A[vi, vj] * 2.0 * 2.0) + 3.0 @T.prim_func @@ -144,12 +174,16 @@ def opaque_access_load(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - T.reads(B[0:128, 0:128]) - T.writes(C[0:128, 0:128]) - C[vi, vj] = T.load("float32", B.data, vi * 128 + vj) + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[0:128, 0:128]) + T.writes(C[0:128, 0:128]) + C[vi, vj] = T.load("float32", B.data, vi * 128 + vj) + 1.0 @T.prim_func @@ -157,13 +191,17 @@ def opaque_access_store(a: 
T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - T.reads(B[0:128, 0:128]) - T.writes(C[0:128, 0:128]) - T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0) - C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[0:128, 0:128]) + T.writes(C[0:128, 0:128]) + T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0) + C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0 @T.prim_func @@ -171,11 +209,15 @@ def buffer_matched(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - Bb = T.match_buffer(B[vi : vi + 1, vj], (1, 1)) - C[vi, vj] = Bb[0, 0] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + Bb = T.match_buffer(B[vi : vi + 1, vj], (1, 1)) + C[vi, vj] = Bb[0, 0] + 1.0 @T.prim_func @@ -183,10 +225,13 @@ def elementwise_predicate(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 for i, j in T.grid(128, 128): - with T.block([128, 128], "C") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) 
T.where(B[i, j] < 10.0) C[vi, vj] = B[vi, vj] + 1.0 @@ -196,7 +241,8 @@ def elementwise_predicate_inlined(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "C") as [vi, vj]: + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) T.where(A[i, j] * 2.0 < 10.0) C[vi, vj] = A[vi, vj] * 2.0 + 1.0 @@ -206,18 +252,24 @@ def elementwise_multi_loads(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 126], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + B[vi, vj + 1] + B[vi, vj + 2] + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + B[vi, vj + 1] + B[vi, vj + 2] @T.prim_func def elementwise_multi_loads_inlined(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 126], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + A[vi, vj + 1] * 2.0 + A[vi, vj + 2] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + A[vi, vj + 1] * 2.0 + A[vi, vj + 2] * 2.0 # pylint: enable=no-member,invalid-name,unused-variable diff --git a/tests/python/unittest/test_tir_schedule_error.py b/tests/python/unittest/test_tir_schedule_error.py index 7a9c8e01d355..ad6a1931bb0b 100644 --- a/tests/python/unittest/test_tir_schedule_error.py +++ b/tests/python/unittest/test_tir_schedule_error.py @@ -31,10 +31,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], 
"init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) - for k in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + for k in range(128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] diff --git a/tests/python/unittest/test_tir_schedule_for_kind.py b/tests/python/unittest/test_tir_schedule_for_kind.py index 60269ac01c14..9075e93b9d45 100644 --- a/tests/python/unittest/test_tir_schedule_for_kind.py +++ b/tests/python/unittest/test_tir_schedule_for_kind.py @@ -31,9 +31,10 @@ def element_wise(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) - - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -42,9 +43,8 @@ def element_wise_parallelized(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) for i0 in T.parallel(0, 128): for i1 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, i1) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i0, i1]) B[vi, vj] = A[vi, vj] * 2.0 @@ -54,9 +54,8 @@ def element_wise_i_bound(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128)) for i0 in T.thread_binding(0, 128, thread="threadIdx.x"): for i1 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, i1) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i0, i1]) B[vi, vj] = A[vi, vj] * 2.0 @@ -67,14 +66,13 @@ def element_wise_compute_at_split(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 128)) for i in T.serial(0, 128): for j0 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j0) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j0]) 
B[vi, vj] = A[vi, vj] * 2.0 for j1o, j1i in T.grid(32, 4): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j1o * 4 + j1i) + with T.block("C"): + vi = T.axis.S(128, i) + vj = T.axis.S(128, j1o * 4 + j1i) C[vi, vj] = B[vi, vj] + 1.0 @@ -85,15 +83,14 @@ def element_wise_compute_at_split_vectorized(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((128, 128)) for i in T.serial(0, 128): for j0 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j0) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j0]) B[vi, vj] = A[vi, vj] * 2.0 for j1o in T.serial(0, 32): for j1i in T.vectorized(0, 4): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j1o * 4 + j1i) + with T.block("C"): + vi = T.axis.S(128, i) + vj = T.axis.S(128, j1o * 4 + j1i) C[vi, vj] = B[vi, vj] + 1.0 @@ -102,10 +99,10 @@ def element_wise_split_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128]) B = T.match_buffer(b, [128, 128]) for i, j_0, j_1 in T.grid(128, 13, 10): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): T.where(j_0 * 10 + j_1 < 128) - T.bind(vi, i) - T.bind(vj, j_0 * 10 + j_1) + vi = T.axis.S(128, i) + vj = T.axis.S(128, j_0 * 10 + j_1) B[vi, vj] = A[vi, vj] * 2.0 @@ -116,10 +113,10 @@ def element_wise_split_predicate_parallelized(a: T.handle, b: T.handle) -> None: for i in T.serial(0, 128): for j_0 in T.parallel(0, 13): for j_1 in T.serial(0, 10): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): T.where(j_0 * 10 + j_1 < 128) - T.bind(vi, i) - T.bind(vj, j_0 * 10 + j_1) + vi = T.axis.S(128, i) + vj = T.axis.S(128, j_0 * 10 + j_1) B[vi, vj] = A[vi, vj] * 2.0 @@ -129,10 +126,10 @@ def element_wise_split_predicate_vectorized(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128, 128]) for i in T.vectorized(0, 128): for j_0, j_1 in T.grid(13, 10): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): T.where(j_0 * 10 + j_1 < 
128) - T.bind(vi, i) - T.bind(vj, j_0 * 10 + j_1) + vi = T.axis.S(128, i) + vj = T.axis.S(128, j_0 * 10 + j_1) B[vi, vj] = A[vi, vj] * 2.0 @@ -143,15 +140,14 @@ def element_wise_compute_at_split_j0_j1o_bound(a: T.handle, c: T.handle) -> None B = T.alloc_buffer((128, 128)) for i in T.serial(0, 128): for j0 in T.thread_binding(0, 128, thread="threadIdx.x"): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j0) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j0]) B[vi, vj] = A[vi, vj] * 2.0 for j1o in T.thread_binding(0, 32, thread="threadIdx.x"): for j1i in T.serial(0, 4): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j1o * 4 + j1i) + with T.block("C"): + vi = T.axis.S(128, i) + vj = T.axis.S(128, j1o * 4 + j1i) C[vi, vj] = B[vi, vj] + 1.0 @@ -161,10 +157,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128, T.reduce_axis(0, 128)], "C") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("C"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -172,10 +170,12 @@ def rowsum(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with T.init(): - B[vi] = 0.0 - B[vi] = B[vi] + A[vi, vk] + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi] = 0.0 + B[vi] = B[vi] + A[vi, vk] @T.prim_func @@ -184,9 +184,8 @@ def rowsum_unrolled(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128,)) for i0 in T.unroll(0, 128): for i1 in T.serial(0, 128): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, i0) - T.bind(vk, i1) + with 
T.block("B"): + vi, vk = T.axis.remap("SR", [i0, i1]) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -198,9 +197,9 @@ def rowsum_not_quasi_affine(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128,)) for i, k in T.grid(128, 16): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, i) - T.bind(vk, T.floordiv(k * k, 2)) + with T.block("B"): + vi = T.axis.S(128, i) + vk = T.axis.R(128, T.floordiv(k * k, 2)) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -211,10 +210,12 @@ def rowsum_not_compact_data_flow(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with T.init(): - B[vk] = 0.0 - B[vk] = B[vk] + A[vi, vk] + for i, k in T.grid(128, 16): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vk] = 0.0 + B[vk] = B[vk] + A[vi, vk] @T.prim_func @@ -223,9 +224,8 @@ def rowsum_cross_thread_reduction(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128,)) for i0 in T.serial(0, 128): for i1 in T.thread_binding(0, 128, thread="threadIdx.x"): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, i0) - T.bind(vk, i1) + with T.block("B"): + vi, vk = T.axis.remap("SR", [i0, i1]) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -235,7 +235,7 @@ def rowsum_cross_thread_reduction(a: T.handle, b: T.handle) -> None: def opaque_block(a: T.handle) -> None: A = T.match_buffer(a, (16,)) for i in T.serial(0, 15): - with T.block([], "opaque"): + with T.block("opaque"): A[i + 1] = A[i + 1] + A[i] diff --git a/tests/python/unittest/test_tir_schedule_reduction.py b/tests/python/unittest/test_tir_schedule_reduction.py index 8460b5cf3e66..e158f6a026e1 100644 --- a/tests/python/unittest/test_tir_schedule_reduction.py +++ b/tests/python/unittest/test_tir_schedule_reduction.py @@ -32,18 +32,17 @@ def rowsum_blockized(a: T.handle, b: T.handle) -> None: B = 
T.match_buffer(b, [32, 4]) A = T.match_buffer(a, [32, 4, 128]) for i0, i2_0 in T.grid(32, 16): - with T.block([32, T.reduce_axis(0, 16)], "blockized_B") as [io, ko]: - T.bind(io, i0) - T.bind(ko, i2_0) + with T.block("blockized_B"): + io, ko = T.axis.remap("SR", [i0, i2_0]) with T.init(): for i1 in T.serial(0, 4): - with T.block([4], "B_init") as [ii_init]: - T.bind(ii_init, i1) + with T.block("B_init"): + ii_init = T.axis.S(4, i1) B[io, ii_init] = 0.0 for i1_1, i2_1 in T.grid(4, 8): - with T.block([4, T.reduce_axis(0, 128)], "B") as [ii, k]: - T.bind(ii, i1_1) - T.bind(k, ko * 8 + i2_1) + with T.block("B"): + ii = T.axis.S(4, i1_1) + k = T.axis.R(128, ko * 8 + i2_1) B[io, ii] = B[io, ii] + A[io, ii, k] @@ -52,11 +51,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128]) B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -65,11 +65,15 @@ def matmul_decompose0(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([128, 128], "init") as [vi, vj]: - C[vi, vj] = 0.0 + for i, j in T.grid(128, 128): + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = 0.0 - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -78,16 +82,19 @@ def matmul_decompose1(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, 
[32, 4], elem_offset=0, align=128, offset_factor=1) for i0 in T.serial(0, 32): - with T.block([32], "blockized_B_init") as [io]: + with T.block("blockized_B_init"): + io = T.axis.S(32, i0) for i1 in T.serial(0, 4): - with T.block([4], "B_init") as [ii]: + with T.block("B_init"): + ii = T.axis.S(4, i1) B[io, ii] = T.float32(0) for i0, i2_o in T.grid(32, 16): - with T.block([32, T.reduce_axis(0, 16)], "blockized_B_update") as [io, ko]: + with T.block("blockized_B_update"): + io, ko = T.axis.remap("SR", [i0, i2_o]) for i1, i2_i in T.grid(4, 8): - with T.block([4, T.reduce_axis(0, 128)], "B") as [ii, k]: - T.bind(ii, i1) - T.bind(k, ((ko * 8) + i2_i)) + with T.block("B"): + ii = T.axis.S(4, i1) + k = T.axis.R(128, ko * 8 + i2_i) B[io, ii] = B[io, ii] + A[io, ii, k] @@ -98,10 +105,12 @@ def matmul_decompose2(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) for i0, i1 in T.grid(128, 128): - with T.block([128, 128], "update_init") as [vi_init, vj_init]: + with T.block("update_init"): + vi_init, vj_init = T.axis.remap("SS", [i0, i1]) C[vi_init, vj_init] = T.float32(0) for i2 in T.serial(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update_update") as [vi, vj, vk]: + with T.block("update_update"): + vi, vj, vk = T.axis.remap("SSR", [i0, i1, i2]) C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk]) @@ -112,12 +121,10 @@ def matmul_decompose_fail3(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128]) for i, k, j in T.grid(128, 128, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) with T.init(): C[vi, vj] = 0.0 - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -127,25 +134,21 @@ def matmul_decompose4(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, 
offset_factor=1) A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) # body - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) for i0_0 in T.serial(0, 16): for i0_1_init, i1_init in T.grid(8, 128): - with T.block([128, 128], "update_init") as [vi_init, vj_init]: - T.bind(vi_init, ((i0_0 * 8) + i0_1_init)) - T.bind(vj_init, i1_init) + with T.block("update_init"): + vi_init = T.axis.S(128, i0_0 * 8 + i0_1_init) + vj_init = T.axis.S(128, i1_init) C[vi_init, vj_init] = T.float32(0) for i0_1, i1, i2_0, i2_1 in T.grid(8, 128, 19, 7): - with T.block([128, 128, T.reduce_axis(0, 128)], "update_update") as [ - vi, - vj, - vk, - ]: + with T.block("update_update"): T.where((((i2_0 * 7) + i2_1) < 128)) - T.bind(vi, ((i0_0 * 8) + i0_1)) - T.bind(vj, i1) - T.bind(vk, ((i2_0 * 7) + i2_1)) + vi = T.axis.S(128, i0_0 * 8 + i0_1) + vj = T.axis.S(128, i1) + vk = T.axis.R(128, i2_0 * 7 + i2_1) C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk]) diff --git a/tests/python/unittest/test_tir_schedule_reorder.py b/tests/python/unittest/test_tir_schedule_reorder.py index a60ab8dca972..8267a369cf5d 100644 --- a/tests/python/unittest/test_tir_schedule_reorder.py +++ b/tests/python/unittest/test_tir_schedule_reorder.py @@ -30,8 +30,10 @@ def elementwise(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: - B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 + for i, j, k, l in T.grid(128, 128, 128, 128): + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) + B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @T.prim_func @@ -39,11 +41,9 @@ def elementwise_not_affine(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) for i, j, k, l in T.grid(128, 128, 128, 8): - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: - T.bind(vi, i) - 
T.bind(vj, j) - T.bind(vk, k) - T.bind(vl, l * 16) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + vl = T.axis.S(128, l * 16) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -53,7 +53,8 @@ def elementwise_dependent_loop(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128, 128)) for i in T.serial(0, 128): for j, k, l in T.grid(128, i, 128): - with T.block([128, 128, i, 128], "B") as [vi, vj, vk, vl]: + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -62,8 +63,9 @@ def elementwise_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) for i, j, k, l in T.grid(128, 128, 128, 128): - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: + with T.block("B"): T.where(i * 2097152 + j * 16384 + k * 128 + l < 100) + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -74,16 +76,12 @@ def elementwise_non_single_branch(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128)) for i, j in T.grid(128, 128): for k in T.serial(0, 128): - with T.block([128, 128, 128], "C") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("C"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) C[vi, vj, vk] = A[vi, vj, vk] * 2.0 for k in T.serial(0, 128): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) B[vi, vj, vk] = C[vi, vj, vk] * 2.0 @@ -92,12 +90,11 @@ def elementwise_with_loops_not_same_scope(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) for i, j in T.grid(128, 128): - with T.block([128, 128], "A") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("A"): + vi, vj = T.axis.remap("SS", [i, j]) for k in T.serial(0, 128): 
- with T.block([128], "B") as [vk]: - T.bind(vk, k) + with T.block("B"): + vk = T.axis.S(128, k) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -108,10 +105,9 @@ def elementwise_with_wrong_block_var_type(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) for i, j, k in T.grid(128, 128, 128): - with T.block([128, 128, T.scan_axis(0, 128)], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + vk = T.axis.scan(128, k) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -122,11 +118,8 @@ def elementwise_reordered(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) for l, j, k, i in T.grid(128, 128, 128, 128): - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) - T.bind(vl, l) + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -135,11 +128,8 @@ def elementwise_reordered2(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) for k, j, i, l in T.grid(128, 128, 128, 128): - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) - T.bind(vl, l) + with T.block("B"): + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -148,12 +138,9 @@ def elementwise_reordered_with_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128, 128)) B = T.match_buffer(b, (128, 128, 128, 128)) for l, j, k, i in T.grid(128, 128, 128, 128): - with T.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]: + with T.block("B"): T.where(i * 2097152 + j * 16384 + k * 128 + l < 100) - T.bind(vi, i) - 
T.bind(vj, j) - T.bind(vk, k) - T.bind(vl, l) + vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l]) B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0 @@ -161,14 +148,18 @@ def elementwise_reordered_with_predicate(a: T.handle, b: T.handle) -> None: def opaque_access(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [16, 16], "float32") B = T.match_buffer(b, [16, 16], "float32") - with T.block([16, 16], "A") as [vi, vj]: - T.reads([]) - T.writes([A[0:16, 0:16]]) - T.store(A.data, vi * 16 + vj, 1) - with T.block([16, 16], "B") as [vi, vj]: - T.reads([]) - T.writes([B[0:16, 0:16]]) - T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) + for i, j in T.grid(16, 16): + with T.block("A"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([]) + T.writes([A[0:16, 0:16]]) + T.store(A.data, vi * 16 + vj, 1) + for i, j in T.grid(16, 16): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([]) + T.writes([B[0:16, 0:16]]) + T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) @T.prim_func @@ -176,16 +167,14 @@ def opaque_access_reorder(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [16, 16], "float32") B = T.match_buffer(b, [16, 16], "float32") for j, i in T.grid(16, 16): - with T.block([16, 16], "A") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("A"): + vi, vj = T.axis.remap("SS", [i, j]) T.reads([]) T.writes([A[0:16, 0:16]]) T.store(A.data, vi * 16 + vj, 1) for j, i in T.grid(16, 16): - with T.block([16, 16], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) T.reads([]) T.writes([B[0:16, 0:16]]) T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) diff --git a/tests/python/unittest/test_tir_schedule_rfactor.py b/tests/python/unittest/test_tir_schedule_rfactor.py index 78b6a4696baa..bd474ed34295 100644 --- a/tests/python/unittest/test_tir_schedule_rfactor.py +++ 
b/tests/python/unittest/test_tir_schedule_rfactor.py @@ -34,10 +34,9 @@ def transformed_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128]) for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - T.bind(vi, i0) - T.bind(vj, i1) - T.bind(vk, (((i2_outer * 32) + (i2_inner_outer * 4)) + i2_inner_inner)) + with T.block("update"): + vi, vj = T.axis.remap("SS", [i0, i1]) + vk = T.axis.R(128, i2_outer * 32 + i2_inner_outer * 4 + i2_inner_inner) T.reads([C[vi, vj], A[vi, vk], B[vj, vk]]) T.writes([C[vi, vj]]) with T.init(): @@ -53,18 +52,12 @@ def matmul_rfactor(a: T.handle, b: T.handle, c: T.handle) -> None: C_rf = T.alloc_buffer([4, 128, 128]) for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4): - with T.block([4, 128, 128, T.reduce_axis(0, 4), T.reduce_axis(0, 8)], "update_rf") as [ - vi2_inner_inner, - vi, - vj, - vi2_outer, - vi2_inner_outer, - ]: - T.bind(vi2_inner_inner, i2_inner_inner) - T.bind(vi, i0) - T.bind(vj, i1) - T.bind(vi2_outer, i2_outer) - T.bind(vi2_inner_outer, i2_inner_outer) + with T.block("update_rf"): + vi2_inner_inner = T.axis.S(4, i2_inner_inner) + vi = T.axis.S(128, i0) + vj = T.axis.S(128, i1) + vi2_outer = T.axis.R(4, i2_outer) + vi2_inner_outer = T.axis.R(8, i2_inner_outer) with T.init(): C_rf[vi2_inner_inner, vi, vj] = 0.0 C_rf[vi2_inner_inner, vi, vj] = C_rf[vi2_inner_inner, vi, vj] + ( @@ -73,14 +66,8 @@ def matmul_rfactor(a: T.handle, b: T.handle, c: T.handle) -> None: ) for i0_1, i1_1, i2_inner_inner_1 in T.grid(128, 128, 4): - with T.block([T.reduce_axis(0, 4), 128, 128], "update") as [ - vi2_inner_inner_1, - vi_1, - vj_1, - ]: - T.bind(vi2_inner_inner_1, i2_inner_inner_1) - T.bind(vi_1, i0_1) - T.bind(vj_1, i1_1) + with T.block("update"): + vi2_inner_inner_1, vi_1, vj_1 = T.axis.remap("RSS", [i2_inner_inner_1, i0_1, i1_1]) with T.init(): C[vi_1, vj_1] = 0.0 C[vi_1, 
vj_1] = C[vi_1, vj_1] + C_rf[vi2_inner_inner_1, vi_1, vj_1] @@ -93,13 +80,17 @@ def matmul_not_stage_pipeline(a: T.handle, b: T.handle, d: T.handle) -> None: D = T.match_buffer(d, [256, 256]) C = T.alloc_buffer([256, 256]) - with T.block([128, 128, T.reduce_axis(0, 128)], "C") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(128, 128, 128): + with T.block("C"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] - with T.block([256, 256], "D") as [vi, vj]: - D[vi, vj] = C[vi, vj] + for i, j in T.grid(256, 256): + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = C[vi, vj] @T.prim_func @@ -108,10 +99,12 @@ def matmul_not_same_buffer_access(a: T.handle, b: T.handle, c: T.handle) -> None B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128, T.reduce_axis(0, 128)], "C") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vj, vi] = C[vj, vi] + A[vi, vk] * B[vk, vj] + for i, j, k in T.grid(128, 128, 128): + with T.block("C"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vj, vi] = C[vj, vi] + A[vi, vk] * B[vk, vj] @T.prim_func @@ -122,17 +115,13 @@ def matmul_loop_multiple_children(a: T.handle, b: T.handle, c: T.handle, d: T.ha D = T.match_buffer(d, [128, 128]) for k, i, j in T.grid(128, 128, 128): - with T.block([T.reduce_axis(0, 128), 128, 128], "C") as [ck, ci, cj]: - T.bind(ck, k) - T.bind(ci, i) - T.bind(cj, j) + with T.block("C"): + ck, ci, cj = T.axis.remap("RSS", [k, i, j]) with T.init(): C[ci, cj] = 0.0 C[ci, cj] = C[ci, cj] + A[ci, ck] * B[ck, cj] - with T.block([T.reduce_axis(0, 128), 128, 128], "D") as [dk, di, dj]: - T.bind(dk, k) - T.bind(di, i) - T.bind(dj, j) + with T.block("D"): + dk, di, dj = T.axis.remap("RSS", [k, i, j]) with T.init(): D[di, dj] = 0.0 D[di, dj] = D[di, dj] + B[di, dk] * A[dk, dj] @@ 
-143,10 +132,12 @@ def square_sum(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [16, 256, 256]) C = T.match_buffer(c, [16]) - with T.block([16, T.reduce_axis(0, 256), T.reduce_axis(0, 256)], "C") as [b, i, j]: - with T.init(): - C[b] = 0.0 - C[b] = C[b] + A[b, i, j] * A[b, i, j] + for b0, i0, j0 in T.grid(16, 256, 256): + with T.block("C"): + b, i, j = T.axis.remap("SRR", [b0, i0, j0]) + with T.init(): + C[b] = 0.0 + C[b] = C[b] + A[b, i, j] * A[b, i, j] @T.prim_func @@ -156,18 +147,15 @@ def square_sum_rfactor(a: T.handle, c: T.handle) -> None: C_rf = T.alloc_buffer([16, 256]) for i0, i1, i2 in T.grid(16, 256, 256): - with T.block([256, 16, T.reduce_axis(0, 256)], "C_rf") as [vi2, b, i]: - T.bind(vi2, i2) - T.bind(b, i0) - T.bind(i, i1) + with T.block("C_rf"): + vi2, b, i = T.axis.remap("SSR", [i2, i0, i1]) with T.init(): C_rf[b, vi2] = 0.0 C_rf[b, vi2] = C_rf[b, vi2] + (A[b, i, vi2] * A[b, i, vi2]) for i0_1, i2_1 in T.grid(16, 256): - with T.block([T.reduce_axis(0, 256), 16], "C") as [vi2_1, b_1]: - T.bind(vi2_1, i2_1) - T.bind(b_1, i0_1) + with T.block("C"): + vi2_1, b_1 = T.axis.remap("RS", [i2_1, i0_1]) with T.init(): C[b_1] = 0.0 C[b_1] = C[b_1] + C_rf[b_1, vi2_1] @@ -180,18 +168,18 @@ def transformed_square_sum_square_root(a: T.handle, d: T.handle) -> None: C = T.alloc_buffer([16]) for i0, i1_i2_fused_outer, i1_i2_fused_inner in T.grid(16, 65536, 1): - with T.block([16, T.reduce_axis(0, 256), T.reduce_axis(0, 256)], "C") as [b, i, j]: - T.bind(b, i0) - T.bind(i, T.floordiv(i1_i2_fused_outer, 256)) - T.bind(j, T.floormod(i1_i2_fused_outer, 256)) + with T.block("C"): + b = T.axis.S(16, i0) + i = T.axis.R(256, T.floordiv(i1_i2_fused_outer, 256)) + j = T.axis.R(256, T.floormod(i1_i2_fused_outer, 256)) T.reads([C[b], A[b, i, j]]) T.writes([C[b]]) with T.init(): C[b] = 0.0 C[b] = C[b] + (A[b, i, j] * A[b, i, j]) for i0_1 in T.serial(0, 16): - with T.block([16], "D") as [b_1]: - T.bind(b_1, i0_1) + with T.block("D"): + b_1 = T.axis.S(16, i0_1) 
T.reads([C[b_1]]) T.writes([D[b_1]]) D[b_1] = T.sqrt(C[b_1], dtype="float32") @@ -205,31 +193,24 @@ def square_sum_square_root_rfactor(a: T.handle, d: T.handle) -> None: C_rf = T.alloc_buffer([1, 16]) for i0, i1_i2_fused_outer, i1_i2_fused_inner in T.grid(16, 65536, 1): - with T.block([1, 16, T.reduce_axis(0, 256), T.reduce_axis(0, 256)], "C_rf") as [ - vi1_i2_fused_inner, - b, - i, - j, - ]: - T.bind(vi1_i2_fused_inner, i1_i2_fused_inner) - T.bind(b, i0) - T.bind(i, T.floordiv(i1_i2_fused_outer, 256)) - T.bind(j, T.floormod(i1_i2_fused_outer, 256)) + with T.block("C_rf"): + vi1_i2_fused_inner, b = T.axis.remap("SS", [i1_i2_fused_inner, i0]) + i = T.axis.R(256, T.floordiv(i1_i2_fused_outer, 256)) + j = T.axis.R(256, T.floormod(i1_i2_fused_outer, 256)) with T.init(): C_rf[vi1_i2_fused_inner, b] = 0.0 C_rf[vi1_i2_fused_inner, b] = C_rf[vi1_i2_fused_inner, b] + (A[b, i, j] * A[b, i, j]) for i0_1, i1_i2_fused_inner_1 in T.grid(16, 1): - with T.block([T.reduce_axis(0, 1), 16], "C") as [vi1_i2_fused_inner_1, b_1]: - T.bind(vi1_i2_fused_inner_1, i1_i2_fused_inner_1) - T.bind(b_1, i0_1) + with T.block("C"): + vi1_i2_fused_inner_1, b_1 = T.axis.remap("RS", [i1_i2_fused_inner_1, i0_1]) with T.init(): C[b_1] = 0.0 C[b_1] = C[b_1] + C_rf[vi1_i2_fused_inner_1, b_1] for i0_2 in T.serial(0, 16): - with T.block([16], "D") as [b_2]: - T.bind(b_2, i0_2) + with T.block("D"): + b_2 = T.axis.S(16, i0_2) D[b_2] = T.sqrt(C[b_2], dtype="float32") @@ -238,8 +219,10 @@ def element_wise(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -247,10 +230,12 @@ def rowsum(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with 
T.init(): - B[vi] = 0.0 - B[vi] = B[vi] + A[vi, vk] + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi] = 0.0 + B[vi] = B[vi] + A[vi, vk] @T.prim_func @@ -259,9 +244,9 @@ def rowsum_not_quasi_affine(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128,)) for i, k in T.grid(128, 16): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, i) - T.bind(vk, T.floordiv(k * k, 2)) + with T.block("B"): + vi = T.axis.S(128, i) + vk = T.axis.R(128, T.floordiv(k * k, 2)) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -272,10 +257,12 @@ def rowsum_not_dominant(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with T.init(): - B[vi, vk] = 0.0 - B[vi, vk] = B[vi, vk] + A[vi, vk] + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi, vk] = 0.0 + B[vi, vk] = B[vi, vk] + A[vi, vk] @T.prim_func @@ -285,9 +272,8 @@ def rowsum_not_serial(a: T.handle, b: T.handle) -> None: for i in T.serial(0, 128): for k in T.parallel(0, 128): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, i) - T.bind(vk, k) + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -298,10 +284,12 @@ def rowsum_wrong_reduce_pattern1(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with T.init(): - B[vi] = 1.0 - B[vi] = B[vi] + A[vi, vk] + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi] = 1.0 + B[vi] = B[vi] + A[vi, vk] @T.prim_func @@ -309,10 +297,12 @@ def rowsum_wrong_reduce_pattern2(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, 
(128,)) - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - with T.init(): - B[vi] = 0.0 - B[vi] = B[vi] - A[vi, vk] + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi] = 0.0 + B[vi] = B[vi] - A[vi, vk] @T.prim_func @@ -321,9 +311,9 @@ def rowsum_transformed(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128,)) for io, ii_ko_fused, ki in T.grid(32, 128, 4): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: - T.bind(vi, io * 4 + T.floordiv(ii_ko_fused, 32)) - T.bind(vk, T.floormod(ii_ko_fused, 32) * 4 + ki) + with T.block("B"): + vi = T.axis.S(128, io * 4 + T.floordiv(ii_ko_fused, 32)) + vk = T.axis.R(128, T.floormod(ii_ko_fused, 32) * 4 + ki) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -334,10 +324,12 @@ def rowsum_zero_dim(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128]) B = T.match_buffer(b, []) - with T.block([T.reduce_axis(0, 128)], "B") as [k]: - with T.init(): - B[()] = 0.0 - B[()] = B[()] + A[k] + for k0 in range(128): + with T.block("B"): + k = T.axis.R(128, k0) + with T.init(): + B[()] = 0.0 + B[()] = B[()] + A[k] @T.prim_func @@ -346,15 +338,19 @@ def rowsum_zero_dim_rfactor(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, []) B_rf = T.alloc_buffer([128]) - with T.block([128], "B_rf") as [vi0]: - with T.init(): - B_rf[vi0] = 0.0 - B_rf[vi0] = B_rf[vi0] + A[vi0] + for i in range(128): + with T.block("B_rf"): + vi0 = T.axis.S(128, i) + with T.init(): + B_rf[vi0] = 0.0 + B_rf[vi0] = B_rf[vi0] + A[vi0] - with T.block([T.reduce_axis(0, 128)], "B") as [vi0_1]: - with T.init(): - B[()] = 0.0 - B[()] = B[()] + B_rf[vi0_1] + for i in range(128): + with T.block("B"): + vi0_1 = T.axis.R(128, i) + with T.init(): + B[()] = 0.0 + B[()] = B[()] + B_rf[vi0_1] @T.prim_func @@ -362,10 +358,10 @@ def rowsum_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128], dtype="float32") B = T.match_buffer(b, [128], 
dtype="float32") for i, k_0, k_1 in T.grid(128, 13, 10): - with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: + with T.block("B"): T.where(k_0 * 10 + k_1 < 128) - T.bind(vi, i) - T.bind(vk, k_0 * 10 + k_1) + vi = T.axis.S(128, i) + vk = T.axis.R(128, k_0 * 10 + k_1) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk] @@ -377,18 +373,15 @@ def rowsum_predicate_rfactor(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128], dtype="float32") B_rf = T.alloc_buffer([128, 13], dtype="float32") for i, k_0, k_1 in T.grid(128, 13, 10): - with T.block([13, 128, T.reduce_axis(0, 10)], "B_rf") as [vk_0, vi, vk_1]: + with T.block("B_rf"): + vk_0, vi, vk_1 = T.axis.remap("SSR", [k_0, i, k_1]) T.where(k_0 * 10 + k_1 < 128) - T.bind(vk_0, k_0) - T.bind(vi, i) - T.bind(vk_1, k_1) with T.init(): B_rf[vi, vk_0] = T.float32(0) B_rf[vi, vk_0] = B_rf[vi, vk_0] + A[vi, vk_0 * 10 + vk_1] for i, k_0 in T.grid(128, 13): - with T.block([T.reduce_axis(0, 13), 128], "B") as [vk_0, vi]: - T.bind(vk_0, k_0) - T.bind(vi, i) + with T.block("B"): + vk_0, vi = T.axis.remap("RS", [k_0, i]) with T.init(): B[vi] = T.float32(0) B[vi] = B[vi] + B_rf[vi, vk_0] @@ -405,35 +398,31 @@ def multiple_reduction_blocks(a: T.handle, f: T.handle) -> None: for i in T.serial(0, 16): for j1 in T.serial(0, 16): for k1o, k1i in T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "C") as [ci, cj, ck]: - T.bind(ci, i) - T.bind(cj, j1) - T.bind(ck, k1o * 4 + k1i) + with T.block("C"): + ci, cj = T.axis.remap("SS", [i, j1]) + ck = T.axis.R(16, k1o * 4 + k1i) with T.init(): C[ci, cj] = 0.0 C[ci, cj] = C[ci, cj] + A[ci, cj, ck] for k2o, k2i in T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "D") as [di, dj, dk]: - T.bind(di, i) - T.bind(dj, j1) - T.bind(dk, k2o * 4 + k2i) + with T.block("D"): + di, dj = T.axis.remap("SS", [i, j1]) + dk = T.axis.R(16, k2o * 4 + k2i) with T.init(): D[di, dj] = 0.0 D[di, dj] = D[di, dj] + A[di, dj, dk] + C[di, dj] for j2 in T.serial(0, 16): for k3o, k3i in 
T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "E") as [ei, ej, ek]: - T.bind(ei, i) - T.bind(ej, j2) - T.bind(ek, k3o * 4 + k3i) + with T.block("E"): + ei, ej = T.axis.remap("SS", [i, j2]) + ek = T.axis.R(16, k3o * 4 + k3i) with T.init(): E[ei, ej] = 0.0 E[ei, ej] = E[ei, ej] + A[ei, ej, ek] + D[ei, ej] for k4o, k4i in T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "F") as [fi, fj, fk]: - T.bind(fi, i) - T.bind(fj, j2) - T.bind(fk, k4o * 4 + k4i) + with T.block("F"): + fi, fj = T.axis.remap("SS", [i, j2]) + fk = T.axis.R(16, k4o * 4 + k4i) with T.init(): F[fi, fj] = 0.0 F[fi, fj] = F[fi, fj] + A[fi, fj, fk] + E[fi, fj] @@ -449,46 +438,38 @@ def multiple_reduction_blocks_rfactor(a: T.handle, f: T.handle) -> None: C_rf = T.alloc_buffer([16, 16, 4]) for i, j1, k1o, k1i in T.grid(16, 16, 4, 4): - with T.block([4, 16, 16, T.reduce_axis(0, 4)], "C_rf") as [vk1o, ci, cj, vk1i]: - T.bind(vk1o, k1o) - T.bind(ci, i) - T.bind(cj, j1) - T.bind(vk1i, k1i) + with T.block("C_rf"): + vk1o, ci, cj, vk1i = T.axis.remap("SSSR", [k1o, i, j1, k1i]) with T.init(): C_rf[ci, cj, vk1o] = 0.0 C_rf[ci, cj, vk1o] = C_rf[ci, cj, vk1o] + A[ci, cj, ((vk1o * 4) + vk1i)] for i_1 in T.serial(0, 16): for j1_1 in T.serial(0, 16): for k1o_1 in T.serial(0, 4): - with T.block([T.reduce_axis(0, 4), 16, 16], "C") as [vk1o_1, ci_1, cj_1]: - T.bind(vk1o_1, k1o_1) - T.bind(ci_1, i_1) - T.bind(cj_1, j1_1) + with T.block("C"): + vk1o_1, ci_1, cj_1 = T.axis.remap("RSS", [k1o_1, i_1, j1_1]) with T.init(): C[ci_1, cj_1] = 0.0 C[ci_1, cj_1] = C[ci_1, cj_1] + C_rf[ci_1, cj_1, vk1o_1] for k2o, k2i in T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "D") as [di, dj, dk]: - T.bind(di, i_1) - T.bind(dj, j1_1) - T.bind(dk, (k2o * 4) + k2i) + with T.block("D"): + di, dj = T.axis.remap("SS", [i_1, j1_1]) + dk = T.axis.R(16, k2o * 4 + k2i) with T.init(): D[di, dj] = 0.0 D[di, dj] = (D[di, dj] + A[di, dj, dk]) + C[di, dj] for j2 in T.serial(0, 16): for k3o, k3i in T.grid(4, 4): - 
with T.block([16, 16, T.reduce_axis(0, 16)], "E") as [ei, ej, ek]: - T.bind(ei, i_1) - T.bind(ej, j2) - T.bind(ek, (k3o * 4) + k3i) + with T.block("E"): + ei, ej = T.axis.remap("SS", [i_1, j2]) + ek = T.axis.R(16, k3o * 4 + k3i) with T.init(): E[ei, ej] = 0.0 E[ei, ej] = (E[ei, ej] + A[ei, ej, ek]) + D[ei, ej] for k4o, k4i in T.grid(4, 4): - with T.block([16, 16, T.reduce_axis(0, 16)], "F") as [fi, fj, fk]: - T.bind(fi, i_1) - T.bind(fj, j2) - T.bind(fk, (k4o * 4) + k4i) + with T.block("F"): + fi, fj = T.axis.remap("SS", [i_1, j2]) + fk = T.axis.R(16, k4o * 4 + k4i) with T.init(): F[fi, fj] = 0.0 F[fi, fj] = (F[fi, fj] + A[fi, fj, fk]) + E[fi, fj] diff --git a/tests/python/unittest/test_tir_schedule_sampling.py b/tests/python/unittest/test_tir_schedule_sampling.py index c93c7ca63aa8..fbf0a6a5bd78 100644 --- a/tests/python/unittest/test_tir_schedule_sampling.py +++ b/tests/python/unittest/test_tir_schedule_sampling.py @@ -32,8 +32,10 @@ def elementwise(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + for i, j, k in T.grid(128, 128, 128): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 # pylint: enable=no-member,invalid-name,unused-variable diff --git a/tests/python/unittest/test_tir_schedule_split_fuse.py b/tests/python/unittest/test_tir_schedule_split_fuse.py index 29cfe8cadfb3..d2365c39c9cb 100644 --- a/tests/python/unittest/test_tir_schedule_split_fuse.py +++ b/tests/python/unittest/test_tir_schedule_split_fuse.py @@ -30,8 +30,10 @@ def elementwise(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + for i, j, k in T.grid(128, 128, 128): + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, 
k]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @T.prim_func @@ -40,7 +42,10 @@ def elementwise_dependent_loops(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128)) for i in T.serial(0, 128): for j, k in T.grid(i, 128): - with T.block([128, i, 128], "B") as [vi, vj, vk]: + with T.block("B"): + vi = T.axis.S(128, i) + vj = T.axis.S(i, j) + vk = T.axis.S(128, k) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -49,7 +54,8 @@ def elementwise_symbolic(a: T.handle, b: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (128, 128, n)) B = T.match_buffer(b, (128, 128, n)) for i, j, k in T.grid(128, 128, n): - with T.block([128, 128, n], "B") as [vi, vj, vk]: + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -58,10 +64,10 @@ def elementwise_symbolic_fused(a: T.handle, b: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (128, 128, n)) B = T.match_buffer(b, (128, 128, n)) for i_j_k_fused in T.serial(0, (n * 16384)): - with T.block([128, 128, n], "B") as [vi, vj, vk]: - T.bind(vi, T.floordiv(i_j_k_fused, (n * 128))) - T.bind(vj, T.floormod(T.floordiv(i_j_k_fused, n), 128)) - T.bind(vk, T.floormod(i_j_k_fused, n)) + with T.block("B"): + vi = T.axis.S(128, T.floordiv(i_j_k_fused, n * 128)) + vj = T.axis.S(128, T.floormod(T.floordiv(i_j_k_fused, n), 128)) + vk = T.axis.S(n, T.floormod(i_j_k_fused, n)) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -72,11 +78,10 @@ def elementwise_symbolic_split(a: T.handle, b: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (128, 128, n)) B = T.match_buffer(b, (128, 128, n)) for i, j, k0, k1 in T.grid(128, 128, 10, T.floordiv((n + 9), 10)): - with T.block([128, 128, n], "B") as [vi, vj, vk]: + with T.block("B"): T.where((((k0 * T.floordiv((n + 9), 10)) + k1) < n)) - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, ((k0 * T.floordiv((n + 9), 10)) + k1)) + vi, vj = T.axis.remap("SS", [i, j]) + vk = T.axis.S(n, k0 * T.floordiv(n + 9, 10) + 
k1) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -89,10 +94,12 @@ def elementwise_with_seq(a: T.handle, b: T.handle) -> None: C = T.alloc_buffer((128, 128, 128)) for i, j in T.grid(128, 128): for k in T.serial(0, 128): - with T.block([128, 128, 128], "C") as [vi, vj, vk]: + with T.block("C"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) C[vi, vj, vk] = A[vi, vj, vk] * 2.0 for k in T.serial(0, 128): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) B[vi, vj, vk] = C[vi, vj, vk] * 2.0 @@ -102,10 +109,8 @@ def elementwise_with_anno(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128)) for i, j in T.grid(128, 128): for k in T.serial(0, 128, annotations={"useless_annotation": True}): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -117,10 +122,8 @@ def elementwise_with_thread_binding(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128)) for i, j in T.grid(128, 128): for k in T.thread_binding(0, 128, thread="threadIdx.x"): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -132,10 +135,8 @@ def elementwise_with_starting_point(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (128, 128, 128)) for i, j in T.grid(128, 128): for k in T.serial(10, 128): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 
@@ -146,13 +147,11 @@ def elementwise_with_opaque_block(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) for i, j, k in T.grid(128, 128, 128): - with T.block([], "opaque"): + with T.block("opaque"): T.reads([A[i, j, k]]) T.writes([B[i, j, k]]) - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -163,10 +162,10 @@ def elementwise_fused(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128, 128)) B = T.match_buffer(b, (128, 128, 128)) for fused in T.serial(0, 2097152): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, T.floordiv(fused, 16384)) - T.bind(vj, T.floormod(T.floordiv(fused, 128), 128)) - T.bind(vk, T.floormod(fused, 128)) + with T.block("B"): + vi = T.axis.S(128, T.floordiv(fused, 16384)) + vj = T.axis.S(128, T.floormod(T.floordiv(fused, 128), 128)) + vk = T.axis.S(128, T.floormod(fused, 128)) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -177,10 +176,10 @@ def elementwise_split_case0(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128, 128]) B = T.match_buffer(b, [128, 128, 128]) for i1, i2, i3, j1, j2, k1, k2 in T.grid(2, 1, 64, 4, 32, 16, 8): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, ((i1 * 64) + i3)) - T.bind(vj, ((j1 * 32) + j2)) - T.bind(vk, ((k1 * 8) + k2)) + with T.block("B"): + vi = T.axis.S(128, i1 * 64 + i3) + vj = T.axis.S(128, j1 * 32 + j2) + vk = T.axis.S(128, k1 * 8 + k2) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -191,10 +190,10 @@ def elementwise_split_case1(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128, 128]) B = T.match_buffer(b, [128, 128, 128]) for i1, i2, i3, j1, j2, j3, k1, k2, k3 in 
T.grid(2, 1, 64, 2, 1, 64, 2, 1, 64): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i1 * 64 + i3) - T.bind(vj, j1 * 64 + j3) - T.bind(vk, k1 * 64 + k3) + with T.block("B"): + vi = T.axis.S(128, i1 * 64 + i3) + vj = T.axis.S(128, j1 * 64 + j3) + vk = T.axis.S(128, k1 * 64 + k3) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -205,16 +204,11 @@ def elementwise_split_with_predicate(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128, 128, 128]) A = T.match_buffer(a, [128, 128, 128]) for i0, i1, i2, j0, j1, k0, k1 in T.grid(1000, 2, 3, 1, 129, 3, 43): - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.where( - ( - ((((((i0 * 2) + i1) * 3) + i2) < 128) and (((j0 * 129) + j1) < 128)) - and (((k0 * 43) + k1) < 128) - ) - ) - T.bind(vi, (((i0 * 6) + (i1 * 3)) + i2)) - T.bind(vj, j1) - T.bind(vk, ((k0 * 43) + k1)) + with T.block("B"): + T.where((i0 * 2 + i1) * 3 + i2 < 128 and j0 * 129 + j1 < 128 and k0 * 43 + k1 < 128) + vi = T.axis.S(128, i0 * 6 + i1 * 3 + i2) + vj = T.axis.S(128, j1) + vk = T.axis.S(128, k0 * 43 + k1) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -225,7 +219,7 @@ def elementwise_fuse_with_opaque_block(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [128, 128, 128]) A = T.match_buffer(a, [128, 128, 128]) for i_j_k_fused in T.serial(0, 2097152): - with T.block([], "opaque"): + with T.block("opaque"): T.reads( [ A[ @@ -244,10 +238,10 @@ def elementwise_fuse_with_opaque_block(a: T.handle, b: T.handle) -> None: ] ] ) - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, T.floordiv(i_j_k_fused, 16384)) - T.bind(vj, T.floormod(T.floordiv(i_j_k_fused, 128), 128)) - T.bind(vk, T.floormod(i_j_k_fused, 128)) + with T.block("B"): + vi = T.axis.S(128, T.floordiv(i_j_k_fused, 16384)) + vj = T.axis.S(128, T.floormod(T.floordiv(i_j_k_fused, 128), 128)) + vk = T.axis.S(128, T.floormod(i_j_k_fused, 128)) T.reads([A[vi, vj, vk]]) 
T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -259,13 +253,12 @@ def elementwise_split_with_opaque_block(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128, 128]) for i0, i1, j, k in T.grid(8, 16, 128, 128): - with T.block([], "opaque"): + with T.block("opaque"): T.reads([A[i0 * 16 + i1, j, k]]) T.writes([B[i0 * 16 + i1, j, k]]) - with T.block([128, 128, 128], "B") as [vi, vj, vk]: - T.bind(vi, i0 * 16 + i1) - T.bind(vj, j) - T.bind(vk, k) + with T.block("B"): + vi = T.axis.S(128, i0 * 16 + i1) + vj, vk = T.axis.remap("SS", [j, k]) T.reads([A[vi, vj, vk]]) T.writes([B[vi, vj, vk]]) B[vi, vj, vk] = A[vi, vj, vk] * 2.0 @@ -275,14 +268,18 @@ def elementwise_split_with_opaque_block(a: T.handle, b: T.handle) -> None: def opaque_access(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [16, 16], "float32") B = T.match_buffer(b, [16, 16], "float32") - with T.block([16, 16], "A") as [vi, vj]: - T.reads([]) - T.writes([A[0:16, 0:16]]) - T.store(A.data, vi * 16 + vj, 1) - with T.block([16, 16], "B") as [vi, vj]: - T.reads([]) - T.writes([B[0:16, 0:16]]) - T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) + for i, j in T.grid(16, 16): + with T.block("A"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([]) + T.writes([A[0:16, 0:16]]) + T.store(A.data, vi * 16 + vj, 1) + for i, j in T.grid(16, 16): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([]) + T.writes([B[0:16, 0:16]]) + T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) @T.prim_func @@ -290,16 +287,16 @@ def opaque_access_fused(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [16, 16]) B = T.match_buffer(b, [16, 16]) for i_j_fused in T.serial(0, 256): - with T.block([16, 16], "A") as [vi, vj]: - T.bind(vi, T.floordiv(i_j_fused, 16)) - T.bind(vj, T.floormod(i_j_fused, 16)) + with T.block("A"): + vi = T.axis.S(16, T.floordiv(i_j_fused, 16)) + vj = T.axis.S(16, T.floormod(i_j_fused, 
16)) T.reads([]) T.writes([A[0:16, 0:16]]) T.store(A.data, ((vi * 16) + vj), 1, 1) for i_j_fused in T.serial(0, 256): - with T.block([16, 16], "B") as [vi, vj]: - T.bind(vi, T.floordiv(i_j_fused, 16)) - T.bind(vj, T.floormod(i_j_fused, 16)) + with T.block("B"): + vi = T.axis.S(16, T.floordiv(i_j_fused, 16)) + vj = T.axis.S(16, T.floormod(i_j_fused, 16)) T.reads([]) T.writes([B[0:16, 0:16]]) T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle")) @@ -310,16 +307,16 @@ def opaque_access_split(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16, 16)) B = T.match_buffer(b, (16, 16)) for i, j0, j1 in T.grid(16, 4, 4): - with T.block([16, 16], "A") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, ((j0 * 4) + j1)) + with T.block("A"): + vi = T.axis.S(16, i) + vj = T.axis.S(16, j0 * 4 + j1) T.reads([]) T.writes([A[0:16, 0:16]]) T.store(A.data, ((vi * 16) + vj), 1, 1) for i, j0, j1 in T.grid(16, 4, 4): - with T.block([16, 16], "B") as [vi, vj]: - T.bind(vi, i) - T.bind(vj, ((j0 * 4) + j1)) + with T.block("B"): + vi = T.axis.S(16, i) + vj = T.axis.S(16, j0 * 4 + j1) T.reads([]) T.writes([B[0:16, 0:16]]) T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle")) @@ -331,9 +328,9 @@ def elementwise_not_affine(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, (127, 128)) for i in T.serial(0, 4): for j, k in T.grid(T.min(31, 126 - i * 32) + 1, 128): - with T.block([127, 128], "B") as [vi, vj]: - T.bind(vi, i * 32 + j) - T.bind(vj, k) + with T.block("B"): + vi = T.axis.S(127, i * 32 + j) + vj = T.axis.S(128, k) B[vi, vj] = A[vi, vj] @@ -343,12 +340,12 @@ def elementwise_not_affine_fused(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [127, 128]) for i in T.grid(4): for j_k_fused in T.serial(0, T.min(31, 126 - i * 32) * 128 + 128): - with T.block([127, 128], "B") as [vi, vj]: - T.bind( - vi, + with T.block("B"): + vi = T.axis.S( + 127, i * 32 + T.floormod(T.floordiv(j_k_fused, 128), T.min(31, 126 - i 
* 32) + 1), ) - T.bind(vj, T.floormod(j_k_fused, 128)) + vj = T.axis.S(128, T.floormod(j_k_fused, 128)) T.reads([A[vi, vj]]) T.writes([B[vi, vj]]) B[vi, vj] = A[vi, vj] diff --git a/tests/python/unittest/test_tir_schedule_state.py b/tests/python/unittest/test_tir_schedule_state.py index 94e1b4a6b395..bc62fa1ba950 100644 --- a/tests/python/unittest/test_tir_schedule_state.py +++ b/tests/python/unittest/test_tir_schedule_state.py @@ -32,10 +32,14 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -44,10 +48,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) for k in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -55,22 +61,28 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: def block_in_opaque_block(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.match_buffer(b, (128, 128), "float32") - with T.block([128], "B") as vi: - T.reads([A[0:128, 0:128]]) - T.writes([B[0:128, 0:128]]) - B[vi, 0] = A[vi, 0] - if A[vi, 0] == 0.0: - with T.block([], "C"): - T.reads([A[0:128, 0:128]]) - 
T.writes([B[0:128, 0:128]]) - with T.block([128], "D") as vj: - B[vi, vj] = A[vi, vj] * 3.0 - else: - with T.block([], "E"): - T.reads([A[0:128, 0:128]]) - T.writes([B[0:128, 0:128]]) - with T.block([128], "F") as vj: - B[vi, vj] = A[vi, vj] * 2.0 + for i in range(128): + with T.block("B"): + vi = T.axis.S(128, i) + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + B[vi, 0] = A[vi, 0] + if A[vi, 0] == 0.0: + with T.block("C"): + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + for j in range(128): + with T.block("D"): + vj = T.axis.S(128, j) + B[vi, vj] = A[vi, vj] * 3.0 + else: + with T.block("E"): + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + for j in range(128): + with T.block("F"): + vj = T.axis.S(128, j) + B[vi, vj] = A[vi, vj] * 2.0 # pylint: enable=no-member,invalid-name,unused-variable diff --git a/tests/python/unittest/test_tir_schedule_state_cached_flags.py b/tests/python/unittest/test_tir_schedule_state_cached_flags.py index e2b39ce7c289..e3bd000c2e70 100644 --- a/tests/python/unittest/test_tir_schedule_state_cached_flags.py +++ b/tests/python/unittest/test_tir_schedule_state_cached_flags.py @@ -32,10 +32,14 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -44,10 +48,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with 
T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = 0.0 for k in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -55,22 +61,28 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: def block_in_opaque_block(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") B = T.match_buffer(b, (128, 128), "float32") - with T.block([128], "B") as vi: - T.reads([A[0:128, 0:128]]) - T.writes([B[0:128, 0:128]]) - B[vi, 0] = A[vi, 0] - if A[vi, 0] == 0.0: - with T.block([], "C"): - T.reads([A[0:128, 0:128]]) - T.writes([B[0:128, 0:128]]) - with T.block([128], "D") as vj: - B[vi, vj] = A[vi, vj] * 3.0 - else: - with T.block([], "E"): - T.reads([A[0:128, 0:128]]) - T.writes([B[0:128, 0:128]]) - with T.block([128], "F") as vj: - B[vi, vj] = A[vi, vj] * 2.0 + for i in range(128): + with T.block("B"): + vi = T.axis.S(128, i) + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + B[vi, 0] = A[vi, 0] + if A[vi, 0] == 0.0: + with T.block("C"): + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + for j in range(128): + with T.block("D"): + vj = T.axis.S(128, j) + B[vi, vj] = A[vi, vj] * 3.0 + else: + with T.block("E"): + T.reads([A[0:128, 0:128]]) + T.writes([B[0:128, 0:128]]) + for j in range(128): + with T.block("F"): + vj = T.axis.S(128, j) + B[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -78,10 +90,14 @@ def write_after_read(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 
128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 @T.prim_func @@ -90,9 +106,11 @@ def loop_carried_dependency(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128,)) C = T.match_buffer(c, (128,)) for i in range(0, 128): - with T.block([128], "B") as vi: + with T.block("B"): + vi = T.axis.S(128, i) B[vi] = A[vi] * 2.0 - with T.block([128], "C") as vi: + with T.block("C"): + vi = T.axis.S(128, i) C[vi] = T.if_then_else(vi >= 1, B[vi - 1] + 1.0, 0.0, dtype="float32") @@ -101,14 +119,17 @@ def concatenate_multi_producer(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128,)) B = T.match_buffer(b, (128,)) for i in range(0, 64): - with T.block([64], "A_0") as vi: + with T.block("A_0"): + vi = T.axis.S(64, i) A[vi] = vi + 1 for i in range(0, 64): - with T.block([64], "A_1") as vi: - T.bind(vi, i + 64) + with T.block("A_1"): + vi = T.axis.S(64, i + 64) A[vi] = vi + 2 - with T.block([128], "B") as vi: - B[vi] = A[vi] * 2.0 + for i in range(0, 128): + with T.block("B"): + vi = T.axis.S(128, i) + B[vi] = A[vi] * 2.0 @T.prim_func @@ -116,14 +137,17 @@ def concatenate_multi_producer_uncovered(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128,)) B = T.match_buffer(b, (128,)) for i in range(0, 63): - with T.block([63], "A_0") as vi: + with T.block("A_0"): + vi = T.axis.S(63, i) A[vi] = vi + 1 for i in range(0, 64): - with T.block([64], "A_1") as vi: - T.bind(vi, i + 64) + with T.block("A_1"): + vi = T.axis.S(64, i + 64) A[vi] = vi + 2 - with T.block([128], "B") as vi: - B[vi] = A[vi] * 2.0 + for i in range(0, 128): + with T.block("B"): + vi = T.axis.S(128, i) + B[vi] = A[vi] * 2.0 @T.prim_func @@ -132,9 +156,11 @@ def lca_at_loop(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, (128,)) C = T.match_buffer(c, (128,)) for i in range(0, 128): - with T.block([128], "B") as vi: + with T.block("B"): + vi = T.axis.S(128, i) B[vi] = A[vi] * 2.0 - with T.block([128], "C") as vi: + 
with T.block("C"): + vi = T.axis.S(128, i) C[vi] = B[vi] + 1.0 @@ -143,18 +169,20 @@ def multi_producer_consumer(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128,)) B = T.match_buffer(b, (128,)) for i in range(0, 64): - with T.block([64], "A_0") as vi: + with T.block("A_0"): + vi = T.axis.S(64, i) A[vi] = vi + 1 for i in range(0, 64): - with T.block([64], "A_1") as vi: - T.bind(vi, i + 64) + with T.block("A_1"): + vi = T.axis.S(64, i + 64) A[vi] = vi + 2 for i in range(0, 64): - with T.block([64], "B_0") as vi: + with T.block("B_0"): + vi = T.axis.S(64, i) B[vi] = A[vi] + 2.0 for i in range(0, 64): - with T.block([64], "B_1") as vi: - T.bind(vi, i + 64) + with T.block("B_1"): + vi = T.axis.S(64, i + 64) B[vi] = A[vi] + 3.0 @@ -164,12 +192,14 @@ def elementwise_affine_producer(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") for i, j, k, l in T.grid(16, 2, 32, 16): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i * 8 + j * 4 + k // 8) - T.bind(vj, k % 8 * 16 + l) + with T.block("B"): + vi = T.axis.S(128, i * 8 + j * 4 + k // 8) + vj = T.axis.S(128, k % 8 * 16 + l) B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -177,13 +207,19 @@ def elementwise_subblock(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([32, 32], "B") as [vi, vj]: - T.reads([A[vi * 4 : vi * 4 + 4, vj * 4 : vj * 4 + 4]]) - T.writes([B[vi * 4 : vi * 4 + 4, vj * 4 : vj * 4 + 4]]) - with T.block([4, 4], "B_sub") as [vi_i, vj_i]: - B[vi * 4 + vi_i, vj * 4 + vj_i] = A[vi * 4 + vi_i, vj * 4 + vj_i] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in 
T.grid(32, 32): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([A[vi * 4 : vi * 4 + 4, vj * 4 : vj * 4 + 4]]) + T.writes([B[vi * 4 : vi * 4 + 4, vj * 4 : vj * 4 + 4]]) + for ii, jj in T.grid(4, 4): + with T.block("B_sub"): + vi_i, vj_i = T.axis.remap("SS", [ii, jj]) + B[vi * 4 + vi_i, vj * 4 + vj_i] = A[vi * 4 + vi_i, vj * 4 + vj_i] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -191,13 +227,19 @@ def elementwise_subblock_uncovered(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([32, 32], "B") as [vi, vj]: - T.reads([A[vi * 4 : vi * 4 + 2, vj * 4 : vj * 4 + 2]]) - T.writes([B[vi * 4 : vi * 4 + 2, vj * 4 : vj * 4 + 2]]) - with T.block([2, 2], "B_sub") as [vi_i, vj_i]: - B[vi * 4 + vi_i, vj * 4 + vj_i] = A[vi * 4 + vi_i, vj * 4 + vj_i] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(32, 32): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads([A[vi * 4 : vi * 4 + 2, vj * 4 : vj * 4 + 2]]) + T.writes([B[vi * 4 : vi * 4 + 2, vj * 4 : vj * 4 + 2]]) + for ii, jj in T.grid(2, 2): + with T.block("B_sub"): + vi_i, vj_i = T.axis.remap("SS", [ii, jj]) + B[vi * 4 + vi_i, vj * 4 + vj_i] = A[vi * 4 + vi_i, vj * 4 + vj_i] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -207,10 +249,12 @@ def bound_to_thread(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer([128, 128], scope="shared") for i in T.thread_binding(0, 128, thread="threadIdx.x"): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) B[vi, vj] = A[vi, vj] * 2.0 for j in T.serial(0, 128): - with T.block([128, 128], "C") as [vi, vj]: 
+ with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) C[vj, vi] = B[vj, vi] + 1.0 @@ -222,14 +266,14 @@ def equal_ranked_threads(a: T.handle, c: T.handle) -> None: for i_o in T.thread_binding(0, 16, thread="threadIdx.x"): for i_i in T.thread_binding(0, 8, thread="threadIdx.y"): for j in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i_o * 8 + i_i) - T.bind(vj, j) + with T.block("B"): + vi = T.axis.S(128, i_o * 8 + i_i) + vj = T.axis.S(128, j) B[vi, vj] = A[vi, vj] * 2.0 for j in T.serial(0, 128): - with T.block([128, 128], "C") as [vi, vj]: - T.bind(vi, i_o * 8 + i_i) - T.bind(vj, j) + with T.block("C"): + vi = T.axis.S(128, i_o * 8 + i_i) + vj = T.axis.S(128, j) C[vj, vi] = B[vj, vi] + 1.0 @@ -241,10 +285,12 @@ def warp_memory(a: T.handle, c: T.handle) -> None: for i_o in T.thread_binding(0, 4, thread="threadIdx.y"): for i_i in T.thread_binding(0, 32, thread="threadIdx.x"): for j in T.serial(0, 128): - with T.block([4, 32, 128], "B") as [warp_id, lane_id, vj]: + with T.block("B"): + warp_id, lane_id, vj = T.axis.remap("SSS", [i_o, i_i, j]) B[vj, warp_id, lane_id] = A[warp_id * 32 + lane_id, vj] * 2.0 for j in T.serial(0, 128): - with T.block([4, 32, 128], "C") as [warp_id, lane_id, vj]: + with T.block("C"): + warp_id, lane_id, vj = T.axis.remap("SSS", [i_o, i_i, j]) C[warp_id * 32 + lane_id, vj] = B[vj, warp_id, lane_id] + 1.0 @@ -256,11 +302,15 @@ def warp_memory_negative(a: T.handle, c: T.handle) -> None: for i_o in T.thread_binding(0, 4, thread="threadIdx.y"): for i_i in T.thread_binding(0, 32, thread="threadIdx.x"): for j in T.serial(0, 128): - with T.block([4, 32, 128], "B") as [warp_id, lane_id, vj]: + with T.block("B"): + warp_id, lane_id, vj = T.axis.remap("SSS", [i_o, i_i, j]) B[vj, warp_id, lane_id] = A[warp_id * 32 + lane_id, vj] * 2.0 for i_o_prime in T.thread_binding(0, 4, thread="threadIdx.y"): for j in T.serial(0, 128): - with T.block([4, 32, 4, 128], "C") as [_warp_id, lane_id, warp_id, vj]: + with T.block("C"): + 
_warp_id, warp_id, lane_id, vj = T.axis.remap( + "SSSS", [i_o, i_i, i_o_prime, j] + ) C[warp_id * 32 + lane_id, vj] = B[vj, warp_id, lane_id] + 1.0 diff --git a/tests/python/unittest/test_tir_schedule_storage_align.py b/tests/python/unittest/test_tir_schedule_storage_align.py index 7d0e91f70e60..3b699fd8f1b2 100644 --- a/tests/python/unittest/test_tir_schedule_storage_align.py +++ b/tests/python/unittest/test_tir_schedule_storage_align.py @@ -29,22 +29,20 @@ def element_wise(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) # body - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, ax1) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i0, ax1]) T.reads([A[vi, vj]]) T.writes([B[vi, vj]]) B[vi, vj] = (A[vi, vj]*T.float32(2)) for i1 in T.serial(0, 128): - with T.block([128, 128], "C") as [vi_1, vj_1]: - T.bind(vi_1, i0) - T.bind(vj_1, i1) + with T.block("C"): + vi_1, vj_1 = T.axis.remap("SS", [i0, i1]) T.reads([B[vi_1, vj_1]]) T.writes([C[vi_1, vj_1]]) C[vi_1, vj_1] = (B[vi_1, vj_1] + T.float32(1)) @@ -55,23 +53,21 @@ def element_wise_storage_align(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) # body - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, ax1) + with T.block("B"): + vi, vj = T.axis.remap("SS", [i0, ax1]) T.reads([A[vi, 
vj]]) T.writes([B[vi, vj]]) T.block_attr({"buffer_dim_align":[[0, 0, 128, 127]]}) B[vi, vj] = (A[vi, vj]*T.float32(2)) for i1 in T.serial(0, 128): - with T.block([128, 128], "C") as [vi_1, vj_1]: - T.bind(vi_1, i0) - T.bind(vj_1, i1) + with T.block("C"): + vi_1, vj_1 = T.axis.remap("SS", [i0, i1]) T.reads([B[vi_1, vj_1]]) T.writes([C[vi_1, vj_1]]) C[vi_1, vj_1] = (B[vi_1, vj_1] + T.float32(1)) @@ -82,23 +78,21 @@ def element_wise_invalid_annotation(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) # body - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): - with T.block([128, 128], "B") as [vi, vj]: + with T.block("B"): T.block_attr({"buffer_dim_align": [0]}) - T.bind(vi, i0) - T.bind(vj, ax1) + vi, vj = T.axis.remap("SS", [i0, ax1]) T.reads([A[vi, vj]]) T.writes([B[vi, vj]]) B[vi, vj] = (A[vi, vj]*T.float32(2)) for i1 in T.serial(0, 128): - with T.block([128, 128], "C") as [vi_1, vj_1]: - T.bind(vi_1, i0) - T.bind(vj_1, i1) + with T.block("C"): + vi_1, vj_1 = T.axis.remap("SS", [i0, i1]) T.reads([B[vi_1, vj_1]]) T.writes([C[vi_1, vj_1]]) C[vi_1, vj_1] = (B[vi_1, vj_1] + T.float32(1)) diff --git a/tests/python/unittest/test_tir_schedule_trace.py b/tests/python/unittest/test_tir_schedule_trace.py index 36e05c6b5170..f1c97c57b2ff 100644 --- a/tests/python/unittest/test_tir_schedule_trace.py +++ b/tests/python/unittest/test_tir_schedule_trace.py @@ -32,18 +32,24 @@ def elementwise(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.alloc_buffer((128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in 
T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func def elementwise_inlined(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) C = T.match_buffer(c, (128, 128)) - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 # pylint: enable=no-member,invalid-name,unused-variable diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py index 185d229b44e1..440d0ab67a50 100644 --- a/tests/python/unittest/test_tir_schedule_utilities.py +++ b/tests/python/unittest/test_tir_schedule_utilities.py @@ -34,10 +34,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) for k in range(0, 128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] diff --git a/tests/python/unittest/test_tir_specialize.py b/tests/python/unittest/test_tir_specialize.py index 86dc5dffed9f..72666a89ebcb 100644 --- a/tests/python/unittest/test_tir_specialize.py +++ b/tests/python/unittest/test_tir_specialize.py @@ -27,10 +27,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle, n: T.int32) -> None: B = T.match_buffer(b, [m, n]) C = T.match_buffer(c, [m, m]) - with T.block([m, m, T.reduce_axis(0, n)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for 
i, j, k in T.grid(m, m, n): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -39,10 +41,12 @@ def matmul_128(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -52,10 +56,12 @@ def matmul_m_128(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [m, 128]) C = T.match_buffer(c, [m, m]) - with T.block([m, m, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(m, m, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -66,10 +72,12 @@ def matmul_m_8x(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [m, x * 8]) C = T.match_buffer(c, [m, m]) - with T.block([m, m, T.reduce_axis(0, x * 8)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = 0.0 - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(m, m, x * 8): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -81,11 +89,15 @@ def element_wise(a: T.handle, c: T.handle) -> None: B = T.alloc_buffer((m, n), "float32") - with T.block([m, n], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(m, n): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, 
j]) + B[vi, vj] = A[vi, vj] * 2.0 - with T.block([m, n], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(m, n): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -94,11 +106,15 @@ def element_wise_128_64(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 64), "float32") B = T.alloc_buffer((128, 64), "float32") - with T.block([128, 64], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, 64): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, 64], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, 64): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -108,11 +124,15 @@ def element_wise_128_n(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, n), "float32") B = T.alloc_buffer((128, n), "float32") - with T.block([128, n], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(128, n): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 - with T.block([128, n], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + 1.0 + for i, j in T.grid(128, n): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + 1.0 @T.prim_func @@ -120,8 +140,10 @@ def mem_copy(a: T.handle, b: T.handle, m: T.int32, n: T.int32, p: T.int32, q: T. 
A = T.match_buffer(a, (m, n), "float32", strides=[p, 1], elem_offset=q) B = T.match_buffer(b, (m, n), "float32", strides=[p, 1], elem_offset=q) - with T.block([m, n], "") as [vi, vj]: - B[vi, vj] = A[vi, vj] + for i, j in T.grid(m, n): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] @T.prim_func @@ -129,8 +151,10 @@ def mem_copy_16_16_8_4(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32", strides=[8, 1], elem_offset=4) B = T.match_buffer(b, (16, 16), "float32", strides=[8, 1], elem_offset=4) - with T.block([16, 16], "") as [vi, vj]: - B[vi, vj] = A[vi, vj] + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] @T.prim_func @@ -138,8 +162,10 @@ def mem_copy_m_n_p_n(a: T.handle, b: T.handle, m: T.int32, n: T.int32, p: T.int3 A = T.match_buffer(a, (m, n), "float32", strides=[p, 1], elem_offset=n) B = T.match_buffer(b, (m, n), "float32", strides=[p, 1], elem_offset=n) - with T.block([m, n], "") as [vi, vj]: - B[vi, vj] = A[vi, vj] + for i, j in T.grid(m, n): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] @T.prim_func @@ -147,8 +173,10 @@ def param_in_arith_exprs(a: T.handle, b: T.handle) -> None: n = T.var("int32") A = T.match_buffer(a, [n // 8, 8], "int32") B = T.match_buffer(b, [n], "int32") - with T.block([n - 1], "") as [vi]: - B[vi] = A[vi // 8, vi % 8] + (n + 1) * 42 + for i in range(n - 1): + with T.block(): + vi = T.axis.S(n - 1, i) + B[vi] = A[vi // 8, vi % 8] + (n + 1) * 42 @T.prim_func @@ -156,8 +184,10 @@ def param_in_arith_exprs_n_16(a: T.handle, b: T.handle) -> None: n = T.var("int32") A = T.match_buffer(a, [2, 8], "int32") B = T.match_buffer(b, [16], "int32") - with T.block([15], "") as [vi]: - B[vi] = A[vi // 8, vi % 8] + 714 + for i in range(15): + with T.block(): + vi = T.axis.S(15, i) + B[vi] = A[vi // 8, vi % 8] + 714 def test_specialize_nothing(): diff --git 
a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py index 0cfc724e41de..7d3115428f5a 100644 --- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py +++ b/tests/python/unittest/test_tir_transform_compact_buffer_region.py @@ -32,17 +32,17 @@ def elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i, j]) T.writes(B[i, j]) B[i, j] = A[i, j] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] * 2.0 @@ -53,7 +53,7 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((1, 16), "float32") @@ -74,7 +74,7 @@ def unschedulable_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") @@ -89,11 +89,11 @@ def param_buffer_access_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (20, 20), "float32") B = T.match_buffer(c, (20, 20), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(B[i, 0:16]) for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i, j]) T.writes(B[i, j]) B[i, j] = A[i, j] + 1.0 @@ -106,17 +106,17 @@ def shared_mem_func(a: T.handle, c: T.handle) -> None: for i0 in 
T.thread_binding(0, 2, thread="blockIdx.x"): for i1 in T.thread_binding(0, 2, thread="vthread"): for i2 in T.thread_binding(0, 4, thread="threadIdx.x"): - with T.block([]): + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, 0:16]) T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((16, 16), "float32", scope="shared") for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i0 * 8 + i1 * 4 + i2, j]) B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i0 * 8 + i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0 @@ -129,17 +129,17 @@ def compacted_shared_mem_func(a: T.handle, c: T.handle) -> None: for i0 in T.thread_binding(0, 2, thread="blockIdx.x"): for i1 in T.thread_binding(0, 2, thread="vthread"): for i2 in T.thread_binding(0, 4, thread="threadIdx.x"): - with T.block([]): + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, 0:16]) T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((8, 16), "float32", scope="shared") for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i1 * 4 + i2, j]) B[i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i1 * 4 + i2, j] * 2.0 @@ -152,17 +152,17 @@ def warp_mem_func(a: T.handle, c: T.handle) -> None: for i0 in T.thread_binding(0, 2, thread="blockIdx.x"): for i1 in T.thread_binding(0, 2, thread="vthread"): for i2 in T.thread_binding(0, 4, thread="threadIdx.x"): - with T.block([]): + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, 0:16]) T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((16, 16), "float32", scope="warp") for j in range(0, 16): - with T.block([]) as 
[]: + with T.block() as []: T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i0 * 8 + i1 * 4 + i2, j]) B[i0 * 8 + i1 * 4 + i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i0 * 8 + i1 * 4 + i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i0 * 8 + i1 * 4 + i2, j] * 2.0 @@ -175,17 +175,17 @@ def compacted_warp_mem_func(a: T.handle, c: T.handle) -> None: for i0 in T.thread_binding(0, 2, thread="blockIdx.x"): for i1 in T.thread_binding(0, 2, thread="vthread"): for i2 in T.thread_binding(0, 4, thread="threadIdx.x"): - with T.block([]): + with T.block(): T.reads(A[i0 * 8 + i1 * 4 + i2, 0:16]) T.writes(C[i0 * 8 + i1 * 4 + i2, 0:16]) B = T.alloc_buffer((4, 16), "float32", scope="warp") for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i0 * 8 + i1 * 4 + i2, j]) T.writes(B[i2, j]) B[i2, j] = A[i0 * 8 + i1 * 4 + i2, j] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i2, j]) T.writes(C[i0 * 8 + i1 * 4 + i2, j]) C[i0 * 8 + i1 * 4 + i2, j] = B[i2, j] * 2.0 @@ -196,17 +196,17 @@ def symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (n * 8,), "float32") C = T.match_buffer(c, (n * 8,), "float32") for i in range(0, n): - with T.block([]): + with T.block(): T.reads(A[i * 8 : i * 8 + 8]) T.writes(C[i * 8 : i * 8 + 8]) B = T.alloc_buffer((n * 8,), "float32") for j in range(0, 8): - with T.block([]) as []: + with T.block() as []: T.reads(A[i * 8 + j]) T.writes(B[i * 8 + j]) B[i * 8 + j] = A[i * 8 + j] + 1.0 for j in range(0, 8): - with T.block([]) as []: + with T.block() as []: T.reads(B[i * 8 + j]) T.writes(C[i * 8 + j]) C[i * 8 + j] = B[i * 8 + j] * 2.0 @@ -217,17 +217,17 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (n * 8,), "float32") C = T.match_buffer(c, (n * 8,), "float32") for i in range(0, n): - with T.block([]): + 
with T.block(): T.reads(A[i * 8 : i * 8 + 8]) T.writes(C[i * 8 : i * 8 + 8]) B = T.alloc_buffer((8,), "float32") for j in range(0, 8): - with T.block([]) as []: + with T.block() as []: T.reads(A[i * 8 + j]) T.writes(B[j]) B[j] = A[i * 8 + j] + 1.0 for j in range(0, 8): - with T.block([]) as []: + with T.block() as []: T.reads(B[j]) T.writes(C[i * 8 + j]) C[i * 8 + j] = B[j] * 2.0 @@ -238,12 +238,12 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (8, 8), "float32") C = T.match_buffer(c, (8, 8), "float32") for i in range(0, 8): - with T.block([]): + with T.block(): T.reads(A[0, 8]) T.writes(C[0, 8]) B = T.alloc_buffer((8, 8), "float32") for j in range(0, 4): - with T.block([]) as []: + with T.block() as []: D = T.alloc_buffer((8, 8), "float32") T.reads(A[i, j]) T.writes(B[i, j]) @@ -252,12 +252,12 @@ def complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: for k in range(2, 4): T.store(B.data, j, A[i, j] + D[k, j]) for j in range(3, 5): - with T.block([]) as []: + with T.block() as []: T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] for j in range(6, 8): - with T.block([]) as []: + with T.block() as []: T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] @@ -268,12 +268,12 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: A = T.match_buffer(a, (8, 8), "float32") C = T.match_buffer(c, (8, 8), "float32") for i in range(0, 8): - with T.block([]): + with T.block(): T.reads(A[0, 8]) T.writes(C[0, 8]) B = T.alloc_buffer((1, 8), "float32") for j in range(0, 4): - with T.block([]) as []: + with T.block() as []: D = T.alloc_buffer((6, 1), "float32") T.reads(A[i, j]) T.writes(B[0, j]) @@ -282,12 +282,12 @@ def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None: for k in range(2, 4): T.store(B.data, j, A[i, j] + D[k - 2, 0]) for j in range(3, 5): - with T.block([]) as []: + with T.block() as []: T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] for j in range(6, 8): - with 
T.block([]) as []: + with T.block() as []: T.reads(B[0, j]) T.writes(C[i, j]) C[i, j] = B[0, j] @@ -298,19 +298,19 @@ def match_buffer_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16)) C = T.match_buffer(c, (16, 16)) for i in range(0, 16): - with T.block([]): + with T.block(): A0 = T.match_buffer(A[i, 0:16], (16)) C0 = T.match_buffer(C[i, 0:16], (16)) B = T.alloc_buffer((16, 16)) - with T.block([]): + with T.block(): B0 = T.match_buffer(B[i, 0:16], (16)) for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: A1 = T.match_buffer(A0[j], ()) B1 = T.match_buffer(B0[j], ()) B1[()] = A1[()] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: C1 = T.match_buffer(C0[j], ()) B2 = T.match_buffer(B[i, j], ()) C1[()] = B2[()] * 2.0 @@ -321,19 +321,19 @@ def compacted_match_buffer_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16)) C = T.match_buffer(c, (16, 16)) for i in range(0, 16): - with T.block([]): + with T.block(): A0 = T.match_buffer(A[i, 0:16], (16)) C0 = T.match_buffer(C[i, 0:16], (16)) B = T.alloc_buffer((1, 16)) - with T.block([]): + with T.block(): B0 = T.match_buffer(B[0, 0:16], (16)) for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: A1 = T.match_buffer(A0[j], ()) B1 = T.match_buffer(B0[j], ()) B1[()] = A1[()] + 1.0 for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: C1 = T.match_buffer(C0[j], ()) B2 = T.match_buffer(B[0, j], ()) C1[()] = B2[()] * 2.0 @@ -344,18 +344,18 @@ def storage_align_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(A[i, j]) T.writes(B[i, j]) T.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]}) B[i, j] = A[i, j] + 1.0 
for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads(B[i, j]) T.writes(C[i, j]) C[i, j] = B[i, j] * 2.0 @@ -366,7 +366,7 @@ def compacted_storage_align_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((1, 16), strides=(31, 1), dtypes="float32") diff --git a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py index 287a30916520..ee323a64c50f 100644 --- a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py +++ b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py @@ -32,19 +32,19 @@ def elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer((16, 16), "float32") for j in range(0, 16): - with T.block([16, 16]) as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block(): + vi = T.axis.S(16, i) + vj = T.axis.S(16, j) B[vi, vj] = A[vi, vj] + 1.0 for j in range(0, 16): - with T.block([16, 16]) as [vi, vj]: - T.bind(vi, i) - T.bind(vj, j) + with T.block(): + vi = T.axis.S(16, i) + vj = T.axis.S(16, j) C[vi, vj] = B[vi, vj] * 2.0 @@ -53,7 +53,7 @@ def substituted_elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) T.writes(C[i, 0:16]) B = T.alloc_buffer([16, 16], "float32") diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py new file mode 100644 index 
000000000000..a91fa2591e00 --- /dev/null +++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +import tvm + +from tvm.script import tir as T +from tvm.tir import stmt_functor + +# fmt: off +@T.prim_func +def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True}) + placeholder_33 = T.match_buffer(placeholder_30, [1, 28, 28, 192], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_34 = T.match_buffer(placeholder_31, [1, 1, 192, 16], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_35 = T.match_buffer(placeholder_32, [1, 1, 1, 16], dtype="int32", elem_offset=0, align=128, offset_factor=1) + T_cast_9 = T.match_buffer(T_cast_8, [1, 28, 28, 16], dtype="int16", elem_offset=0, align=128, offset_factor=1) + # body + PaddedInput_3 = T.allocate([1, 28, 28, 192], "int16", "global") + for i0_i1_fused_3 in T.parallel(0, 28): + for i2_3, i3_3 in T.grid(28, 
192): + T.store(PaddedInput_3, (((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3), T.load("int16", placeholder_33.data, (((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)), True) + for ax0_ax1_fused_ax2_fused_3 in T.parallel(0, 784): + for ax3_2 in T.serial(0, 16): + Conv2dOutput_3 = T.allocate([1, 1, 1, 1], "int32", "global") + T.store(Conv2dOutput_3, 0, 0, True) + for rc_3 in T.serial(0, 192): + T.store(Conv2dOutput_3, 0, (T.load("int32", Conv2dOutput_3, 0) + (T.cast(T.load("int16", PaddedInput_3, ((ax0_ax1_fused_ax2_fused_3*192) + rc_3)), "int32")*T.cast(T.load("int16", placeholder_34.data, ((rc_3*16) + ax3_2)), "int32"))), True) + T.store(T_cast_9.data, ((ax0_ax1_fused_ax2_fused_3*16) + ax3_2), T.cast(T.cast(T.max(T.min(T.q_multiply_shift((T.load("int32", Conv2dOutput_3, 0) + T.load("int32", placeholder_35.data, ax3_2)), 1764006585, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True) +# fmt: on + + +def test_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(): + primfunc = fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2 + mod = tvm.IRModule.from_expr(primfunc) + mod = tvm.tir.transform.ConvertForLoopsToSerial()(mod) + + def verify_serial_loops(stmt): + if isinstance(stmt, tvm.tir.For): + assert stmt.kind == tvm.tir.ForKind.SERIAL + + for _, primfunc in mod.functions.items(): + stmt_functor.post_order_visit(primfunc.body, verify_serial_loops) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index 21c896c7bb7e..eed82ebb9118 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -32,7 +32,7 @@ def compacted_elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in range(0, 16): - with T.block([]): + with T.block(): T.reads(A[i, 0:16]) 
T.writes(C[i, 0:16]) B = T.alloc_buffer([1, 16], "float32", scope="global") @@ -67,7 +67,7 @@ def compacted_gpu_func(a: T.handle, c: T.handle) -> None: for i0 in T.thread_binding(0, 4, thread="blockIdx.x"): for i1 in T.thread_binding(0, 2, thread="threadIdx.x"): for i2 in T.thread_binding(0, 2, thread="vthread"): - with T.block([]): + with T.block(): T.reads(A[i0 * 4 + i1 * 2 + i2, 0:16]) T.writes(C[i0 * 4 + i1 * 2 + i2, 0:16]) B = T.alloc_buffer([1, 16], "float32", scope="local") @@ -108,17 +108,17 @@ def compacted_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> C = T.match_buffer(c, (n, m), "float32") for i in range(0, n): - with T.block([]): + with T.block(): T.reads(A[i, m]) T.writes(C[i, m]) B = T.alloc_buffer((m,), "float32", scope="global") for j in range(0, m): - with T.block([]) as []: + with T.block() as []: T.reads(A[i, j]) T.writes(B[j]) B[j] = A[i, j] + 1.0 for j in range(0, m): - with T.block([]) as []: + with T.block() as []: T.reads(B[j]) T.writes(C[i, j]) C[i, j] = B[j] * 2.0 @@ -143,7 +143,7 @@ def compacted_predicate_func(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (32), "float32") for i, j in T.grid(5, 7): - with T.block([]) as []: + with T.block() as []: T.reads(A[i * 7 + j]) T.writes(C[i * 7 + j]) T.where(i * 7 + j < 32) @@ -166,7 +166,7 @@ def compacted_unit_loop_func(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (32), "float32") for x, y, z in T.grid(4, 1, 8): - with T.block([]) as []: + with T.block() as []: T.reads(A[x * 8 + y * 8 + z]) T.writes(C[x * 8 + y * 8 + z]) C[x * 8 + y * 8 + z] = A[x * 8 + y * 8 + z] + 1.0 @@ -187,7 +187,7 @@ def compacted_multi_alloc_func(a: T.handle, d: T.handle) -> None: D = T.match_buffer(d, (32), "float32") for i in range(0, 32): - with T.block([]) as []: + with T.block() as []: T.reads(A[i]) T.writes(D[i]) B = T.alloc_buffer((32,), scope="global") @@ -215,7 +215,7 @@ def compacted_strided_buffer_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 
16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i0 in range(0, 4): - with T.block([]): + with T.block(): T.reads(A[i0 * 4 : i0 * 4 + 4, 0:16]) T.writes(C[i0 * 4 : i0 * 4 + 4, 0:16]) B = T.alloc_buffer([4, 16], "float32", strides=[17, 1], scope="global") diff --git a/tests/python/unittest/test_tir_transform_inject_copy_intrin.py b/tests/python/unittest/test_tir_transform_inject_copy_intrin.py index 86bf87d5fa85..aa0448c3c682 100644 --- a/tests/python/unittest/test_tir_transform_inject_copy_intrin.py +++ b/tests/python/unittest/test_tir_transform_inject_copy_intrin.py @@ -17,6 +17,7 @@ import tvm import tvm.testing from tvm import te +from tvm.driver.build_module import schedule_to_module def test_copy2d(): @@ -53,11 +54,7 @@ def test_copy_pad(): ) s = te.create_schedule(B.op) s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) def cb(src, dst, pad_before, pad_after, pad_value): @@ -77,11 +74,7 @@ def test_single_point_test(): B = te.compute((1,), lambda i: A[i], name="B") s = te.create_schedule(B.op) s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) def cb(src, dst, pad_before, pad_after, pad_value): @@ -105,11 +98,8 @@ def test_copy_pad_split(): xo, xi = s[B].split(B.op.axis[0], factor=4) s[Apad].compute_at(s[B], xo) s[Apad].pragma(s[Apad].op.axis[0], "memcpy") - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) 
- mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod._move()) mod = tvm.tir.transform.Simplify()(mod._move()) diff --git a/tests/python/unittest/test_tir_transform_lower_init_block.py b/tests/python/unittest/test_tir_transform_lower_init_block.py index c1c4fb3d2e8f..a4fd9404eee4 100644 --- a/tests/python/unittest/test_tir_transform_lower_init_block.py +++ b/tests/python/unittest/test_tir_transform_lower_init_block.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm import tir, te +from tvm import te from tvm.script import tir as T # pylint: disable=no-self-argument @@ -28,10 +28,13 @@ def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) - with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: - with T.init(): - B[i] = T.float32(0) - B[i] += A[i, j, k] + for i0, j0 in T.grid(64, 64): + for k0 in T.serial(32, 64): + with T.block(): + i, j, k = T.axis.remap("SRR", [i0, j0, k0]) + with T.init(): + B[i] = T.float32(0) + B[i] += A[i, j, k] @tvm.script.ir_module @@ -41,10 +44,13 @@ def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) - with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: - if (j == 0) and (k == 32): - B[i] = T.float32(0) - B[i] += A[i, j, k] + for i0, j0 in T.grid(64, 64): + for k0 in T.serial(32, 64): + with T.block(): + i, j, k = T.axis.remap("SRR", [i0, j0, k0]) + if (j == 0) and (k == 32): + B[i] = T.float32(0) + B[i] += A[i, j, k] @tvm.script.ir_module @@ -54,12 +60,15 @@ def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) - with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: - BB = T.match_buffer(B[i], ()) - AA = T.match_buffer(A[i, 0:64, 0:64], (64, 64)) - with T.init(): - 
BB[()] = T.float32(0) - BB[()] += AA[j, k] + for i0, j0 in T.grid(64, 64): + for k0 in T.serial(32, 64): + with T.block(): + i, j, k = T.axis.remap("SRR", [i0, j0, k0]) + BB = T.match_buffer(B[i], ()) + AA = T.match_buffer(A[i, 0:64, 0:64], (64, 64)) + with T.init(): + BB[()] = T.float32(0) + BB[()] += AA[j, k] @tvm.script.ir_module @@ -69,17 +78,21 @@ def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) - with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: - BB = T.match_buffer(B[i], ()) - AA = T.match_buffer(A[i, 0:64, 0:64], (64, 64)) - if (j == 0) and (k == 32): - BB[()] = T.float32(0) - BB[()] += AA[j, k] + for i0, j0 in T.grid(64, 64): + for k0 in T.serial(32, 64): + with T.block(): + i, j, k = T.axis.remap("SRR", [i0, j0, k0]) + BB = T.match_buffer(B[i], ()) + AA = T.match_buffer(A[i, 0:64, 0:64], (64, 64)) + if (j == 0) and (k == 32): + BB[()] = T.float32(0) + BB[()] += AA[j, k] def test_lower_reduction(): origin_mod = WithInit mod = tvm.tir.transform.LowerInitBlock()(origin_mod) + print(mod.script()) tvm.ir.assert_structural_equal(mod, WithBranch, True) diff --git a/tests/python/unittest/test_tir_transform_make_packed_api.py b/tests/python/unittest/test_tir_transform_make_packed_api.py index 15f994069abd..1ab6bdaad90a 100644 --- a/tests/python/unittest/test_tir_transform_make_packed_api.py +++ b/tests/python/unittest/test_tir_transform_make_packed_api.py @@ -14,9 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import numpy + import tvm from tvm import te -import numpy +from tvm.driver.build_module import schedule_to_module def test_makeapi(): @@ -27,10 +29,7 @@ def test_makeapi(): C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") s = te.create_schedule(C.op) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([n, A, B, C], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [n, A, B, C]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Apply( lambda f: f.with_attr( diff --git a/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py b/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py index 9c511f1de6b9..cc78b84f9b4e 100644 --- a/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py +++ b/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py @@ -14,20 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import tvm -from tvm import te import numpy as np + +import tvm import tvm.testing +from tvm import te +from tvm.driver.build_module import schedule_to_module from tvm.topi.math import cast def run_passes(sch, args): - bounds = tvm.te.schedule.InferBound(sch) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(sch, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc(args, stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(sch, args) return tvm.transform.Sequential( [ tvm.tir.transform.StorageFlatten(64), diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py index cb8968cfc880..b5620d748d8a 100644 --- a/tests/python/unittest/test_tir_transform_narrow_datatype.py +++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm import te -from tvm import relay +from tvm import te, relay +from tvm.driver.build_module import schedule_to_module from tvm.tir import const @@ -39,11 +39,8 @@ def lower_sch(sch, args, target_bits): else: raise ValueError("args must be Tensor, Buffer or Var") sch = sch.normalize() - bounds = te.schedule.InferBound(sch) - stmt = te.schedule.ScheduleOps(sch, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc(args, stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(sch, args) mod = tvm.tir.transform.StorageFlatten(64)(mod) return tvm.tir.transform.NarrowDataType(target_bits)(mod)["main"].body diff --git a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py index e55555305a09..c22f5f82ee10 100644 --- a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py +++ 
b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm import tir, te +from tvm import te from tvm.script import tir as T @@ -31,12 +31,14 @@ def element_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16)) C = T.match_buffer(c, (16, 16)) B = T.alloc_buffer((16, 16)) - for i_0 in range(0, 16): - for j_0 in range(0, 16): - with T.block([16, 16]) as [i, j]: + for i0 in range(0, 16): + for j0 in range(0, 16): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) B[i, j] = A[i, j] + 1.0 - for j_0 in range(0, 16): - with T.block([16, 16]) as [i, j]: + for j0 in range(0, 16): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) C[i, j] = B[i, j] * 2.0 @@ -46,95 +48,112 @@ def transformed_element_func(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [16, 16]) for i_0 in range(0, 16): - with T.block([]): + with T.block(): T.reads([A[i_0, 0:16]]) T.writes([C[i_0, 0:16]]) B = T.alloc_buffer([16, 16]) for j_0 in T.serial(0, 16): - with T.block([16, 16], "") as [i, j]: - T.bind(i, i_0) - T.bind(j, j_0) + with T.block(): + i, j = T.axis.remap("SS", [i_0, j_0]) B[i, j] = A[i, j] + 1.0 for j_0 in T.serial(0, 16): - with T.block([16, 16], "") as [i, j]: - T.bind(i, i_0) - T.bind(j, j_0) + with T.block(): + i, j = T.axis.remap("SS", [i_0, j_0]) C[i, j] = B[i, j] * 2.0 @T.prim_func def original_func() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128]) as [i, j]: - A[i, j] = T.float32(0) - with T.block([32, 32, T.reduce_axis(0, 32)]) as [i, j, k]: - B = T.alloc_buffer((128, 128), "float32") - C = T.alloc_buffer((128, 128), "float32") - D = T.alloc_buffer((128, 128), "float32") - if k == 0: + for i0, j0 in T.grid(128, 128): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) + A[i, j] = T.float32(0) + for i0, j0, k0 in T.grid(32, 32, 32): + with T.block(): + i, j, k = 
T.axis.remap("SSR", [i0, j0, k0]) + B = T.alloc_buffer((128, 128), "float32") + C = T.alloc_buffer((128, 128), "float32") + D = T.alloc_buffer((128, 128), "float32") + if k == 0: + for ii, jj in T.grid(4, 4): + B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] for ii, jj in T.grid(4, 4): - B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] - for ii, jj in T.grid(4, 4): - for kk in range(0, 4): - B[i * 4 + ii, j * 4 + jj] += C[i * 4 + ii, k * 4 + kk] - for kk in range(0, 4): - B[i * 4 + ii, j * 4 + jj] += D[j * 4 + jj, k * 4 + kk] * C[i * 4 + ii, k * 4 + kk] + for kk in range(0, 4): + B[i * 4 + ii, j * 4 + jj] += C[i * 4 + ii, k * 4 + kk] + for kk in range(0, 4): + B[i * 4 + ii, j * 4 + jj] += ( + D[j * 4 + jj, k * 4 + kk] * C[i * 4 + ii, k * 4 + kk] + ) @T.prim_func def transformed_func() -> None: A = T.alloc_buffer([128, 128]) - with T.block([128, 128], "") as [i, j]: - A[i, j] = T.float32(0) - with T.block([32, 32, T.reduce_axis(0, 32)], "") as [i, j, k]: - B = T.alloc_buffer([128, 128]) - if k == 0: + for i0, j0 in T.grid(128, 128): + with T.block(): + i, j = T.axis.remap("SS", [i0, j0]) + A[i, j] = T.float32(0) + for i0, j0, k0 in T.grid(32, 32, 32): + with T.block(): + i, j, k = T.axis.remap("SSR", [i0, j0, k0]) + B = T.alloc_buffer([128, 128]) + if k == 0: + for ii, jj in T.grid(4, 4): + B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] for ii, jj in T.grid(4, 4): - B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] - for ii, jj in T.grid(4, 4): - with T.block([], ""): - T.reads([B[((i * 4) + ii), ((j * 4) + jj)]]) - T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) - C = T.alloc_buffer([128, 128]) - for kk in T.serial(0, 4): - B[((i * 4) + ii), ((j * 4) + jj)] = ( - B[((i * 4) + ii), ((j * 4) + jj)] + C[((i * 4) + ii), ((k * 4) + kk)] - ) - for kk in T.serial(0, 4): - with T.block([], ""): - T.reads( - [ - B[((i * 4) + ii), ((j * 4) + jj)], - C[((i * 4) + ii), ((k * 4) + kk)], - ] - ) - T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) - D = 
T.alloc_buffer([128, 128]) - B[((i * 4) + ii), ((j * 4) + jj)] = B[((i * 4) + ii), ((j * 4) + jj)] + ( - D[((j * 4) + jj), ((k * 4) + kk)] * C[((i * 4) + ii), ((k * 4) + kk)] + with T.block(""): + T.reads([B[((i * 4) + ii), ((j * 4) + jj)]]) + T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) + C = T.alloc_buffer([128, 128]) + for kk in T.serial(0, 4): + B[((i * 4) + ii), ((j * 4) + jj)] = ( + B[((i * 4) + ii), ((j * 4) + jj)] + C[((i * 4) + ii), ((k * 4) + kk)] ) + for kk in T.serial(0, 4): + with T.block(""): + T.reads( + [ + B[((i * 4) + ii), ((j * 4) + jj)], + C[((i * 4) + ii), ((k * 4) + kk)], + ] + ) + T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) + D = T.alloc_buffer([128, 128]) + B[((i * 4) + ii), ((j * 4) + jj)] = B[ + ((i * 4) + ii), ((j * 4) + jj) + ] + ( + D[((j * 4) + jj), ((k * 4) + kk)] + * C[((i * 4) + ii), ((k * 4) + kk)] + ) @T.prim_func def match_buffer_func() -> None: C = T.alloc_buffer((128, 128)) - with T.block([128]) as [vi]: - C0 = T.match_buffer(C[vi, 0:128], (128)) - with T.block([128]) as [jj]: - C1 = T.match_buffer(C0[jj], ()) - C1[()] = 0 + for i in range(128): + with T.block(): + vi = T.axis.S(128, i) + C0 = T.match_buffer(C[vi, 0:128], (128)) + for j in range(128): + with T.block(): + jj = T.axis.S(128, j) + C1 = T.match_buffer(C0[jj], ()) + C1[()] = 0 @T.prim_func def transformed_match_buffer_func() -> None: for i in range(0, 128): - with T.block([128]) as [vi]: - T.bind(vi, i) + with T.block(): + vi = T.axis.S(128, i) C = T.alloc_buffer((128, 128)) C0 = T.match_buffer(C[vi, 0:128], (128)) - with T.block([128]) as [jj]: - C1 = T.match_buffer(C0[jj], ()) - C1[()] = 0 + for j in range(128): + with T.block(): + jj = T.axis.S(128, j) + C1 = T.match_buffer(C0[jj], ()) + C1[()] = 0 @T.prim_func @@ -143,9 +162,10 @@ def opaque_access(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [1024]) A_cache = T.alloc_buffer([1024]) for i in T.serial(0, 8): - with T.block([8]) as [vi]: - with T.block([8]) as [v]: - T.bind(v, vi) + with T.block(): 
+ vi = T.axis.S(8, i) + with T.block(): + v = T.axis.S(8, vi) T.reads([A[(v * 128) : ((v * 128) + 128)]]) T.writes([A_cache[(v * 128) : ((v * 128) + 128)]]) T.evaluate( @@ -161,8 +181,8 @@ def opaque_access(a: T.handle, b: T.handle) -> None: ) ) for j in T.serial(0, 128): - with T.block([1024]) as [v]: - T.bind(v, ((vi * 128) + j)) + with T.block(): + v = T.axis.S(1024, vi * 128 + j) T.reads([A_cache[v]]) T.writes([B[v]]) B[v] = A_cache[v] @@ -173,12 +193,13 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [1024]) B = T.match_buffer(b, [1024]) for i in T.serial(0, 8): - with T.block([8]) as [vi]: + with T.block(): + vi = T.axis.S(8, i) T.reads(A[vi * 128 : vi * 128 + 128]) T.writes(B[vi * 128 : vi * 128 + 128]) A_cache = T.alloc_buffer([1024]) - with T.block([8]) as [v]: - T.bind(v, vi) + with T.block(): + v = T.axis.S(8, vi) T.reads([A[v * 128 : v * 128 + 128]]) T.writes([A_cache[v * 128 : v * 128 + 128]]) T.evaluate( @@ -187,8 +208,8 @@ def transformed_opaque_access(a: T.handle, b: T.handle) -> None: ) ) for j in T.serial(0, 128): - with T.block([1024]) as [v]: - T.bind(v, ((vi * 128) + j)) + with T.block(): + v = T.axis.S(1024, vi * 128 + j) T.reads([A_cache[v]]) T.writes([B[v]]) B[v] = A_cache[v] diff --git a/tests/python/unittest/test_tir_transform_storage_flatten.py b/tests/python/unittest/test_tir_transform_storage_flatten.py index 37223493a8b5..a51e926155d3 100644 --- a/tests/python/unittest/test_tir_transform_storage_flatten.py +++ b/tests/python/unittest/test_tir_transform_storage_flatten.py @@ -16,6 +16,7 @@ # under the License. 
import tvm from tvm import te +from tvm.driver.build_module import schedule_to_module from tvm.script import tir as T from tvm.relay import GlobalVar @@ -30,14 +31,10 @@ def test_flatten2(): s = te.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], 8) s[A1].compute_at(s[A2], xo) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="A") A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name="A2") - func = tvm.te.schedule.SchedulePostProcToPrimFunc([Ab, A2b], stmt, {A: Ab, A2: A2b}) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [Ab, A2b], binds={A: Ab, A2: A2b}) mod = tvm.tir.transform.StorageFlatten(64)(mod) @@ -70,12 +67,8 @@ def test_flatten_storage_align(): s = te.create_schedule(A2.op) s[A1].storage_align(A1.op.axis[0], 2, 1) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, A2]) mod = tvm.transform.Sequential( [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()] )(mod) diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py index 9e738b136b17..5a91788283d6 100644 --- a/tests/python/unittest/test_tir_transform_storage_rewrite.py +++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py @@ -16,6 +16,7 @@ # under the License. 
import tvm from tvm import te +from tvm.driver.build_module import schedule_to_module def test_storage_share(): @@ -28,12 +29,7 @@ def test_storage_share(): B = te.compute((m, l), lambda i, j: B[i, j] + (t + 1), name="A%d" % t) s = te.create_schedule(B.op) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) @@ -169,12 +165,7 @@ def test_inplace_rule(): AA = te.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name="AA") B = te.compute((m,), lambda i: AA[i] + 1, name="B") s = te.create_schedule(B.op) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) @@ -206,11 +197,8 @@ def test_storage_combine(): s = te.create_schedule(B.op) for S in stages[:-1]: s[S].set_scope("global:tag") - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + + mod = schedule_to_module(s, [A, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) @@ -238,10 +226,7 @@ def test_storage_combine_with_vectorization(): BB = s.cache_read(B, "global:tag", readers=[C]) CC = s.cache_write(C, "global:tag") s[CC].vectorize(s[CC].op.axis[0]) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = 
tvm.te.schedule.SchedulePostProcToPrimFunc([A, B, C], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B, C]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.VectorizeLoop()(mod) mod = tvm.tir.transform.StorageRewrite()(mod) @@ -285,11 +270,7 @@ def test_storage_share_gpu(): s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx) s[A[2 * t + 1]].set_scope("shared") - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A[0], A[-1]], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A[0], A[-1]]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) mod = tvm.tir.transform.StorageRewrite()(mod) @@ -418,12 +399,7 @@ def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024): A0L = s.cache_read(A0, scope_tb, [A2]) A1L = s.cache_read(A1, scope_tb, [A2]) A2L = s.cache_read(A2, scope_tb, [B]) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B, C, D], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [A, B, C, D]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) @@ -511,12 +487,7 @@ def test_inplace_rule3(): s[B10].compute_inline() s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([B0, B1, B2, B3, B4, B5, B], stmt, None) - mod = tvm.IRModule.from_expr(func) + mod = schedule_to_module(s, [B0, B1, B2, B3, B4, B5, B]) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.Simplify()(mod) diff --git 
a/tests/python/unittest/test_tvm_testing_features.py b/tests/python/unittest/test_tvm_testing_features.py index cbcdc4356250..c00fc02c4331 100644 --- a/tests/python/unittest/test_tvm_testing_features.py +++ b/tests/python/unittest/test_tvm_testing_features.py @@ -46,8 +46,11 @@ def test_device_parametrization(self, dev): self.devices_used.append(dev) def test_all_targets_used(self): - assert self.targets_used == self.enabled_targets - assert self.devices_used == self.enabled_devices + assert sorted(self.targets_used) == sorted(self.enabled_targets) + + def test_all_devices_used(self): + sort_key = lambda dev: (dev.device_type, dev.device_id) + assert sorted(self.devices_used, key=sort_key) == sorted(self.enabled_devices, key=sort_key) targets_with_explicit_list = [] @@ -70,9 +73,9 @@ def test_exclude_target(self, target): self.targets_with_exclusion.append(target) def test_all_nonexcluded_targets_ran(self): - assert self.targets_with_exclusion == [ - target for target in self.enabled_targets if not target.startswith("llvm") - ] + assert sorted(self.targets_with_exclusion) == sorted( + [target for target in self.enabled_targets if not target.startswith("llvm")] + ) run_targets_with_known_failure = [] @@ -85,7 +88,7 @@ def test_known_failing_target(self, target): assert "llvm" not in target def test_all_targets_ran(self): - assert self.run_targets_with_known_failure == self.enabled_targets + assert sorted(self.run_targets_with_known_failure) == sorted(self.enabled_targets) @tvm.testing.known_failing_targets("llvm") @tvm.testing.parametrize_targets("llvm") diff --git a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py index 7c521db21bb8..105b4a2d6a3f 100644 --- a/tests/python/unittest/test_tvmscript_complete.py +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -26,10 +26,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with 
T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = T.float32(0) - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = T.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -39,12 +41,14 @@ def matmul_original(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128]) for i, j in T.grid(32, 32): - with T.block([32, 32], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) for ii, jj in T.grid(4, 4): C[vi * 4 + ii, vj * 4 + jj] = T.float32(0) for k in range(0, 32): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) for ii, jj, kk in T.grid(4, 4, 4): C[vi * 4 + ii, vj * 4 + jj] = ( C[vi * 4 + ii, vj * 4 + jj] @@ -58,12 +62,15 @@ def elementwise_with_root(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([]) as []: - with T.block([128, 128]) as [vi, vj]: - B[vi, vj] = A[vi, vj] + T.float32(1) - - with T.block([128, 128]) as [vi, vj]: - C[vi, vj] = B[vi, vj] + T.float32(1) + with T.block() as []: + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] + T.float32(1) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + T.float32(1) def func_with_opaque_block(a: T.handle, b: T.handle, c: T.handle) -> None: @@ -71,12 +78,13 @@ def func_with_opaque_block(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([]) as []: - with T.block([]) as []: + with T.block() as []: + with T.block() as []: B[0, 0] = A[0, 0] + T.float32(1) - - with T.block([128, 
128]) as [vi, vj]: - C[vi, vj] = B[vi, vj] + T.float32(1) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + T.float32(1) @T.prim_func @@ -85,14 +93,18 @@ def func_with_part_access_region(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([]) as []: - with T.block([128, 128]) as [vi, vj]: - T.reads(A[vi, vj]) - B[vi, vj] = A[vi, vj] + T.float32(1) + with T.block() as []: + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[vi, vj]) + B[vi, vj] = A[vi, vj] + T.float32(1) - with T.block([128, 128]) as [vi, vj]: - T.writes(C[vi, vj]) - C[vi, vj] = B[vi, vj] + T.float32(1) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.writes(C[vi, vj]) + C[vi, vj] = B[vi, vj] + T.float32(1) def test_complete_matmul(): @@ -181,22 +193,23 @@ def func_with_bufferslice_indices(data: T.handle, index: T.handle) -> None: index_buf = T.match_buffer(index, (1,), "int32") out_buf = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - out_buf[vi, vj] = data_buf[vi, index_buf[0]] + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + out_buf[vi, vj] = data_buf[vi, index_buf[0]] @T.prim_func def expected_bufferslice_indices(data: T.handle, index: T.handle) -> None: index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) for i0, i1 in T.grid(16, 16): - with T.block([16, 16], "") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, i1) + with T.block(): + vi, vj = T.axis.remap("SS", [i0, i1]) T.reads([data_buf[vi, 0:16], index_buf[0]]) 
T.writes([out_buf[vi, vj]]) out_buf[vi, vj] = data_buf[vi, index_buf[0]] @@ -208,22 +221,23 @@ def func_with_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> index_buf = T.match_buffer(index, (1,), "int32") out_buf = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - out_buf[vi, vj] = data_buf[index_buf[index_buf[0]], index_buf[0]] + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + out_buf[vi, vj] = data_buf[index_buf[index_buf[0]], index_buf[0]] @T.prim_func def expected_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> None: index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) - with T.block([], "root"): + with T.block("root"): T.reads([]) T.writes([]) out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) for i0, i1 in T.grid(16, 16): - with T.block([16, 16], "") as [vi, vj]: - T.bind(vi, i0) - T.bind(vj, i1) + with T.block(): + vi, vj = T.axis.remap("SS", [i0, i1]) T.reads([data_buf[0:16, 0:16], index_buf[0]]) T.writes([out_buf[vi, vj]]) out_buf[vi, vj] = data_buf[index_buf[index_buf[0]], index_buf[0]] @@ -240,11 +254,11 @@ def test_complete_buffer_indices(): def match_buffer_func(a: T.handle) -> None: A = T.match_buffer(a, (16, 16)) for i in range(0, 16): - with T.block([]): + with T.block(): A0 = T.match_buffer(A[i, 0:16], (16)) - with T.block([]): + with T.block(): for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: A1 = T.match_buffer(A0[j], ()) A1[()] = 1.0 @@ -253,15 +267,15 @@ def match_buffer_func(a: T.handle) -> None: def expected_match_buffer_func(a: T.handle) -> None: A = T.match_buffer(a, (16, 16)) for i in range(0, 16): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i, 0:16]) A0 = T.match_buffer(A[i, 0:16], (16)) - with T.block([]): + with T.block(): T.reads([]) 
T.writes(A0[0:16]) for j in range(0, 16): - with T.block([]) as []: + with T.block() as []: T.reads([]) T.writes(A0[j]) A1 = T.match_buffer(A0[j], ()) @@ -272,6 +286,34 @@ def test_complete_match_buffer(): tvm.ir.assert_structural_equal(match_buffer_func, expected_match_buffer_func) +@T.prim_func +def alloc_buffer_func(a: T.handle, b: T.handle) -> None: + A = T.match_buffer(a, [2, 2], dtype="float32") + B = T.match_buffer(b, [2, 2], dtype="float32") + C = T.alloc_buffer([2, 2], dtype="float32") + A[(0, 0)] = T.float32(2) + C[(0, 0)] = A[(0, 0)] + B[(0, 0)] + B[(0, 0)] = C[(0, 0)] + + +@T.prim_func +def expect_alloc_buffer_func(a: T.handle, b: T.handle) -> None: + A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + with T.block("root"): + T.reads([]) + T.writes([]) + C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + A[(0, 0)] = T.float32(2) + C[(0, 0)] = A[(0, 0)] + B[(0, 0)] + B[(0, 0)] = C[(0, 0)] + + +def test_complete_alloc_buffer(): + rt_func = tvm.script.from_source(alloc_buffer_func.script(show_meta=True)) + tvm.ir.assert_structural_equal(alloc_buffer_func, expect_alloc_buffer_func) + + if __name__ == "__main__": test_complete_matmul() test_complete_matmul_original() @@ -279,3 +321,4 @@ def test_complete_match_buffer(): test_complete_part_region() test_complete_buffer_indices() test_complete_match_buffer() + test_complete_alloc_buffer() diff --git a/tests/python/unittest/test_tvmscript_error_report.py b/tests/python/unittest/test_tvmscript_error_report.py index 99a22636b927..80c37229f519 100644 --- a/tests/python/unittest/test_tvmscript_error_report.py +++ b/tests/python/unittest/test_tvmscript_error_report.py @@ -155,33 +155,83 @@ def test_allocate_with_buffers(): check_error(allocate_with_buffers, 2) -def inconsistent_binding() -> None: - with T.block([128, 128]) as [vi]: # error +def 
inconsistent_binding_value() -> None: + for i, j in T.grid(16, 16): + vi, vj = T.axis.remap("SS", [i]) # error + T.evaluate(1.0) + + +def inconsistent_binding_type() -> None: + for i, j in T.grid(16, 16): + vi, vj = T.axis.remap("S", [i, j]) # error T.evaluate(1.0) def test_inconsistent_binding(): - check_error(inconsistent_binding, 2) + check_error(inconsistent_binding_value, 3) + check_error(inconsistent_binding_type, 3) + + +def error_remap_type() -> None: + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("TT", [i, j]) # error + T.evaluate(1.0) + + +def error_remap_value() -> None: + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i + j, j]) # error + T.evaluate(1.0) + + +def test_error_remap_args(): + check_error(error_remap_type, 4) + check_error(error_remap_value, 4) def invalid_block_axes(a: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") - with T.block([A]) as [vi]: # error - T.evaluate(1.0) + for i, j in T.grid(16, 16): + with T.block(): + vi = T.axis.S(i, A) # error + T.evaluate(1.0) def test_invalid_block_axes(): - check_error(invalid_block_axes, 3) + check_error(invalid_block_axes, 5) -def miss_block_bind() -> None: - with T.block([16, 16]) as [vi, vj]: # error - T.bind(vi, 1) - T.evaluate(1.0) +def duplicate_block_axes() -> None: + for i, j in T.grid(16, 16): + with T.block(): + vi = T.axis.S(16, i) + vi = T.axis.S(16, j) # error + T.evaluate(1.0) + + +def duplicate_block_axes_remap() -> None: + for i, j in T.grid(16, 16): + with T.block(): + vi, vi = T.axis.remap("SS", [i, j]) # error + T.evaluate(1.0) + + +def test_duplicate_block_axes(): + check_error(duplicate_block_axes, 5) + check_error(duplicate_block_axes_remap, 4) + + +def miss_block_bind_value() -> None: + for i, j in T.grid(128, 128): + with T.block(): + vi = T.axis.S(i) # error + T.evaluate(1.0) def test_miss_block_bind(): - check_error(miss_block_bind, 2) + check_error(miss_block_bind_value, 4) def invalid_loop_var() -> None: 
@@ -203,74 +253,99 @@ def test_inconsistent_grid(): def invalid_match_buffer_region() -> None: - with T.block([16, 16]) as [vi, vj]: - A = T.match_buffer(vi) # error - T.evaluate(1.0) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + A = T.match_buffer(vi) # error + T.evaluate(1.0) def test_invalid_match_buffer_region(): - check_error(invalid_match_buffer_region, 3) + check_error(invalid_match_buffer_region, 5) def duplicate_buffer() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - A = T.alloc_buffer((128, 128), "float32") # error - T.evaluate(1.0) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + A = T.alloc_buffer((128, 128), "float32") # error + T.evaluate(1.0) def test_duplicate_buffer(): - check_error(duplicate_buffer, 4) + check_error(duplicate_buffer, 6) def duplicate_reads() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - T.reads(A[0:8, 0:8]) - T.reads(A[0:16, 0:16]) # error - T.evaluate(1.0) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(A[0:8, 0:8]) + T.reads(A[0:16, 0:16]) # error + T.evaluate(1.0) def duplicate_writes() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - T.writes(A[0:8, 0:8]) - T.writes(A[0:16, 0:16]) # error - T.evaluate(1.0) + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.writes(A[0:8, 0:8]) + T.writes(A[0:16, 0:16]) # error + T.evaluate(1.0) def duplicate_predicate() -> None: - with T.block([16, 16]) as [vi, vj]: - T.where(1) - T.where(0) # error + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.where(1) + T.where(0) # error def duplicate_annotations() -> None: - with T.block([16, 16]) as [vi, vj]: - T.block_attr({}) - T.block_attr({}) # error + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = 
T.axis.remap("SS", [i, j]) + T.block_attr({}) + T.block_attr({}) # error def duplicate_init() -> None: - with T.block([16, 16]) as [vi, vj]: - with T.init(): - T.evaluate(1.0) - with T.init(): # error + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + with T.init(): + T.evaluate(1.0) + with T.init(): # error + T.evaluate(1.0) + + +def duplicate_axes() -> None: + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + vi = T.axis.S(i, 16) # error T.evaluate(1.0) def test_duplicate_block_signature(): - check_error(duplicate_reads, 5) - check_error(duplicate_writes, 5) - check_error(duplicate_predicate, 4) - check_error(duplicate_annotations, 4) - check_error(duplicate_init, 5) + check_error(duplicate_reads, 7) + check_error(duplicate_writes, 7) + check_error(duplicate_predicate, 6) + check_error(duplicate_annotations, 6) + check_error(duplicate_init, 7) + check_error(duplicate_axes, 5) def opaque_access_during_complete(a: T.handle) -> None: # error A = T.match_buffer(a, (16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - T.evaluate(T.load("float32", A.data, vi * 16 + vj)) + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + T.evaluate(T.load("float32", A.data, vi * 16 + vj)) def test_opaque_access_during_complete(): @@ -279,55 +354,65 @@ def test_opaque_access_during_complete(): def convert_slice_to_bufferload() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - A[vi, vj] = A[vi : vi + 2, vj] + 1 # error + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + A[vi, vj] = A[vi : vi + 2, vj] + 1 # error def test_convert_slice_to_bufferload(): - check_error(convert_slice_to_bufferload, 4) + check_error(convert_slice_to_bufferload, 6) def error_index_type() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - A[vi, vj] = A[vi, 0.0] + 1 # error + for i, 
j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + A[vi, vj] = A[vi, 0.0] + 1 # error def error_bufferslice_index_type() -> None: A = T.alloc_buffer((1,), "float32") B = T.alloc_buffer((16, 16), "float32") C = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - C[vi, vj] = B[vi, A[0]] # error + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, A[0]] # error def test_error_index_type(): - check_error(error_index_type, 4) - check_error(error_bufferslice_index_type, 6) + check_error(error_index_type, 6) + check_error(error_bufferslice_index_type, 8) def error_index_with_stop() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: - A[vi, vj] = A[vi, 1:10] + 1 # error + for i, j in T.grid(128, 128): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + A[vi, vj] = A[vi, 1:10] + 1 # error def error_bufferslice_index_with_stop() -> None: A = T.alloc_buffer((1,), "int32") B = T.alloc_buffer((16, 16), "float32") C = T.alloc_buffer((16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - C[vi, vj] = B[vi, A[0:1]] # error + for i, j in T.grid(16, 16): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, A[0:1]] # error def test_error_index_with_stop_slice(): - check_error(error_index_with_stop, 4) - check_error(error_bufferslice_index_with_stop, 6) + check_error(error_index_with_stop, 6) + check_error(error_bufferslice_index_with_stop, 8) def mismatch_args() -> None: A = T.alloc_buffer((128, 128), "float32") - with T.block([16, 16]) as [vi, vj]: + with T.block(): T.reads(A[0, 0], A[1, 1]) # error T.evaluate(1.0) @@ -338,8 +423,7 @@ def test_mismatch_args(): def special_stmt_except() -> None: A = T.alloc_buffer("(128, 128)", "float32") # error - with T.block([16, 16]) as [vi, vj]: - T.evaluate(1.0) + T.evaluate(1.0) def scope_handler_except() -> None: @@ -368,7 +452,7 @@ def test_tvm_exception_catch(): def 
buffer_shape_mismatch(a: T.handle) -> None: A = T.match_buffer(a, (8, 8)) for i, j in T.grid(8, 2): - with T.block([]): + with T.block(): T.reads([]) T.writes([A[i, j * 4 : j * 4 + 4]]) sub_A = T.match_buffer( @@ -383,7 +467,7 @@ def test_match_buffer_shape_mismatch(): def high_dim_store() -> None: - with T.block([], "root"): + with T.block("root"): B = T.allocate([256], "float32", "global") for i, j in T.grid(16, 16): B[i, j] = 1.0 # error: Store is only allowed with one index @@ -393,6 +477,15 @@ def test_high_dim_store(): check_error(high_dim_store, 5) +def block_has_option_vars() -> None: + with T.block("root") as x: # error: block does not support option_vars + T.evaluate(0.0) + + +def test_block_has_option_vars(): + check_error(block_has_option_vars, 2) + + def check_error(func, rel_lineno): # Override the default renderer to accumulate errors errors = [] @@ -416,5 +509,7 @@ def render(e): ), f"Expected error to be on line {rel_lineno}, but it was on {d.span.line - 1}" +# TODO(Siyuan): block iter errors. 
+ if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_tvmscript_ops.py b/tests/python/unittest/test_tvmscript_ops.py index c55fd7b69282..82f0fa5c86bc 100644 --- a/tests/python/unittest/test_tvmscript_ops.py +++ b/tests/python/unittest/test_tvmscript_ops.py @@ -37,22 +37,25 @@ def get_valid_counts( out_buf = T.match_buffer(out, (1, 2500, 6), "float32") out_indices_buf = T.match_buffer(out_indices, (1, 2500), "int32") - with T.block([1], "init") as [vi]: + with T.block("init"): + vi = T.axis.S(1, 0) valid_count_buf[vi] = T.int32(0) - with T.block([2500], "update") as [vj]: - T.reads([data_buf[vi, vj, 6]]) - T.writes([valid_count_buf[vi], out_indices_buf[vi, vj], out_buf[vi, vj, 6]]) - if (data_buf[vi, vj, score_index] > score_threshold) and ( - (id_index < 0) or (data_buf[vi, vj, id_index] >= T.float32(0)) - ): - for k in T.serial(0, 6): - out_buf[vi, valid_count_buf[vi], k] = data_buf[vi, vj, k] - out_indices_buf[vi, valid_count_buf[vi]] = vj - valid_count_buf[vi] = valid_count_buf[vi] + 1 - if vj >= valid_count_buf[vi]: - for k in T.serial(0, 6): - out_buf[vi, vj, k] = T.float32(-1) - out_indices_buf[vi, vj] = T.int32(-1) + for j in range(2500): + with T.block("update"): + vj = T.axis.S(2500, j) + T.reads([data_buf[vi, vj, 6]]) + T.writes([valid_count_buf[vi], out_indices_buf[vi, vj], out_buf[vi, vj, 6]]) + if (data_buf[vi, vj, score_index] > score_threshold) and ( + (id_index < 0) or (data_buf[vi, vj, id_index] >= T.float32(0)) + ): + for k in T.serial(0, 6): + out_buf[vi, valid_count_buf[vi], k] = data_buf[vi, vj, k] + out_indices_buf[vi, valid_count_buf[vi]] = vj + valid_count_buf[vi] = valid_count_buf[vi] + 1 + if vj >= valid_count_buf[vi]: + for k in T.serial(0, 6): + out_buf[vi, vj, k] = T.float32(-1) + out_indices_buf[vi, vj] = T.int32(-1) def _check_get_valid_counts_with_numpy(f, dshape, score_threshold, id_index, score_index): @@ -101,5 +104,64 @@ def test_get_valid_counts_script_func(): 
_check_get_valid_counts_with_numpy(f, (1, 2500, 6), 0.0, 0, 1) +@T.prim_func +def alloc_zero_dim_buffer(a: T.handle, b: T.handle) -> None: + A = T.match_buffer(a, [], dtype="float32") + B = T.match_buffer(b, [], dtype="float32") + # body + # tir.with block("root") + C = T.alloc_buffer([], dtype="float32") + A[()] = T.float32(2) + C[()] = A[()] + B[()] + B[()] = C[()] + + +@T.prim_func +def alloc_zero_dim_buffer_block(a: T.handle, b: T.handle) -> None: + A = T.match_buffer(a, (), "float32") + B = T.match_buffer(b, (), "float32") + with T.block("root"): + T.reads([]) + T.writes([]) + C = T.alloc_buffer((), "float32") + A[()] = T.float32(2) + C[()] = A[()] + B[()] + B[()] = C[()] + + +def _check_alloc_zero_dim_buffer(f): + dtype = "float32" + ctx = tvm.cpu() + + np_data = np.zeros(shape=()).astype(dtype) + np_out = np.zeros(shape=()).astype(dtype) + tvm_data = tvm.nd.array(np_data, ctx) + tvm_out = tvm.nd.array(np_out, ctx) + + # np func execution + np_inter = np.array(1) + np_data[()] = 2.0 + np_inter[()] = np_data[()] + np_out[()] + np_out[()] = np_inter[()] + + # tvm func execution + f(tvm_data, tvm_out) + tvm.testing.assert_allclose(tvm_out.numpy(), np_out, rtol=1e-5) + + +def test_alloc_zero_dim_buffer_round_trip(): + func = alloc_zero_dim_buffer + func_with_block = alloc_zero_dim_buffer_block + rt_func = tvm.script.from_source(func.script(show_meta=True)) + rt_func_with_block = tvm.script.from_source(func_with_block.script(show_meta=True)) + rt_mod = tvm.build(rt_func, "llvm") + rt_mod_with_block = tvm.build(rt_func_with_block, "llvm") + tvm.ir.assert_structural_equal(func, func_with_block) + tvm.ir.assert_structural_equal(rt_func, rt_func_with_block) + _check_alloc_zero_dim_buffer(rt_mod) + _check_alloc_zero_dim_buffer(rt_mod_with_block) + + if __name__ == "__main__": test_get_valid_counts_script_func() + test_alloc_zero_dim_buffer_round_trip() diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py
index 8058b96b024d..7c54cdc85f82 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -2672,10 +2672,12 @@ def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: - with T.init(): - C[vi, vj] = T.float32(0) - C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] + for i, j, k in T.grid(128, 128, 128): + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, vj] = T.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @T.prim_func @@ -2685,11 +2687,13 @@ def matmul_original(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): - with T.block([128, 128], "init") as [vi, vj]: + with T.block("init"): + vi, vj = T.axis.remap("SS", [i, j]) C[vi, vj] = T.float32(0) for k in range(128): - with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: + with T.block("update"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk] @@ -2699,11 +2703,14 @@ def element_wise(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (128, 128), "float32") B = T.alloc_buffer((128, 128), "float32") - with T.block([128, 128], "B") as [vi, vj]: - B[vi, vj] = A[vi, vj] * T.float32(2) - - with T.block([128, 128], "C") as [vi, vj]: - C[vi, vj] = B[vi, vj] + T.float32(1) + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * T.float32(2) + for i, j in T.grid(128, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + C[vi, vj] = B[vi, vj] + T.float32(1) @T.prim_func @@ -2712,9 +2719,9 @@ def predicate(b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (16, 16), "float32") for i, jo, ji in T.grid(16, 4, 5): - with T.block([16, 16], "update") as [vi, vj]: 
- T.bind(vi, i) - T.bind(vj, jo * 4 + ji) + with T.block("update"): + vi = T.axis.S(16, i) + vj = T.axis.S(16, jo * 4 + ji) T.where(jo * 4 + ji < 16) C[vi, vj] = B[vi, vj] + T.float32(1) @@ -2807,12 +2814,16 @@ def match_buffer_region(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16, 16, 16), "float32") B = T.match_buffer(b, (1), "float32") - with T.block([16, 4]) as [vi, vj]: - C = T.match_buffer(A[0:16, vi, vj * 4 : vj * 4 + 4], (16, 1, 4)) - with T.block([4]) as [vii]: - D = T.match_buffer(C[vii * 4 : vii * 4 + 4, 0, 0:4], (4, 1, 4)) - for i, j in T.grid(4, 4): - B[0] += D[i, 0, j] + for i, j in T.grid(16, 4): + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) + C = T.match_buffer(A[0:16, vi, vj * 4 : vj * 4 + 4], (16, 1, 4)) + for ii in range(4): + with T.block(): + vii = T.axis.S(4, ii) + D = T.match_buffer(C[vii * 4 : vii * 4 + 4, 0, 0:4], (4, 1, 4)) + for i, j in T.grid(4, 4): + B[0] += D[i, 0, j] def test_match_buffer_region(): @@ -2844,8 +2855,8 @@ def block_elements(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") B = T.match_buffer(b, (1, 1), "float32") - with T.block([1], "update") as [vi]: - T.bind(vi, 0) + with T.block("update"): + vi = T.axis.S(1, 0) T.where(True) T.reads(A[0:16, 0:16]) T.writes(B[0, 0]) @@ -2879,11 +2890,11 @@ def opaque_block(a: T.handle, b: T.handle) -> None: for i in range(16): for j in range(16): - with T.block([]): + with T.block(): T.reads([]) T.writes(A[i, j]) A[i, j] = T.float32(0) - with T.block([]): + with T.block(): T.reads([A[i, 0:16]]) T.writes([B[i, 0:16]]) for j in range(16): @@ -2927,7 +2938,7 @@ def rank0_block(a: T.handle) -> None: B = T.alloc_buffer((), "float32") T.store(B.data, 0, T.load("float32", A.data, 0)) - with T.block([], "update") as []: + with T.block("update") as []: T.reads([A[()]]) T.writes([B[()]]) for i in range(1): @@ -2969,8 +2980,10 @@ def test_minmax(): def abs(a: T.handle) -> None: A = T.match_buffer(a, (128, 128), "float32") - with T.block([128, 
128], "A") as [vi, vj]: - A[vi, vj] = T.abs(A[vi, vj]) + for i, j in T.grid(128, 128): + with T.block("A"): + vi, vj = T.axis.remap("SS", [i, j]) + A[vi, vj] = T.abs(A[vi, vj]) def test_abs(): @@ -3011,15 +3024,13 @@ def test_simplify_bracket(): @T.prim_func def var_with_same_name(a: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") - with T.block([16, 16]) as [vi, vj]: - A[vi, vj] = 0 - with T.block([16, 16]) as [vi, vj]: - A[vi, vj] = 0 for i, j in T.grid(16, 16): - with T.block([16, 16]) as [vi, vj]: + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) A[vi, vj] = 0 for i, j in T.grid(16, 16): - with T.block([16, 16]) as [vi, vj]: + with T.block(): + vi, vj = T.axis.remap("SS", [i, j]) A[vi, vj] = 0 @@ -3029,14 +3040,10 @@ def test_same_name_var(): rt_func = tvm.script.from_source(out_str) tvm.ir.assert_structural_equal(func, rt_func) - assert out_str.count("with T.block([16, 16]) as [vi, vj]") == 4 + assert out_str.count('vi, vj = T.axis.remap("SS", [i, j])') == 2 assert out_str.find("vi_") == -1 assert out_str.find("vj_") == -1 - assert out_str.count("for i0, i1 in T.grid(16, 16)") == 2 - assert out_str.find("i0_") == -1 - assert out_str.find("i1_") == -1 - assert out_str.count("for i, j in T.grid(16, 16)") == 2 assert out_str.find("i_") == -1 assert out_str.find("i_") == -1 @@ -3047,11 +3054,13 @@ def while_loop(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (16,), "float32") B = T.match_buffer(b, (16,), "float32") i = T.alloc_buffer((), "int32", scope="local") - with T.block([16]) as [vi]: - B[vi] = 0 - while i[()] < 10: - for j in range(16): - B[j] += A[j] + for ii in range(16): + with T.block(): + vi = T.axis.S(16, ii) + B[vi] = 0 + while i[()] < 10: + for j in range(16): + B[j] += A[j] def test_while_loop(): diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh index 7138effe395a..dfd2a32165f1 100755 --- a/tests/scripts/task_ci_setup.sh +++ b/tests/scripts/task_ci_setup.sh @@ -30,7 +30,7 @@ set -o pipefail # 
echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.2.1 synr==0.4.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.2.1 synr==0.5.0 # Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in # Jenkinsfile. We expect config.cmake to be present from pack_lib(). diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh index ecc8ba5d17b0..aba4663d5931 100755 --- a/tests/scripts/task_mypy.sh +++ b/tests/scripts/task_mypy.sh @@ -15,6 +15,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +set -e +set -u set -o pipefail echo "Checking MyPy Type defs in the TensorIR schedule package." diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 79b87b75fc70..765c84137730 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -34,19 +34,18 @@ trap cleanup 0 # cleanup old states rm -rf docs/_build +rm -rf docs/_staging mkdir -p docs/_build/html +mkdir -p docs/_staging/html rm -rf docs/gen_modules rm -rf docs/doxygen # prepare auto scheduler tutorials -rm -rf tutorials/auto_scheduler/*.json -rm -rf tutorials/get_started/*.json -cp -f tutorials/auto_scheduler/ci_logs/*.json tutorials/auto_scheduler -cp -f tutorials/auto_scheduler/ci_logs/*.json tutorials/get_started +rm -rf gallery/how_to/tune_with_autoscheduler/*.json +rm -rf gallery/tutorial/*.json +cp -f gallery/how_to/tune_with_autoscheduler/ci_logs/*.json gallery/how_to/tune_with_autoscheduler +cp -f gallery/how_to/tune_with_autoscheduler/ci_logs/*.json gallery/tutorial -# remove stale tutorials and always build from scratch. -rm -rf docs/tutorials -rm -rf docs/vta/tutorials # cleanup stale log files find . -type f -path "*.log" | xargs rm -f @@ -84,11 +83,11 @@ cd ..
rm -rf _docs mv docs/_build/html _docs rm -f _docs/.buildinfo -mkdir -p _docs/api -mv docs/doxygen/html _docs/api/doxygen -mv jvm/core/target/site/apidocs _docs/api/javadoc +mkdir -p _docs/reference/api +mv docs/doxygen/html _docs/reference/api/doxygen +mv jvm/core/target/site/apidocs _docs/reference/api/javadoc # mv rust/target/doc _docs/api/rust -mv web/dist/docs _docs/api/typedoc +mv web/dist/docs _docs/reference/api/typedoc echo "Start creating the docs tarball.." # make the tarball diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh index 31fc663082f1..b5cac539062c 100755 --- a/tests/scripts/task_sphinx_precheck.sh +++ b/tests/scripts/task_sphinx_precheck.sh @@ -28,8 +28,8 @@ cleanup() trap cleanup 0 # cleanup cache -rm -rf docs/tutorials -rm -rf docs/vta/tutorials +rm -rf docs/_staging +rm -rf docs/_build find . -type f -path "*.pyc" | xargs rm -f make cython3 diff --git a/tutorials/README.txt b/tutorials/README.txt deleted file mode 100644 index 8b137891791f..000000000000 --- a/tutorials/README.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tutorials/auto_scheduler/README.txt b/tutorials/auto_scheduler/README.txt deleted file mode 100644 index 75986679f0bd..000000000000 --- a/tutorials/auto_scheduler/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -AutoScheduler : Template-free Auto Scheduling ---------------------------------------------- diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt deleted file mode 100644 index a1d33ba088cc..000000000000 --- a/tutorials/autotvm/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_tutorials-autotvm-sec: - -AutoTVM : Template-based Auto Tuning ------------------------------------- diff --git a/tutorials/dev/README.txt b/tutorials/dev/README.txt deleted file mode 100644 index a358280640de..000000000000 --- a/tutorials/dev/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -Developer Tutorials -------------------- - diff --git a/tutorials/frontend/README.txt b/tutorials/frontend/README.txt deleted file mode 100644 index 319506d21f8f..000000000000 --- a/tutorials/frontend/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorial-frontend: - -Compile Deep Learning Models ----------------------------- diff --git a/tutorials/get_started/README.txt b/tutorials/get_started/README.txt deleted file mode 100644 index aa6c559c1b38..000000000000 --- a/tutorials/get_started/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Getting Started With TVM ------------------------- diff --git a/tutorials/language/README.txt b/tutorials/language/README.txt deleted file mode 100644 index 6da8e3c57c1f..000000000000 --- a/tutorials/language/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Tensor Expression and Schedules -------------------------------- diff --git a/tutorials/micro/README.txt b/tutorials/micro/README.txt deleted file mode 100644 index 70a5e580ecd1..000000000000 --- a/tutorials/micro/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_tutorial-micro: - -microTVM --------- diff --git a/tutorials/optimize/README.txt b/tutorials/optimize/README.txt deleted file mode 100644 index b051548c5351..000000000000 --- a/tutorials/optimize/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Optimize Tensor Operators -------------------------- diff --git a/tutorials/topi/README.txt b/tutorials/topi/README.txt deleted file mode 100644 index eae0aafafc02..000000000000 --- a/tutorials/topi/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -TOPI: TVM Operator Inventory ----------------------------- diff --git a/vta/tutorials/README.txt b/vta/tutorials/README.txt index 3d3858b111ba..c1ff4ca0444d 100644 --- a/vta/tutorials/README.txt +++ b/vta/tutorials/README.txt @@ -1,3 +1,5 @@ +.. _vta-tutorials: + VTA Tutorials ============= This page contains tutorials about VTA and how to use TVM/Relay to target VTA. diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py index 771801851a48..cbd22a752049 100644 --- a/vta/tutorials/frontend/deploy_detection.py +++ b/vta/tutorials/frontend/deploy_detection.py @@ -34,15 +34,15 @@ # # .. code-block:: bash # -# pip3 install "Pillow<7" +# pip3 install "Pillow<7" # # YOLO-V3-tiny Model with Darknet parsing have dependancy with CFFI and CV2 library, # we need to install CFFI and CV2 before executing this script. # -# pip3 install "Pillow<7" +# .. code-block:: bash # -# pip3 install cffi -# pip3 install opencv-python +# pip3 install cffi +# pip3 install opencv-python # # Now return to the python code. Import packages.