From bc51e74021c2698bf1e8bedac9f64cebb0ad7a2e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Feb 2025 11:21:50 -0500
Subject: [PATCH 1/5] [REFACTOR] Phase out te.schedule python components

This PR phases out te.schedule python components.
te.compute is kept around for future use.
tir.Schedule is the more modern scheduling API that we can use going forward.

Doing so also helps us clean up the test cases that rely on
explicit full build and execution, as we move future unit tests
towards structural-equality-based tests.
---
 apps/android_rpc/tests/android_rpc_test.py    |   59 +-
 apps/ios_rpc/tests/ios_rpc_test.py            |   33 +-
 docker/Dockerfile.ci_wasm                     |    4 +-
 .../tutorials/cross_compilation_and_rpc.py    |   16 +-
 golang/sample/deploy.py                       |   56 -
 jvm/README.md                                 |   29 -
 jvm/core/src/test/scripts/test_add_cpu.py     |   43 -
 jvm/core/src/test/scripts/test_add_gpu.py     |   58 -
 python/tvm/contrib/peak.py                    |  394 ------
 python/tvm/contrib/sparse.py                  |  204 ---
 python/tvm/contrib/tedd.py                    |  798 -----------
 python/tvm/driver/build_module.py             |  248 +---
 python/tvm/exec/measure_peak.py               |   52 -
 .../tvm/relax/frontend/torch/fx_translator.py |    1 -
 python/tvm/relax/vm_build.py                  |   21 +-
 python/tvm/te/__init__.py                     |   14 +-
 python/tvm/te/autodiff.py                     |   67 -
 python/tvm/te/hybrid/__init__.py              |  101 --
 python/tvm/te/hybrid/calls.py                 |  183 ---
 python/tvm/te/hybrid/module.py                |  113 --
 python/tvm/te/hybrid/parser.py                |  658 ---------
 python/tvm/te/hybrid/preprocessor.py          |  120 --
 python/tvm/te/hybrid/runtime.py               |  175 ---
 python/tvm/te/hybrid/utils.py                 |  103 --
 python/tvm/te/operation.py                    |    3 +
 python/tvm/te/schedule.py                     |  665 ---------
 python/tvm/te/tensor.py                       |   10 -
 python/tvm/te/tensor_intrin.py                |  146 --
 python/tvm/testing/utils.py                   |    3 +-
 python/tvm/tir/buffer.py                      |   23 -
 python/tvm/topi/__init__.py                   |    4 -
 python/tvm/topi/argwhere.py                   |  197 ---
 python/tvm/topi/nn/conv2d.py                  |   85 --
 python/tvm/topi/random/__init__.py            |   22 -
 python/tvm/topi/random/kernel.py              |  657 ---------
 python/tvm/topi/sparse_fill_empty_rows.py     |  109 --
 python/tvm/topi/transform.py                  |   30 -
 python/tvm/topi/unique.py                     |  234 ----
 python/tvm/topi/vision/__init__.py            |   25 -
 python/tvm/topi/vision/nms.py                 | 1183 -----------------
 python/tvm/topi/vision/nms_util.py            |  338 -----
 python/tvm/topi/vision/rcnn/__init__.py       |   22 -
 python/tvm/topi/vision/rcnn/proposal.py       |  448 -------
 python/tvm/topi/vision/rcnn/roi_align.py      |  228 ----
 python/tvm/topi/vision/rcnn/roi_pool.py       |   95 --
 python/tvm/topi/vision/reorg.py               |   42 -
 python/tvm/topi/vision/ssd/__init__.py        |   22 -
 python/tvm/topi/vision/ssd/multibox.py        |  369 -----
 python/tvm/utils/__init__.py                  |   19 -
 python/tvm/utils/roofline/__init__.py         |  279 ----
 python/tvm/utils/roofline/cuda.py             |  407 ------
 python/tvm/utils/roofline/registry.py         |  111 --
 python/tvm/utils/roofline/x86.py              |  331 -----
 .../test_minimal_target_codegen_llvm.py       |   22 +-
 .../test_runtime_ndarray.py                   |   22 -
 .../codegen/test_target_codegen_aarch64.py    |   63 +-
 .../python/codegen/test_target_codegen_arm.py |   19 +-
 .../codegen/test_target_codegen_bool.py       |   25 +-
 .../codegen/test_target_codegen_c_host.py     |  122 +-
 .../codegen/test_target_codegen_cross_llvm.py |   13 +-
 .../codegen/test_target_codegen_cuda.py       |  539 ++------
 .../codegen/test_target_codegen_device.py     |   49 +-
 .../codegen/test_target_codegen_extern.py     |   25 +-
 .../codegen/test_target_codegen_hexagon.py    |   58 +-
 .../codegen/test_target_codegen_llvm.py       |  437 +++---
 .../codegen/test_target_codegen_opencl.py     |    9 +-
 .../codegen/test_target_codegen_rocm.py       |   75 +-
 .../codegen/test_target_codegen_vulkan.py     |  247 +---
 .../python/codegen/test_target_codegen_x86.py |    6 +-
 tests/python/contrib/test_cblas.py            |   16 +-
 tests/python/contrib/test_dlpack.py           |    3 +-
 tests/python/contrib/test_gemm_acc16.py       |  105 --
 tests/python/contrib/test_gemm_acc32_vnni.py  |  115 --
 .../conv2d/test_conv2d_blocked.py             |  207 ---
 .../test_hexagon/conv2d/test_conv2d_conv2d.py |  252 ----
 .../test_hexagon/test_2d_physical_buffers.py  |   41 +-
 .../contrib/test_hexagon/test_launcher.py     |   12 +-
 .../test_hexagon/test_maxpool2d_blocked.py    |  158 ---
 tests/python/contrib/test_hipblas.py          |    6 +-
 tests/python/contrib/test_miopen.py           |  136 --
 tests/python/contrib/test_mps.py              |   26 +-
 tests/python/contrib/test_random.py           |    9 +-
 tests/python/contrib/test_rocblas.py          |    6 +-
 tests/python/contrib/test_sort.py             |   41 +-
 tests/python/contrib/test_sparse.py           |  123 --
 tests/python/relax/test_frontend_from_fx.py   |    1 -
 tests/python/runtime/test_runtime_dlpack.py   |    4 +-
 tests/python/runtime/test_runtime_measure.py  |    3 +-
 .../runtime/test_runtime_module_export.py     |  213 +--
 .../runtime/test_runtime_module_load.py       |   16 +-
 .../runtime/test_runtime_module_property.py   |   12 +-
 tests/python/runtime/test_runtime_rpc.py      |    3 +-
 tests/python/runtime/test_runtime_trace.py    |   21 +-
 tests/python/target/test_target_target.py     |    2 +-
 tests/python/te/test_te_autodiff.py           |  351 -----
 tests/python/te/test_te_build_lower.py        |   65 -
 tests/python/te/test_te_group.py              |   90 --
 tests/python/te/test_te_hybrid_script.py      |  872 ------------
 tests/python/te/test_te_schedule.py           |  382 ------
 .../te/test_te_schedule_bound_inference.py    |  512 -------
 ...test_te_schedule_bound_inference_tiling.py |   62 -
 tests/python/te/test_te_schedule_graph.py     |  142 --
 tests/python/te/test_te_schedule_lstm.py      |   91 --
 tests/python/te/test_te_schedule_ops.py       |  695 ----------
 ...hedule_postproc_rewrite_for_tensor_core.py |  231 ----
 .../python/te/test_te_schedule_tensor_core.py |  461 -------
 tests/python/te/test_te_schedule_tensorize.py |  392 ------
 tests/python/te/test_te_tensor.py             |  205 +--
 tests/python/te/test_te_transform_layout.py   |  592 ---------
 .../tir-analysis/test_tir_analysis_usedef.py  |   36 -
 .../test_tir_analysis_verify_gpu_code.py      |  434 ------
 .../test_tir_analysis_verify_memory.py        |  121 --
 tests/python/tir-base/test_lower_build.py     |   17 -
 tests/python/tir-base/test_tir_buffer.py      |   79 --
 tests/python/tir-base/test_tir_intrin.py      |   68 +-
 tests/python/tir-base/test_tir_ir_builder.py  |  565 --------
 ...est_tir_transform_compact_buffer_region.py |   14 -
 ..._tir_transform_convert_blocks_to_opaque.py |    9 -
 .../test_tir_transform_flatten_buffer.py      |   13 -
 .../test_tir_transform_hoist_if.py            |   59 -
 .../test_tir_transform_inject_copy_intrin.py  |  124 --
 ...est_tir_transform_inject_rolling_buffer.py |  177 ---
 ...tir_transform_instrument_bound_checkers.py |  608 ---------
 .../test_tir_transform_loop_partition.py      |  325 -----
 ..._transform_lower_cross_thread_reduction.py |   15 -
 .../test_tir_transform_lower_init_block.py    |    9 -
 .../test_tir_transform_lower_intrin.py        |    4 +-
 .../test_tir_transform_lower_opaque_block.py  |    9 -
 .../test_tir_transform_lower_warp_memory.py   |  356 -----
 .../test_tir_transform_make_packed_api.py     |   26 -
 ...merge_dynamic_shared_memory_allocations.py |  304 -----
 ..._merge_static_shared_memory_allocations.py |  203 ---
 .../test_tir_transform_narrow_datatype.py     |   41 -
 ..._plan_update_buffer_allocation_location.py |   11 -
 .../test_tir_transform_simplify.py            |   64 -
 .../test_tir_transform_split_host_device.py   |   39 -
 .../test_tir_transform_storage_flatten.py     |   63 -
 .../test_tir_transform_storage_rewrite.py     |  320 -----
 .../test_tir_transform_thread_sync.py         |   61 -
 ...test_tir_transform_unify_thread_binding.py |   11 -
 .../test_tir_transform_unroll_loop.py         |   18 -
 .../test_tir_transform_vectorize.py           |   25 +-
 web/tests/python/webgpu_rpc_test.py           |    3 +-
 web/tests/python/websock_rpc_test.py          |   91 --
 144 files changed, 781 insertions(+), 21603 deletions(-)
 delete mode 100644 golang/sample/deploy.py
 delete mode 100644 jvm/core/src/test/scripts/test_add_cpu.py
 delete mode 100644 jvm/core/src/test/scripts/test_add_gpu.py
 delete mode 100644 python/tvm/contrib/peak.py
 delete mode 100644 python/tvm/contrib/sparse.py
 delete mode 100644 python/tvm/contrib/tedd.py
 delete mode 100644 python/tvm/exec/measure_peak.py
 delete mode 100644 python/tvm/te/autodiff.py
 delete mode 100644 python/tvm/te/hybrid/__init__.py
 delete mode 100644 python/tvm/te/hybrid/calls.py
 delete mode 100644 python/tvm/te/hybrid/module.py
 delete mode 100644 python/tvm/te/hybrid/parser.py
 delete mode 100644 python/tvm/te/hybrid/preprocessor.py
 delete mode 100644 python/tvm/te/hybrid/runtime.py
 delete mode 100644 python/tvm/te/hybrid/utils.py
 delete mode 100644 python/tvm/te/schedule.py
 delete mode 100644 python/tvm/te/tensor_intrin.py
 delete mode 100644 python/tvm/topi/argwhere.py
 delete mode 100644 python/tvm/topi/random/__init__.py
 delete mode 100644 python/tvm/topi/random/kernel.py
 delete mode 100644 python/tvm/topi/sparse_fill_empty_rows.py
 delete mode 100644 python/tvm/topi/vision/__init__.py
 delete mode 100644 python/tvm/topi/vision/nms.py
 delete mode 100644 python/tvm/topi/vision/nms_util.py
 delete mode 100644 python/tvm/topi/vision/rcnn/__init__.py
 delete mode 100644 python/tvm/topi/vision/rcnn/proposal.py
 delete mode 100644 python/tvm/topi/vision/rcnn/roi_align.py
 delete mode 100644 python/tvm/topi/vision/rcnn/roi_pool.py
 delete mode 100644 python/tvm/topi/vision/reorg.py
 delete mode 100644 python/tvm/topi/vision/ssd/__init__.py
 delete mode 100644 python/tvm/topi/vision/ssd/multibox.py
 delete mode 100644 python/tvm/utils/__init__.py
 delete mode 100644 python/tvm/utils/roofline/__init__.py
 delete mode 100644 python/tvm/utils/roofline/cuda.py
 delete mode 100644 python/tvm/utils/roofline/registry.py
 delete mode 100644 python/tvm/utils/roofline/x86.py
 delete mode 100644 tests/python/contrib/test_gemm_acc16.py
 delete mode 100644 tests/python/contrib/test_gemm_acc32_vnni.py
 delete mode 100644 tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py
 delete mode 100644 tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py
 delete mode 100644 tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
 delete mode 100644 tests/python/contrib/test_miopen.py
 delete mode 100644 tests/python/contrib/test_sparse.py
 delete mode 100644 tests/python/te/test_te_autodiff.py
 delete mode 100644 tests/python/te/test_te_build_lower.py
 delete mode 100644 tests/python/te/test_te_group.py
 delete mode 100644 tests/python/te/test_te_hybrid_script.py
 delete mode 100644 tests/python/te/test_te_schedule.py
 delete mode 100644 tests/python/te/test_te_schedule_bound_inference.py
 delete mode 100644 tests/python/te/test_te_schedule_bound_inference_tiling.py
 delete mode 100644 tests/python/te/test_te_schedule_graph.py
 delete mode 100644 tests/python/te/test_te_schedule_lstm.py
 delete mode 100644 tests/python/te/test_te_schedule_ops.py
 delete mode 100644 tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py
 delete mode 100644 tests/python/te/test_te_schedule_tensor_core.py
 delete mode 100644 tests/python/te/test_te_schedule_tensorize.py
 delete mode 100644 tests/python/te/test_te_transform_layout.py
 delete mode 100644 tests/python/tir-analysis/test_tir_analysis_usedef.py
 delete mode 100644 tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
 delete mode 100644 tests/python/tir-analysis/test_tir_analysis_verify_memory.py
 delete mode 100644 tests/python/tir-base/test_tir_ir_builder.py
 delete mode 100644 tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py
 delete mode 100644 tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py
 delete mode 100644 tests/python/tir-transform/test_tir_transform_lower_warp_memory.py
 delete mode 100644 tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py
 delete mode 100644 web/tests/python/websock_rpc_test.py

diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index 0027cc4ba352..ba6c0f9c9679 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -56,37 +56,15 @@ def test_rpc_module():
     tracker = rpc.connect_tracker(tracker_host, tracker_port)
     remote = tracker.request(key, priority=0, session_timeout=60)
 
-    # Compile the Graph for CPU target
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].parallel(xi)
-    s[B].pragma(xo, "parallel_launch_point")
-    s[B].pragma(xi, "parallel_barrier_when_finish")
-    f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso_cpu = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso_cpu, fcompile=ndk.create_shared)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd"))
+    sch = tvm.tir.Schedule(mod)
+    (x,) = sch.get_loops(block=sch.get_block("B"))
+    xo, xi = sch.split(x, [None, 32])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
 
-    # Execute the portable graph on cpu target
-    print("Run CPU test ...")
-    dev = remote.cpu(0)
-    remote.upload(path_dso_cpu)
-    f2 = remote.load_module("cpu_lib.so")
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
-    cost = time_f(a, b).mean
-    print("%g secs/op\n" % cost)
-    np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
-    # Compile the Graph for OpenCL target
     if test_opencl:
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=64)
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        # Build the dynamic lib.
-        # If we don't want to do metal and only use cpu, just set target to be target
-        f = tvm.build(s, [A, B], tvm.target.Target("opencl", host=target), name="myadd")
+        f = tvm.build(sch.mod, target=tvm.target.Target("opencl", host=target))
         path_dso_cl = temp.relpath("dev_lib_cl.so")
         f.export_library(path_dso_cl, fcompile=ndk.create_shared)
 
@@ -101,29 +79,6 @@ def test_rpc_module():
         print("%g secs/op\n" % cost)
         np.testing.assert_equal(b.numpy(), a.numpy() + 1)
 
-    # Compile the Graph for Vulkan target
-    if test_vulkan:
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=64)
-        s[B].bind(xi, te.thread_axis("threadIdx.x"))
-        s[B].bind(xo, te.thread_axis("blockIdx.x"))
-        # Build the dynamic lib.
-        # If we don't want to do metal and only use cpu, just set target to be target
-        f = tvm.build(s, [A, B], tvm.target.Target("vulkan", host=target), name="myadd")
-        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
-        f.export_library(path_dso_vulkan, fcompile=ndk.create_shared)
-
-        print("Run GPU(Vulkan Flavor) test ...")
-        dev = remote.vulkan(0)
-        remote.upload(path_dso_vulkan)
-        f1 = remote.load_module("dev_lib_vulkan.so")
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
-        cost = time_f(a, b).mean
-        print("%g secs/op\n" % cost)
-        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py
index f0c31cd7d268..3e807adf484c 100644
--- a/apps/ios_rpc/tests/ios_rpc_test.py
+++ b/apps/ios_rpc/tests/ios_rpc_test.py
@@ -50,25 +50,19 @@ def test_rpc_module(host, port, key, mode):
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
     temp = utils.tempdir()
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd"))
+    sch = tvm.tir.Schedule(mod)
+    (i,) = sch.get_loops(block=sch.get_block("B"))
+    i0, i1 = sch.split(i, [None, 32])
+    sch.bind(i0, "blockIdx.x")
+    sch.bind(i1, "threadIdx.x")
+
     # Build the dynamic lib.
     # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], tvm.target.Target("metal", host=target), name="myadd")
+    f = tvm.build(sch.mod, target=tvm.target.Target("metal", host=target))
     path_dso1 = temp.relpath("dev_lib.dylib")
     f.export_library(path_dso1, fcompile=xcode.create_dylib, arch=arch, sdk=sdk)
 
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].parallel(xi)
-    s[B].pragma(xo, "parallel_launch_point")
-    s[B].pragma(xi, "parallel_barrier_when_finish")
-    f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.dylib")
-    f.export_library(path_dso2, fcompile=xcode.create_dylib, arch=arch, sdk=sdk)
-
     # connect to the proxy
     if mode == "tracker":
         remote = MODES[mode](host, port).request(key)
@@ -84,17 +78,6 @@ def test_rpc_module(host, port, key, mode):
     cost = time_f(a, b).mean
     print("Metal: %g secs/op" % cost)
     np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-    # CPU
-    dev = remote.cpu(0)
-    remote.upload(path_dso2)
-    f2 = remote.load_module("cpu_lib.dylib")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
-    cost = time_f(a, b).mean
-    print("CPU: %g secs/op" % cost)
-    np.testing.assert_equal(b.numpy(), a.numpy() + 1)
 
 
 if __name__ == "__main__":
diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm
index 6860c51d7277..83a9f0e9f0e8 100644
--- a/docker/Dockerfile.ci_wasm
+++ b/docker/Dockerfile.ci_wasm
@@ -64,8 +64,8 @@ RUN bash /install/ubuntu_install_emscripten.sh
 ENV EMSDK=/emsdk
 ENV PATH=${PATH}:${EMSDK}:${EMSDK}/upstream/emscripten
 ENV EMSCRIPTEN=${EMSDK}/upstream/emscripten
-ENV BINARYEN=${EMSDK}/upstream
-ENV LLVM=${EMSDK}/upstream/bin
+ENV EM_BINARYEN_ROOT=${EMSDK}/upstream
+ENV EM_LLVM_ROOT=${EMSDK}/upstream/bin
 
 # sccache
 COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
diff --git a/docs/how_to/tutorials/cross_compilation_and_rpc.py b/docs/how_to/tutorials/cross_compilation_and_rpc.py
index c7e302693de7..81c73fd051ef 100644
--- a/docs/how_to/tutorials/cross_compilation_and_rpc.py
+++ b/docs/how_to/tutorials/cross_compilation_and_rpc.py
@@ -104,7 +104,7 @@
 n = tvm.runtime.convert(1024)
 A = te.placeholder((n,), name="A")
 B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
-s = te.create_schedule(B.op)
+mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "add_one"))
 
 ######################################################################
 # Then we cross compile the kernel.
@@ -119,7 +119,7 @@
 else:
     target = "llvm -mtriple=armv7l-linux-gnueabihf"
 
-func = tvm.build(s, [A, B], target=target, name="add_one")
+func = tvm.build(mod, target=target, name="add_one")
 # save the lib at a local temp folder
 temp = utils.tempdir()
 path = temp.relpath("lib.tar")
@@ -231,11 +231,13 @@ def run_opencl():
     target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu")
 
     # create schedule for the above "add one" compute declaration
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=32)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    func = tvm.build(s, [A, B], target=target)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tvm.tir.Schedule(mod)
+    (x,) = sch.get_loops(block=sch.get_block("B"))
+    xo, xi = sch.split(x, [None, 32])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    func = tvm.build(sch.mod, target=target)
 
     remote = rpc.connect(opencl_device_host, opencl_device_port)
 
diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py
deleted file mode 100644
index fa4cbd433549..000000000000
--- a/golang/sample/deploy.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Get Started with TVM Go
-=======================
-"""
-from __future__ import absolute_import, print_function
-
-import tvm
-from tvm import te
-import numpy as np
-
-# Global declarations of environment.
-
-tgt = "llvm"
-
-######################################################################
-# Describe the Computation
-# ------------------------
-n = te.var("n")
-A = te.placeholder((n,), name="A")
-B = te.placeholder((n,), name="B")
-C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-
-######################################################################
-# Schedule the Computation
-# ------------------------
-s = te.create_schedule(C.op)
-
-######################################################################
-# Compilation
-# -----------
-fadd = tvm.build(s, [A, B, C], tgt, name="myadd")
-
-######################################################################
-# Save Compiled Module
-# --------------------
-from tvm.contrib import cc
-from tvm.contrib import utils
-
-fadd.save("deploy.o")
-cc.create_shared("deploy.so", ["deploy.o"])
diff --git a/jvm/README.md b/jvm/README.md
index c7535f0311b4..62b685010c2e 100644
--- a/jvm/README.md
+++ b/jvm/README.md
@@ -89,35 +89,6 @@ It is your job to verify the types of callback arguments, as well as the type of
 
 You can register the Java function by `Function.register` and use `Function.getFunction` to get the registered function later.
 
-## Use TVM to Generate Shared Library
-
-There's nothing special for this part. The following Python snippet generate add_cpu.so which add two vectors on CPU.
-
-```python
-import os
-import tvm
-from tvm import te
-from tvm.contrib import cc, utils
-
-def test_add(target_dir):
-    n = te.var("n")
-    A = te.placeholder((n,), name='A')
-    B = te.placeholder((n,), name='B')
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    fadd = tvm.build(s, [A, B, C], "llvm", name="myadd")
-
-    fadd.save(os.path.join(target_dir, "add_cpu.o"))
-    cc.create_shared(os.path.join(target_dir, "add_cpu.so"),
-            [os.path.join(target_dir, "add_cpu.o")])
-
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
-```
-
 ## Run the Generated Shared Library
 
 The following code snippet demonstrate how to load generated shared library (add_cpu.so).
diff --git a/jvm/core/src/test/scripts/test_add_cpu.py b/jvm/core/src/test/scripts/test_add_cpu.py
deleted file mode 100644
index 9a93d4e74694..000000000000
--- a/jvm/core/src/test/scripts/test_add_cpu.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import tvm
-from tvm import te
-from tvm.contrib import cc, utils
-
-
-def test_add(target_dir):
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    fadd = tvm.build(s, [A, B, C], "llvm", name="myadd")
-
-    fadd.save(os.path.join(target_dir, "add_cpu.o"))
-    cc.create_shared(
-        os.path.join(target_dir, "add_cpu.so"), [os.path.join(target_dir, "add_cpu.o")]
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
diff --git a/jvm/core/src/test/scripts/test_add_gpu.py b/jvm/core/src/test/scripts/test_add_gpu.py
deleted file mode 100644
index 0eea5671baed..000000000000
--- a/jvm/core/src/test/scripts/test_add_gpu.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import os
-
-import tvm
-from tvm import te
-from tvm.contrib import cc, nvcc, utils
-
-
-@tvm.register_func("tvm_callback_cuda_compile", override=True)
-def tvm_callback_cuda_compile(code, target):
-    ptx = nvcc.compile_cuda(code, target_format="ptx")
-    return ptx
-
-
-def test_add(target_dir):
-    if not tvm.runtime.enabled("cuda"):
-        print("skip %s because cuda is not enabled..." % __file__)
-        return
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-
-    s = te.create_schedule(C.op)
-
-    bx, tx = s[C].split(C.op.axis[0], factor=64)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    fadd_cuda = tvm.build(s, [A, B, C], tvm.target.Target("cuda", host="llvm"), name="myadd")
-
-    fadd_cuda.save(os.path.join(target_dir, "add_cuda.o"))
-    fadd_cuda.imported_modules[0].save(os.path.join(target_dir, "add_cuda.ptx"))
-    cc.create_shared(
-        os.path.join(target_dir, "add_cuda.so"), [os.path.join(target_dir, "add_cuda.o")]
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    if len(sys.argv) != 2:
-        sys.exit(-1)
-    test_add(sys.argv[1])
diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py
deleted file mode 100644
index 78dae846d6ca..000000000000
--- a/python/tvm/contrib/peak.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""measure bandwidth and compute peak"""
-
-import logging
-import tvm
-from tvm import te
-from tvm.target import Target
-from . import utils
-from .. import rpc
-
-
-def _convert_to_remote(func, remote):
-    """convert module function to remote rpc function"""
-    temp = utils.tempdir()
-    path_dso = temp.relpath("tmp_func.tar")
-    func.export_library(path_dso)
-
-    remote.upload(path_dso)
-    func = remote.load_module("tmp_func.tar")
-    return func
-
-
-def measure_bandwidth_sum(
-    total_item,
-    item_per_thread,
-    stride,
-    base_type,
-    bits,
-    lanes,
-    target,
-    target_host,
-    remote,
-    dev,
-    n_times,
-):
-    """measure memory bandwidth of gpu by product reduction for a given type
-
-    The IR for measurement is
-
-    for each thread
-        for i in 1..num_per_thread:
-            y[global_id] = y[global_id] * x[base + i * stride]
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accumulates
-    stride: int
-        stride in memory access
-    base_type: str
-        can be "int", "float"
-    bits: int
-        can be 16, 32
-    lanes: int
-       lane of the vector type, can be 1, 2, 4, 8, 16
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    dev: Device
-        the device of array
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    n_times: int
-        number of runs for taking mean
-
-    Returns
-    -------
-    GBPS: float
-         gigabyte per second
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    n, m = total_item, item_per_thread
-    n //= lanes
-
-    base_type = str(base_type) + str(bits)
-    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)
-
-    k = te.reduce_axis((0, m), name="k")
-
-    x = te.placeholder((n,), dtype=dtype, name="x")
-    op = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="sum")
-    y = te.compute(
-        (n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)
-    )
-    s = te.create_schedule(y.op)
-
-    yo, yi = s[y].split(y.op.axis[0], target.max_num_threads)
-    s[y].bind(yo, te.thread_axis("blockIdx.x"))
-    s[y].bind(yi, te.thread_axis("threadIdx.x"))
-    s[y].unroll(k)
-
-    try:
-        func = tvm.build(s, [x, y], target)
-
-        x = tvm.nd.empty((n,), dtype=dtype, device=dev)
-        y = tvm.nd.empty((n // m,), dtype=dtype, device=dev)
-
-        func = _convert_to_remote(func, remote)
-        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
-        time = time_f(x, y).mean
-    except tvm._ffi.base.TVMError:
-        # build error (occur when device does not support half)
-        return -1
-
-    return 1.0 * (total_item * bits / 8) / 1e9 / time
-
-
-def measure_bandwidth_all_types(
-    total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True
-):
-    """measure memory bandwidth for all types
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accmulates
-    n_times: int
-        number of runs for averaging
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    dev: Device
-        the device of array
-    verbose: bool
-        whether outputs immediate result
-
-    Returns
-    -------
-    result: list
-        a list of (type_name, GBPS) pairs
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    max_threads = target.max_num_threads
-
-    result = []
-    for base_type in ["float"]:
-        for bits in [32]:
-            for lanes in [1, 2, 4, 8, 16]:
-                max_speed = -1e9
-                # try different strides
-                for stride in [max_threads, total_item // (lanes * item_per_thread)]:
-                    speed = measure_bandwidth_sum(
-                        total_item,
-                        item_per_thread,
-                        stride,
-                        base_type,
-                        bits,
-                        lanes,
-                        target,
-                        target_host,
-                        remote,
-                        dev,
-                        n_times,
-                    )
-                    max_speed = max(max_speed, speed)
-                type_name = base_type + str(bits)
-                result.append([f"{type_name}x{lanes}", max_speed])
-                if verbose:
-                    logging.info("\t%-10s %.2f GBPS", result[-1][0], result[-1][1])
-    return result
-
-
-def measure_compute_mad(
-    total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, dev, n_times
-):
-    """measure peak compute speed by computing mad for a type
-
-    The IR for measurement is
-
-    for each thread
-        for i in 1..item_per_thread
-            x = mad(x, x, y)
-            y = mad(y, y, x)
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of operations each thread does
-    base_type: str
-        can be "int", "float"
-    bits: int
-        can be 16, 32
-    lanes: int
-       lane of the vector type, can be 1, 2, 4, 8, 16
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        if it is not None, use remote rpc session
-    dev: Device
-        the device of array
-    n_times: int
-        number of runs for taking mean
-
-    Returns
-    -------
-    GOPS: float
-         giga operation per second
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    n = total_item
-
-    if bits >= 64 or lanes >= 16:
-        n //= 2
-
-    max_threads = target.max_num_threads
-
-    base_type = str(base_type) + str(bits)
-    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)
-
-    def extern(ins, outs):
-        # pylint: disable=unused-argument
-        """construct measurement function by building IR directly"""
-        ib = tvm.tir.ir_builder.create()
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-
-        ib.scope_attr(bx, "thread_extent", n // max_threads)
-        ib.scope_attr(tx, "thread_extent", max_threads)
-
-        idx = bx.var * max_threads + tx.var
-
-        a = ib.allocate(dtype, (1), name="a", scope="local")
-        b = ib.allocate(dtype, (1), name="b", scope="local")
-
-        a[0] = outs[0].vload(idx, dtype)
-        b[0] = outs[0].vload(idx, dtype)
-
-        if base_type.find("float") != -1:
-
-            def mad_func(x, y):
-                return x * x + y
-
-        else:
-
-            def mad_func(x, y):
-                return y * y + x
-
-        for _ in range(item_per_thread // 4 // lanes):
-            a[0] = mad_func(a[0], b[0])
-            b[0] = mad_func(b[0], a[0])
-
-        ib.emit(outs[0].vstore(idx, b[0]))
-        return ib.get()
-
-    y = te.extern((n,), [], extern, name="y", dtype=dtype)
-    s = te.create_schedule(y.op)
-
-    try:
-        func = tvm.build(s, [y], target)
-        func = _convert_to_remote(func, remote)
-        time_f = func.time_evaluator(func.entry_name, dev, number=n_times)
-        y = tvm.nd.empty((n,), dtype=dtype, device=dev)
-        time = time_f(y).mean
-    except tvm._ffi.base.TVMError:
-        # build error (occur when device does not support half)
-        return -1
-
-    return 1.0 * (n * item_per_thread) / 1e9 / time
-
-
-def measure_compute_all_types(
-    total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True
-):
-    """measure peak flops for all types
-
-    Parameters
-    ----------
-    total_item: int
-        number of elements in input array
-    item_per_thread: int
-        number of elements each thread accmulates
-    n_times: int
-        number of runs for averaging
-    target: :any:`tvm.target.Target`
-        the target and option of the compilation.
-    target_host : str or :any:`tvm.target.Target`
-        host compilation target
-    remote: tvm.rpc.RPCSession
-        remote rpc session
-    dev: Device
-        the device of array
-    verbose: bool
-        whether outputs immediate result
-
-    Returns
-    -------
-    result: list
-        a list of (type_name, GFLOPS/GIOPS) pairs
-    """
-    target, target_host = Target.canon_target_and_host(target, target_host)
-
-    result = []
-    for base_type in ["float", "int"]:
-        for bits in [16, 32, 64]:
-            for lanes in [1, 2, 4, 8, 16]:
-                if base_type == "int" and bits != 32:  # only measure int32
-                    continue
-
-                max_speed = -1e9
-                for per_thread in [item_per_thread // 2, item_per_thread, item_per_thread * 2]:
-                    speed = measure_compute_mad(
-                        total_item,
-                        per_thread,
-                        base_type,
-                        bits,
-                        lanes,
-                        target,
-                        target_host,
-                        remote,
-                        dev,
-                        n_times,
-                    )
-                    max_speed = max(max_speed, speed)
-                type_name = base_type + str(bits)
-                result.append([f"{type_name}x{lanes}", max_speed])
-
-                unit = "GFLOPS" if base_type == "float" else "GIOPS"
-
-                if verbose:
-                    logging.info("\t%-10s %.2f %s", result[-1][0], result[-1][1], unit)
-
-    return result
-
-
-def measure_peak_all(target, target_host, host, port):
-    """measure memory bandwidth and peak compute for gpu devices
-
-    Parameters
-    ----------
-    target: str or :any:`tvm.target.Target`
-    target_host: str
-    host: str
-    port: int
-    """
-
-    target, target_host = Target.canon_target_and_host(target, target_host)
-    remote = rpc.connect(host, port)
-    n_times = 20
-
-    bandwidth_total_item = 1 << 25
-    bandwidth_item_per_thread = 32
-
-    compute_total_item = 1 << 21
-    compute_item_per_thread = 4096
-
-    if str(target).startswith("opencl"):
-        dev = remote.cl()
-    elif str(target).startswith("cuda"):
-        dev = remote.cuda()
-    elif str(target).startswith("metal"):
-        dev = remote.metal()
-    else:
-        raise RuntimeError("Unsupported target")
-
-    logging.info("========== measure memory bandwidth ==========")
-    measure_bandwidth_all_types(
-        bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev
-    )
-
-    logging.info("========== measure peak compute ==========")
-    measure_compute_all_types(
-        compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev
-    )
diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py
deleted file mode 100644
index 9f94ff24f906..000000000000
--- a/python/tvm/contrib/sparse.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tensor and Operation class for computation declaration."""
-# pylint: disable=invalid-name
-import warnings
-import numpy as _np
-from tvm.runtime import ndarray as _nd
-from tvm import te
-from tvm.tir import expr as _expr
-from tvm.te import tensor as _tensor
-
-
-float32 = "float32"
-itype = "int32"
-
-
-class CSRNDArray(object):
-    """Sparse tensor object in CSR format."""
-
-    def __init__(self, arg1, device=None, shape=None):
-        """Construct a sparse matrix in CSR format.
-
-        Parameters
-        ----------
-        arg1 : numpy.ndarray or a tuple with (data, indices, indptr)
-            The corresponding a dense numpy array,
-            or a tuple for constructing a sparse matrix directly.
-
-        device: Device
-            The corresponding device.
-
-        shape : tuple of int
-            The shape of the array
-        """
-        if isinstance(arg1, tuple):
-            assert len(arg1) == 3
-            self.data, self.indices, self.indptr = arg1
-            self.shape = shape
-        elif isinstance(arg1, _np.ndarray):
-            source_array = arg1
-            ridx, cidx = _np.nonzero(source_array)
-            data = source_array[ridx, cidx]
-            self.data = _nd.array(data, device)
-            indices = _np.nonzero(source_array)[1].astype(itype)
-            self.indices = _nd.array(indices, device)
-            indptr = [0] + _np.apply_along_axis(
-                _np.count_nonzero, axis=1, arr=source_array
-            ).tolist()
-            indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype)
-            self.indptr = _nd.array(indptr, device)
-            self.shape = source_array.shape
-        else:
-            raise RuntimeError(
-                f"Construct CSRNDArray with either a tuple (data, indices, indptr) "
-                f"or a numpy.array, can't handle type {type(arg1)}."
-            )
-        self.stype = "csr"
-        self.dtype = self.data.dtype
-        assert self.shape is not None
-        assert isinstance(self.data, _nd.NDArray)
-        assert isinstance(self.indices, _nd.NDArray)
-        assert str(self.indices.dtype) == "int32" or str(self.indices.dtype) == "int64", str(
-            self.indices.dtype
-        )
-        assert isinstance(self.indptr, _nd.NDArray)
-        assert str(self.indptr.dtype) == "int32" or str(self.indptr.dtype) == "int64", str(
-            self.indptr.dtype
-        )
-
-    def asnumpy(self):
-        """Construct a full matrix and convert it to numpy array. This API will be deprecated
-        in TVM v0.8 release. Please use `numpy` instead."""
-        warnings.warn(
-            "CSRNDArray.asnumpy() will be deprecated in TVM v0.8 release. "
-            "Please use CSRNDArray.numpy() instead.",
-            DeprecationWarning,
-        )
-        return self.numpy()
-
-    def numpy(self):
-        """Construct a full matrix and convert it to numpy array."""
-        full = _np.zeros(self.shape, self.dtype)
-        ridx = _np.diff(self.indptr.numpy())
-        ridx = _np.hstack([_np.ones((v,), itype) * i for i, v in enumerate(ridx)])
-        full[ridx, self.indices.numpy().astype(itype)] = self.data.numpy()
-        return full
-
-
-def array(source_array, device=None, shape=None, stype="csr"):
-    """Construct a sparse NDArray from numpy.ndarray"""
-    ret = None
-    if stype == "csr":
-        ret = CSRNDArray(source_array, shape=shape, device=device)
-    else:
-        raise NotImplementedError(f"stype={stype} is not supported yet.")
-    return ret
-
-
-class SparsePlaceholderOp(object):
-    """Placeholder class for sparse tensor representations."""
-
-    def __init__(self, shape, nonzeros, dtype, name):
-        # pylint: disable=unused-argument
-        """Contructing a bare bone structure for a sparse matrix
-
-        Parameters
-        ----------
-        shape: Tuple of Expr
-            The shape of the tensor
-
-        nonzeros: int
-            The number of non-zero values
-
-        dtype: str, optional
-            The data type of the tensor
-
-        name: str, optional
-            The name hint of the tensor
-        """
-        self.shape = shape
-        self.dtype = dtype
-        self.name = name
-        self.stype = "unknown"
-
-
-class CSRPlaceholderOp(SparsePlaceholderOp):
-    """Placeholder class for CSR based sparse tensor representation."""
-
-    def __init__(self, shape, nonzeros, dtype, name):
-        """Contructing a bare bone structure for a csr_matrix
-
-        Parameters
-        ----------
-        shape: Tuple of Expr
-            The shape of the tensor
-
-        nonzeros: int
-            The number of non-zero values
-
-        dtype: str, optional
-            The data type of the tensor
-
-        name: str, optional
-            The name hint of the tensor
-        """
-        SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name)
-        self.stype = "csr"
-        self.data = te.placeholder((nonzeros,), dtype=dtype, name=self.name + "_data")
-        self.indices = te.placeholder((nonzeros,), dtype=itype, name=self.name + "_indices")
-        self.indptr = te.placeholder((self.shape[0] + 1,), dtype=itype, name=self.name + "_indptr")
-        assert isinstance(self.data, _tensor.Tensor)
-        assert isinstance(self.indices, _tensor.Tensor)
-        assert isinstance(self.indptr, _tensor.Tensor)
-
-
-def placeholder(shape, nonzeros=None, dtype=None, name="placeholder", stype=None):
-    """Construct an empty sparse tensor object.
-
-    Parameters
-    ----------
-    shape: Tuple of Expr
-        The shape of the tensor
-
-    nonzeros: int
-        The number of non-zero values
-
-    dtype: str, optional
-        The data type of the tensor
-
-    name: str, optional
-        The name hint of the tensor
-
-    stype: str, optional
-        The name storage type of the sparse tensor (e.g. csr, coo, ell)
-
-    Returns
-    -------
-    tensor: SparsePlaceholderOp
-        The created sparse tensor placeholder
-    """
-    shape = (shape,) if isinstance(shape, _expr.PrimExpr) else shape
-    nonzeros = 0 if nonzeros is None else nonzeros
-    dtype = float32 if dtype is None else dtype
-    stype = "csr" if stype is None else stype
-    ret = None
-    if stype == "csr":
-        ret = CSRPlaceholderOp(shape=shape, nonzeros=nonzeros, dtype=dtype, name=name)
-    else:
-        raise NotImplementedError(f"stype={stype} is not supported yet.")
-    return ret
diff --git a/python/tvm/contrib/tedd.py b/python/tvm/contrib/tedd.py
deleted file mode 100644
index 680297729789..000000000000
--- a/python/tvm/contrib/tedd.py
+++ /dev/null
@@ -1,798 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-outside-toplevel, nested-min-max
-"""Tensor Expression Debug Display (TEDD), visualizing Tensor Expression"""
-import html
-import json
-import warnings
-from graphviz import Digraph
-from graphviz import Source
-import tvm
-
-TVMDD_TABLE_BODY_WIDTH = 30
-# Must match enum IterVarType defined in include/tvm/expr.h
-ITERVAR_TYPE_STRING_MAP = {
-    0: ("kDataPar", "#FFFFFF"),
-    1: ("kThreadIndex", "#2980B9"),
-    2: ("kCommReduce", "#FAD7A0"),
-    3: ("kOrdered", "#D35400"),
-    4: ("kOpaque", "#ABB2B9"),
-    5: ("kUnrolled", "#D2B4DE"),
-    6: ("kVectorized", "#AED6F1"),
-    7: ("kParallelized", "#F5B7B1"),
-    8: ("kTensorized", "#A9DFBF"),
-}
-
-PALETTE = {
-    0: "#000000",
-    1: "#922B21",
-    2: "#76448A",
-    3: "#1F618D",
-    4: "#148F77",
-    5: "#B7950B",
-    6: "#AF601A",
-    7: "#F5B7B1",
-    8: "#A9DFBF",
-}
-
-PALETTE_SIZE = 9
-
-
-def dom_path_to_string(dom_path, prefix=""):
-    path_string = prefix
-    for index in dom_path:
-        path_string = path_string + "_" + str(index)
-    return path_string
-
-
-def insert_dot_id(sch):
-    """Insert unique ID for each node in the DOM tree.
-    They are used as Dot node ID.
-    """
-    for stage_idx, stage in enumerate(sch["stages"]):
-        dom_path = [stage_idx]
-        stage["id"] = dom_path_to_string(dom_path, stage["type"])
-        for itervar_idx, itervar in enumerate(stage["all_itervars"]):
-            dom_path = [stage_idx, itervar_idx]
-            itervar["id"] = dom_path_to_string(dom_path, itervar["type"])
-        for rel_idx, rel in enumerate(stage["relations"]):
-            dom_path = [stage_idx, rel_idx]
-            rel["id"] = dom_path_to_string(dom_path, rel["type"])
-        for tensor_idx, tensor in enumerate(stage["output_tensors"]):
-            dom_path = [stage_idx, tensor_idx]
-            tensor["id"] = dom_path_to_string(dom_path, tensor["type"])
-    return sch
-
-
-def itervar_equal(iv_a, iv_b):
-    """A helper method that compares the equality of two iterative variables"""
-    # Adopt the following method to assure the equality between two itervars.
-    # The plain comparison might fail (i.e. iv_a == iv_b) after the change of
-    # domain bounds from InferBound.
-    def _var_equal(v_a, v_b):
-        condtions = [
-            v_a.name == v_b.name,
-            v_a.dtype == v_b.dtype,
-            v_a.type_annotation == v_b.type_annotation,
-        ]
-        return all(c for c in condtions)
-
-    condtions = [
-        _var_equal(iv_a.var, iv_b.var),
-        iv_a.iter_type == iv_b.iter_type,
-        iv_a.thread_tag == iv_b.thread_tag,
-    ]
-    return all(c for c in condtions)
-
-
-class ObjectManager:
-    """A helper class tracking schedule objects, e.g. stage, IterVar,
-    relationship, and tensor, to their DOM path."""
-
-    def __init__(self, sch):
-        self.dict = {}
-        for stage_idx, stage in enumerate(sch.stages):
-            self.dict[stage] = [stage_idx]
-            for itervar_idx, itervar in enumerate(stage.all_iter_vars):
-                self.dict[itervar] = [stage_idx, itervar_idx]
-                # the itervars of leaf should also be mapped to the original one
-                for leaf_iv in stage.leaf_iter_vars:
-                    if itervar_equal(leaf_iv, itervar):
-                        self.dict[leaf_iv] = [stage_idx, itervar_idx]
-            for rel_idx, rel in enumerate(stage.relations):
-                self.dict[rel] = [stage_idx, rel_idx]
-            for tensor_idx in range(stage.op.num_outputs):
-                self.dict[frozenset({stage.op.name, tensor_idx})] = [stage_idx, tensor_idx]
-
-    def get_dom_path(self, obj):
-        if obj is None:
-            return None
-        assert obj in self.dict, "Node is no found."
-        return self.dict[obj]
-
-
-def get_or_create_dot_id(obj, prefix="", assert_on_missing=False):
-    """If obj's ID has been registered, return it.
-    If not, either assert or create a unique and legal ID, register and
-    return it, according to assert_on_missing.
-    ID must be a unique and legal Dotty ID.
-
-     Parameters
-     ----------
-     obj : objet
-                 Serve as the key to the ID.
-
-     prefix : string
-                 Prefix to attach to the ID.  Usually use obj's non-unique
-                 name as prefix.
-
-     assert_on_missing : bool
-                 Assert or not if object doesn't have a registered ID.
-    """
-    prefix = prefix.replace(".", "_")
-    if not hasattr(get_or_create_dot_id, "obj_id_dict"):
-        get_or_create_dot_id.obj_id_dict = {}
-    if obj not in get_or_create_dot_id.obj_id_dict:
-        if assert_on_missing:
-            assert False, "dot_id " + str(obj) + " has not been registered."
-        else:
-            get_or_create_dot_id.obj_id_dict[obj] = prefix + hex(id(obj))
-    return get_or_create_dot_id.obj_id_dict[obj]
-
-
-def get_port_id(is_input, index):
-    return "I_" + str(index) if is_input else "O_" + str(index)
-
-
-def get_itervar_type_info(iter_type):
-    assert iter_type < len(ITERVAR_TYPE_STRING_MAP), "Unknown IterVar type: " + str(iter_type)
-    return ITERVAR_TYPE_STRING_MAP[iter_type]
-
-
-def get_itervar_label_color(itervar, iv_type):
-    type_info = get_itervar_type_info(iv_type)
-    return (
-        linebrk(str(itervar["name"]) + "(" + type_info[0] + ")", TVMDD_TABLE_BODY_WIDTH),
-        type_info[1],
-    )
-
-
-def linebrk(s, n):
-    """Break input string s with <br/> for every n charactors."""
-    result = ""
-    j = 0
-    for i, c in enumerate(s):
-        if j == n and i != len(s) - 1:
-            result = result + "\n"
-            j = 0
-        j = j + 1
-        result = result + c
-    result = html.escape(str(result), quote=True)
-    result = result.replace("\n", "<br/>")
-    return result
-
-
-def create_graph(name="", rankdir="BT"):
-    graph = Digraph(name=name)
-    graph.graph_attr["rankdir"] = rankdir
-    return graph
-
-
-def itervar_label(itervar, index, index_color, label):
-    return (
-        '<TR><TD PORT="'
-        + itervar["id"]
-        + '" BGCOLOR="'
-        + index_color
-        + '">'
-        + str(index)
-        + '</TD><TD BGCOLOR="white" PORT="itervar">'
-        + label
-        + "<br/>"
-        + str(itervar["properties"]["range"])
-        + "</TD></TR>"
-    )
-
-
-def stage_label(stage):
-    return stage["name"] + "<br/>Scope: " + stage["properties"]["scope"]
-
-
-def legend_label():
-    """Generate legend labels."""
-    label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="4">'
-    for iter_type in ITERVAR_TYPE_STRING_MAP:
-        name, color = ITERVAR_TYPE_STRING_MAP[iter_type]
-        label += (
-            '<TR><TD BGCOLOR="' + color + '"></TD>' + '<TD BGCOLOR="white">' + name + "</TD></TR>"
-        )
-    label += "</TABLE>>"
-    return label
-
-
-def leaf_itervars(stage):
-    filtered = filter(lambda x: (x["index"] >= 0), stage["all_itervars"])
-    return sorted(filtered, key=lambda x: x["index"])
-
-
-def legend_dot(g):
-    with g.subgraph(name="cluster_legend") as subgraph:
-        subgraph.attr(label="Legend")
-        label = legend_label()
-        subgraph.node("legend", label, shape="none", margin="0")
-
-
-def extract_dom_for_viz(sch, need_range=True):
-    json_str = dump_json(sch, need_range)
-    s = json.loads(json_str)
-    s = insert_dot_id(s)
-    return s
-
-
-def dump_graph(dot_string, show_svg=True, dot_file_path="", output_dot_string=False):
-    """Output dot_string in various formats."""
-    if dot_file_path:
-        try:
-            dot_file = open(dot_file_path, "w+")
-            dot_file.write(dot_string)
-            dot_file.close()
-        except IOError:
-            print("Cannot open file: " + dot_file_path)
-    if show_svg:
-        from IPython.display import display
-        from IPython.display import SVG
-
-        src = Source(dot_string)
-        display(SVG(src.pipe(format="svg")))
-    if output_dot_string:
-        return dot_string
-    return None
-
-
-def dump_json(sch, need_range):
-    """Serialize data for visualization from a schedule in JSON format.
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to serialize
-
-    Returns
-    -------
-    json : string
-        Serialized JSON string
-    """
-
-    def encode_itervar(itervar, stage, index, range_map):
-        """Extract and encode IterVar visualization data to a dictionary"""
-        ivrange = range_map[itervar] if range_map is not None and itervar in range_map else None
-        bind_thread = None
-        tensor_intrin = None
-        if itervar in stage.iter_var_attrs:
-            attr = stage.iter_var_attrs[itervar]
-            iv_type = attr.iter_type
-            # binding
-            bind_thread = str(attr.bind_thread.var) if attr.bind_thread is not None else None
-            # tensorization
-            if attr.tensor_intrin is not None:
-                tensor_intrin = str(attr.tensor_intrin.body)
-                # remove the final \n
-                tensor_intrin = tensor_intrin[0:-1] if tensor_intrin[-1] == "\n" else tensor_intrin
-            else:
-                tensor_intrin = None
-        else:
-            iv_type = itervar.iter_type
-        itervar_dict = {
-            "type": "IterVar",
-            "index": index,
-            "name": str(itervar.var),
-            "itervar_type": iv_type,
-            "properties": {
-                "thread": bind_thread,
-                "intrin": tensor_intrin,
-                "range": str(ivrange) if ivrange is not None else "range(N/A)",
-            },
-        }
-        return itervar_dict
-
-    def encode_itervars(stage, range_map):
-        """Extract and encode IterVars visualization data from a stage to a dictionary"""
-
-        def get_leaf_itervar_index(itervar, leaf_iv):
-            for leaf_index, ivar in enumerate(leaf_iv):
-                if itervar_equal(ivar, itervar):
-                    return leaf_index
-            return -1
-
-        itervars = []
-        for itervar in stage.all_iter_vars:
-            leaf_index = get_leaf_itervar_index(itervar, stage.leaf_iter_vars)
-            itervars.append(encode_itervar(itervar, stage, leaf_index, range_map))
-        return itervars
-
-    def encode_itervar_relation(obj_manager, rel):
-        """Extract and encode IterVar Relationship visualization data to a dictionary"""
-        rel_type = type(rel)
-        if rel_type is tvm.te.schedule.Split:
-            node_type = "Split_Relation"
-            rel_dict = {
-                "type": node_type,
-                "parent": obj_manager.get_dom_path(rel.parent),
-                "outer": obj_manager.get_dom_path(rel.outer),
-                "inner": obj_manager.get_dom_path(rel.inner),
-            }
-        elif rel_type is tvm.te.schedule.Fuse:
-            node_type = "Fuse_Relation"
-            rel_dict = {
-                "type": node_type,
-                "fused": obj_manager.get_dom_path(rel.fused),
-                "outer": obj_manager.get_dom_path(rel.outer),
-                "inner": obj_manager.get_dom_path(rel.inner),
-            }
-        elif rel_type is tvm.te.schedule.Singleton:
-            node_type = "Singleton_Relation"
-            rel_dict = {
-                "type": node_type,
-                "iter": obj_manager.get_dom_path(rel.iter),
-            }
-        else:
-            return None
-        return rel_dict
-
-    def encode_itervar_relations(obj_manager, stage):
-        relations = []
-        for i in range(len(stage.relations)):
-            rel = encode_itervar_relation(obj_manager, stage.relations[i])
-            if rel is not None:
-                relations.append(rel)
-        return relations
-
-    def encode_tensor(obj_manager, tensor, stage):
-        """Extract and encode tensor visualization data to a dictionary"""
-        tensor_dict = {
-            "type": "Tensor",
-            "source": obj_manager.get_dom_path(stage),
-            "value_index": tensor.value_index,
-            "shape": str(tensor.op.output(tensor.value_index).shape),
-            "data_type": tensor.op.output(tensor.value_index).dtype,
-        }
-        return tensor_dict
-
-    def encode_tensors(obj_manager, stage):
-        tensors = []
-        for i in range(stage.op.num_outputs):
-            tensor = stage.op.output(i)
-            tensors.append(encode_tensor(obj_manager, tensor, stage))
-        tensors.sort(key=lambda tensor: tensor["value_index"])
-        return tensors
-
-    def encode_stage(obj_manager, stage, range_map):
-        """Extract and encode stage visualization data to a dictionary"""
-        stage_dict = {
-            "type": "Stage",
-            "name": stage.op.name,
-            "attaching_to": obj_manager.get_dom_path(stage.attach_ivar),
-            "compute": str(stage.op.body) if hasattr(stage.op, "body") else None,
-            "properties": {
-                "scope": stage.scope,
-            },
-            "all_itervars": encode_itervars(stage, range_map),
-            "relations": encode_itervar_relations(obj_manager, stage),
-            "input_tensors": [
-                obj_manager.get_dom_path(frozenset({tensor.op.name, tensor.value_index}))
-                for tensor in stage.op.input_tensors
-            ],
-            "output_tensors": encode_tensors(obj_manager, stage),
-        }
-        return stage_dict
-
-    def encode_schedule(sch, need_range):
-        """Extract and encode data from a schedule for visualization to a nested dictionary.
-        It is useful for JSON to serialize schedule.
-
-            Parameters
-            ----------
-            sch : schedule
-                        The schedule object to extract
-
-            Returns
-            -------
-            dict : dictionary
-                A nested dictionary
-        """
-        assert isinstance(
-            sch, tvm.te.schedule.Schedule
-        ), "Input is not a tvm.te.schedule.Schedule object."
-        range_map = None
-        if need_range:
-            try:
-                range_map = tvm.te.schedule.InferBound(sch)
-            except tvm._ffi.base.TVMError as expt:
-                warnings.warn(
-                    "Ranges are not available, because InferBound fails with the following error:\n"
-                    + str(expt)
-                )
-
-        obj_manager = ObjectManager(sch)
-        stages = []
-        for stage in sch.stages:
-            stages.append(encode_stage(obj_manager, stage, range_map))
-        return {
-            "type": "Schedule",
-            "stages": stages,
-        }
-
-    return json.dumps(sch, default=lambda s: encode_schedule(s, need_range))
-
-
-def viz_schedule_tree(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render schedule tree
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Returns
-    -------
-    dot_string : string
-        Dot file content or an empty string according to output_dot_string
-
-    Examples
-    --------
-    The following code writes a schedule tree to a dot file.
-
-    .. code-block:: python
-        tedd.viz_schedule_tree(s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_schedule_tree(s, show_svg = True)
-    """
-
-    def create_schedule_tree_graph(name=""):
-        return create_graph(name=name, rankdir="BT")
-
-    def root_dot(g):
-        g.node("ROOT", "ROOT", shape="oval", margin="0")
-
-    def stage_node_dot(g, stage):
-        node_label = stage_node_label(stage)
-        g.node(stage["id"], node_label, shape="none", margin="0")
-
-    def stage_node_label(stage):
-        """Return a html format label for the given stage."""
-        label = (
-            '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" '
-            'CELLPADDING="4"> <TR><TD BGCOLOR="lightgrey" '
-            'COLSPAN="2" PORT="stage">' + stage_label(stage) + "</TD></TR>"
-        )
-
-        for leafiv in leaf_itervars(stage):
-            iv_type = leafiv["itervar_type"]
-            var_attr_label = ""
-            if "thread" in leafiv["properties"] and leafiv["properties"]["thread"] is not None:
-                var_attr_label = (
-                    var_attr_label
-                    + '<br/><font color="#2980B9">('
-                    + str(leafiv["properties"]["thread"])
-                    + ")</font>"
-                )
-            if "intrin" in leafiv["properties"] and leafiv["properties"]["intrin"] is not None:
-                var_attr_label = (
-                    var_attr_label
-                    + "<br/>"
-                    + linebrk(
-                        "(tensor_intrin:" + str(leafiv["properties"]["intrin"]) + ")",
-                        TVMDD_TABLE_BODY_WIDTH,
-                    )
-                )
-            var_label, color = get_itervar_label_color(leafiv, iv_type)
-            label += itervar_label(leafiv, leafiv["index"], color, var_label + var_attr_label)
-        if stage["compute"] is not None:
-            label += (
-                '<TR><TD COLSPAN="2">'
-                + linebrk(str(stage["compute"]), TVMDD_TABLE_BODY_WIDTH)
-                + "</TD></TR>"
-            )
-        label += "</TABLE>>"
-        return label
-
-    def compute_at_dot(g, stage):
-        """If the given stage attaches to another stage, create an edge from it
-        stage to its attach point; otherwise, create an edge to the ROOT.
-        """
-        src = stage["id"]
-        dst = (
-            dom_path_to_string([stage["attaching_to"][0]], "Stage")
-            + ":"
-            + dom_path_to_string(stage["attaching_to"], "IterVar")
-            if stage["attaching_to"] is not None
-            else "ROOT"
-        )
-        color = (
-            PALETTE[stage["attaching_to"][1] + 1]
-            if stage["attaching_to"] is not None and stage["attaching_to"][1] < PALETTE_SIZE - 1
-            else PALETTE[0]
-        )
-        g.edge(src, dst, color=color)
-
-    graph = create_schedule_tree_graph("Schedule Tree")
-    s = extract_dom_for_viz(sch)
-    legend_dot(graph)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-    for stage in s["stages"]:
-        compute_at_dot(graph, stage)
-    root_dot(graph)
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
-
-
-def viz_itervar_relationship_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render IterVar relationship graph
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Examples
-    --------
-    The following code writes Ian tervar relationship graph to a dot file.
-
-    .. code-block:: python
-        tedd.viz_def viz_itervar_relationship_graph(sch,
-            (s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_def viz_itervar_relationship_graph(sch,
-            (s, show_svg = True)
-    """
-
-    def create_itervar_relation_graph(name=""):
-        return create_graph(name=name, rankdir="TB")
-
-    def itervar_node_dot(g, itervar, iv_type, index):
-        label = itervar_node_label(itervar, iv_type, index)
-        g.node(itervar["id"], label, shape="none", margin="0")
-
-    def itervar_node_label(itervar, iv_type, index):
-        label = (
-            '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" '
-            'CELLPADDING="4">'
-            + itervar_label(
-                itervar,
-                index,
-                get_itervar_label_color(itervar, iv_type)[1],
-                get_itervar_label_color(itervar, iv_type)[0],
-            )
-            + "</TABLE>>"
-        )
-        return label
-
-    def itervar_relation_node_dot(g, node_id, node_label, input_ports, output_ports):
-        label = itervar_relation_node_label(node_label, input_ports, output_ports)
-        g.node(node_id, label, shape="none", margin="0")
-
-    def itervar_relation_node_label(node_label, input_ports, output_ports):
-        """Return a html format label for an itervar relationship node
-        including node_label and input/output ports.
-        """
-        label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" ' 'CELLPADDING="4">' + "<TR>"
-        max_port_num = max(len(input_ports), len(output_ports))
-        for i in range(max_port_num):
-            if i < len(input_ports):
-                input_port = input_ports[i]
-                label += '<TD BGCOLOR="lightgrey" PORT="' + input_port + '">' + input_port + "</TD>"
-            else:
-                label += '<TD BGCOLOR="white"></TD>'
-        label += "</TR>"
-        label += (
-            '<TR><TD BGCOLOR="white" COLSPAN="'
-            + str(max_port_num)
-            + '" PORT="relation">'
-            + node_label
-            + "</TD></TR>"
-        )
-        label += "<TR>"
-        for i in range(max_port_num):
-            if i < len(output_ports):
-                output_port = output_ports[i]
-                label += (
-                    '<TD BGCOLOR="lightgrey" PORT="' + output_port + '">' + output_port + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white"></TD>'
-        label += "</TR>"
-        label += "</TABLE>>"
-        return label
-
-    def itervar_relation_dot(g, node, node_id):
-        """Create an itervar relationship node."""
-        node_type = node["type"]
-        if node_type == "Split_Relation":
-            node_type = "Split"
-            itervar_relation_node_dot(g, node_id, node_type, ["Input"], ["Outer", "Inner"])
-            parent = dom_path_to_string(node["parent"], "IterVar")
-            outer = dom_path_to_string(node["outer"], "IterVar")
-            inner = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(parent + ":itervar", node_id + ":Input")
-            g.edge(node_id + ":Outer", outer + ":itervar")
-            g.edge(node_id + ":Inner", inner + ":itervar")
-        elif node_type == "Fuse_Relation":
-            node_type = "Fuse"
-            itervar_relation_node_dot(g, node_id, node_type, ["Outer", "Inner"], ["Fused"])
-            fused = dom_path_to_string(node["fused"], "IterVar")
-            outer = dom_path_to_string(node["outer"], "IterVar")
-            inner = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(outer + ":itervar", node_id + ":Outer")
-            g.edge(inner + ":itervar", node_id + ":Inner")
-            g.edge(node_id + ":Fused", fused + ":itervar")
-        elif node_type == "Singleton_Relation":
-            node_type = "Singleton"
-            itervar_relation_node_dot(g, node_id, node_type, [], ["Iter"])
-            itervar = dom_path_to_string(node["inner"], "IterVar")
-            g.edge(node_id + ":Iter", itervar + ":itervar")
-        else:
-            assert False, "Unknown IterVarRelationNode: " + node_type
-
-    def stage_node_dot(g, stage):
-        """Create a stage node."""
-        with g.subgraph(name="cluster_" + stage["id"]) as subgraph:
-            subgraph.attr(label=stage["name"])
-            if stage["all_itervars"]:
-                for itervar in stage["all_itervars"]:
-                    iv_type = itervar["itervar_type"]
-                    itervar_node_dot(subgraph, itervar, iv_type, itervar["index"])
-                for rel in stage["relations"]:
-                    node_id = rel["id"]
-                    itervar_relation_dot(subgraph, rel, node_id)
-            else:
-                subgraph.node(stage["name"] + "_placeholder", style="invis")
-
-    graph = create_itervar_relation_graph("IterVar Relationship Graph")
-    s = extract_dom_for_viz(sch)
-    legend_dot(graph)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
-
-
-def viz_dataflow_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False):
-    """Top level API to render dataflow graph
-
-    Parameters
-    ----------
-    sch : schedule
-                The schedule object to visualize
-
-    show_svg : bool
-                Display graph as SVG, useful for Jupyter notebooks.
-
-    dot_file_path : string
-                Dot file to save the graph.
-
-    output_dot_string : bool
-                Return dot file content or an empty string.
-
-    Examples
-    --------
-    The following code writes a dataflow graph to a dot file.
-
-    .. code-block:: python
-        tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/example.dot')
-
-    Use the following code to render a SVG graph in a Jupyter notebook.
-
-    .. code-block:: python
-        tedd.viz_dataflow_graph(s, show_svg = True)"""
-
-    def create_dataflow_graph(name=""):
-        return create_graph(name=name, rankdir="LR")
-
-    def tensor_node_dot(g, tensor):
-        """Create a tensor node."""
-        label = tensor_node_label(tensor)
-        g.node(tensor["id"], label, shape="oval", margin="0")
-
-    def tensor_node_label(tensor):
-        """Return a html format label for the given tensor."""
-        label = str(tensor["shape"]) + "\n" + str(tensor["data_type"])
-        return label
-
-    def stage_node_dot(g, stage):
-        """Create a stage node."""
-        label = stage_node_label(stage)
-        g.node(stage["id"], label, shape="none", margin="0")
-
-    def stage_node_label(stage):
-        """Return a html format label for the given stage."""
-        rows = max(1, max(len(stage["output_tensors"]), len(stage["input_tensors"])))
-        label = '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" ' 'CELLPADDING="4">'
-        for i in range(rows):
-            label += "<TR>"
-            if i < len(stage["input_tensors"]):
-                port_id = get_port_id(True, i)
-                label += (
-                    '<TD BGCOLOR="lightgrey" COLSPAN="2" PORT="' + port_id + '">' + str(i) + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white" COLSPAN="2"></TD>'
-            if i == 0:
-                label += (
-                    '<TD BGCOLOR="white" COLSPAN="2" ROWSPAN="'
-                    + str(rows)
-                    + '">'
-                    + stage_label(stage)
-                    + "</TD>"
-                )
-            if i < len(stage["output_tensors"]):
-                port_id = get_port_id(False, i)
-                label += (
-                    '<TD BGCOLOR="lightgrey" COLSPAN="2" PORT="' + port_id + '">' + str(i) + "</TD>"
-                )
-            else:
-                label += '<TD BGCOLOR="white" COLSPAN="2"></TD>'
-            label += "</TR>"
-        label += "</TABLE>>"
-        return label
-
-    def dfg_dot(g, sch):
-        """Create edges among stages."""
-        stages = sch["stages"]
-        for stage in stages:
-            for i in range(len(stage["input_tensors"])):
-                src = dom_path_to_string(stage["input_tensors"][i], "Tensor")
-                dst = stage["id"] + ":" + get_port_id(True, i)
-                g.edge(src, dst)
-            for i in range(len(stage["output_tensors"])):
-                src = stage["id"] + ":" + get_port_id(False, i)
-                dst = stage["output_tensors"][i]["id"]
-                g.edge(src, dst)
-
-    graph = create_dataflow_graph("Dataflow Graph")
-    s = extract_dom_for_viz(sch, need_range=False)
-    for stage in s["stages"]:
-        stage_node_dot(graph, stage)
-        for tensor in stage["output_tensors"]:
-            tensor_node_dot(graph, tensor)
-
-    dfg_dot(graph, s)
-
-    return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string)
diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index fb325de1d3ab..48909e9832e1 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -17,106 +17,37 @@
 
 # pylint: disable=invalid-name
 """The build utils in python."""
-from typing import Union, Optional, List, Mapping
+from typing import Union, Optional
 
-import warnings
 
 import tvm.tir
 
-from tvm import te
 
-from tvm.runtime import Module
 from tvm.runtime import ndarray
 from tvm.ir import container
 from tvm.tir import PrimFunc
 from tvm.ir.module import IRModule
-from tvm.te import tensor
 from tvm.target import Target
-from tvm.tir.buffer import Buffer
-from tvm.tir.expr import Var
 from tvm.driver import _ffi_api as _driver_ffi
 
 from . import _ffi_api as ffi
 
 
-def get_binds(args, compact=False, binds=None):
-    """Internal function to get binds and arg_list given arguments.
-    Parameters
-    ----------
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-    compact : bool
-        If the statement has already bound to a compact buffer.
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-    Returns
-    -------
-    binds: dict
-        The bind specification
-    arg_list: list
-        The list of symbolic buffers of arguments.
-    """
-    binds, arg_list = ffi.get_binds(args, compact, binds)
-    return binds, arg_list
-
-
-def schedule_to_module(
-    sch: te.Schedule,
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
-    name: str = "main",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
-) -> IRModule:
-    """According to the given schedule, form a function.
-
-    This is a low-level function intended for testing purposes, and
-    does not apply any optimization passes.  In general, `tvm.lower`
-    and `tvm.build` should be used instead.
-
-    Parameters
-    ----------
-    sch : tvm.te.schedule.Schedule
-        The given scheduler to form the raw body
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-    name : str
-        The name of result function, default name is "main"
-    binds : dict of :any:`Tensor` to :any:`Buffer`, optional
-        The binds information
-    Returns
-    -------
-    The body formed according to the given schedule
-    """
-    return ffi.schedule_to_module(sch, args, name, binds)
-
-
 def lower(
-    inp: Union[te.Schedule, PrimFunc, IRModule],
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
+    inp: Union[PrimFunc, IRModule],
     name: str = "main",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
     simple_mode: bool = False,
 ) -> IRModule:
     """Lowering step before build into target.
 
     Parameters
     ----------
-    inp : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule]
+    inp : Union[tvm.tir.PrimFunc, IRModule]
-        The TE schedule or TensorIR PrimFunc/IRModule to be built
+        The TensorIR PrimFunc/IRModule to be built
 
-    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]]
-        The argument lists to the function for TE schedule.
-
-        It should be None if we want to lower TensorIR.
     name : str
         The name of the result function.
 
-    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
     simple_mode : bool
         Whether only output simple and compact statement, this will skip
         LoopPartition, api wrapper generation and Unrolling.
@@ -130,139 +61,65 @@ def lower(
         return ffi.lower_module(inp, simple_mode)
     if isinstance(inp, PrimFunc):
         return ffi.lower_primfunc(inp, name, simple_mode)
-    if isinstance(inp, te.Schedule):
-        return ffi.lower_schedule(inp, args, name, binds, simple_mode)
     raise ValueError(
         f"Expected input to be an IRModule, PrimFunc or te.Schedule, but got {type(inp)}"
     )
 
 
 def build(
-    inputs: Union[te.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]],
-    args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None,
+    inputs: Union[PrimFunc, IRModule],
     target: Optional[Union[str, Target]] = None,
-    target_host: Optional[Union[str, Target]] = None,
-    runtime: Optional[
-        "tvm.relay.backend.Runtime"
-    ] = None,  # Type is annotated this way to avoid cyclic dependency
-    name: Optional[str] = "default_function",
-    binds: Optional[Mapping[tensor.Tensor, Buffer]] = None,
+    name: str = "main",
 ):
     """Build a function with arguments as signature. Code will be generated
     for devices coupled with target information.
 
     Parameters
     ----------
-    inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]]
+    inputs : Union[tvm.tir.PrimFunc, IRModule]
         The input to be built
 
-    args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]]
-        The argument lists to the function.
-
     target : Optional[Union[str, Target]]
         The target and option of the compilation.
 
-    target_host : Optional[Union[str, Target]]
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm interpreter is used.
-
-    runtime : Optional[Runtime]
-        Runtime to generate artifacts for
-
-    name : Optional[str]
+    name : str
         The name of result function.
 
-    binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]]
-        Dictionary that maps the binding of symbolic buffer to Tensor.
-        By default, a new buffer is created for each tensor in the argument.
-
     Returns
     -------
     ret : tvm.module
         A module that combines both host and device code.
 
-    Examples
-    ________
-    There are two typical example uses of this function depending on the type
-    of the argument `inputs`:
-    1. it is an IRModule.
-
-    .. code-block:: python
-
-        n = 2
-        A = te.placeholder((n,), name='A')
-        B = te.placeholder((n,), name='B')
-        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
-        s = tvm.te.create_schedule(C.op)
-        m = tvm.lower(s, [A, B, C], name="test_add")
-        rt_mod = tvm.build(m, target="llvm")
-
-    2. it is a dict of compilation target to IRModule.
-
-    .. code-block:: python
-
-        n = 2
-        A = te.placeholder((n,), name='A')
-        B = te.placeholder((n,), name='B')
-        C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
-        s1 = tvm.te.create_schedule(C.op)
-        with tvm.target.cuda() as cuda_tgt:
-          s2 = topi.cuda.schedule_injective(cuda_tgt, [C])
-          m1 = tvm.lower(s1, [A, B, C], name="test_add1")
-          m2 = tvm.lower(s2, [A, B, C], name="test_add2")
-          rt_mod = tvm.build({"llvm": m1, "cuda": m2})
-
     Note
     ----
     See the note on :any:`tvm.target` on target string format.
     """
-    if isinstance(inputs, te.Schedule):
-        if args is None:
-            raise ValueError("args must be given for build from schedule")
-        input_mod = lower(inputs, args, name=name, binds=binds)
-    elif isinstance(inputs, (list, tuple, container.Array)):
-        merged_mod = tvm.IRModule({})
-        for x in inputs:
-            merged_mod.update(lower(x))
-        input_mod = merged_mod
-    elif isinstance(inputs, PrimFunc):
+    if isinstance(inputs, PrimFunc):
         input_mod = lower(inputs, name=name)
     elif isinstance(inputs, tvm.IRModule):
         assert (
             len(inputs.get_global_vars()) > 0
         ), "Expected a non-empty IRModule, but the IRModule contained no functions."
         input_mod = lower(inputs)
-    elif not isinstance(inputs, (dict, container.Map)):
-        raise ValueError(
-            f"Inputs must be te.Schedule, IRModule, PrimFunc, "
-            f"or dict of target to IRModule, "
-            f"but got {type(inputs)}."
-        )
-
-    if not isinstance(inputs, (dict, container.Map)):
-        target = Target.current() if target is None else target
-        if target is None and isinstance(input_mod, tvm.IRModule):
-            target_mod = {}
-            for gvar, func in input_mod.functions.items():
-                tgt = func.attrs["target"] if "target" in func.attrs else "llvm"
-                if tgt not in target_mod:
-                    target_mod[tgt] = {}
-                target_mod[tgt][gvar] = func
-
-            target_input_mod = {}
-            for tgt in target_mod.keys():
-                tir_mod = tvm.IRModule(target_mod[tgt])
-                tir_mod = tir_mod.with_attrs(input_mod.attrs)
-                target_input_mod[tgt] = tir_mod
-        else:
-            target_input_mod = {target: input_mod}
     else:
-        target_input_mod = {tgt: lower(mod) for tgt, mod in inputs.items()}
+        raise ValueError("Inputs must be IRModule or PrimFunc")
+
+    target = Target.current() if target is None else target
+    if target is None and isinstance(input_mod, tvm.IRModule):
+        target_mod = {}
+        for gvar, func in input_mod.functions.items():
+            tgt = func.attrs["target"] if "target" in func.attrs else "llvm"
+            if tgt not in target_mod:
+                target_mod[tgt] = {}
+            target_mod[tgt][gvar] = func
+
+        target_input_mod = {}
+        for tgt in target_mod.keys():
+            tir_mod = tvm.IRModule(target_mod[tgt])
+            tir_mod = tir_mod.with_attrs(input_mod.attrs)
+            target_input_mod[tgt] = tir_mod
+    else:
+        target_input_mod = {target: input_mod}
 
     # Because modules can be created from a variety of sources, we annotate them
     # with the relevant attributes here to ensure they propagate
@@ -271,18 +128,10 @@ def build(
         if not isinstance(tgt, (str, Target)):
             raise ValueError("The key of inputs must be str or " "Target when inputs is dict.")
         if not isinstance(mod, tvm.IRModule):
-            raise ValueError("inputs must be Schedule, IRModule, " "or dict of str to IRModule.")
-        annotated_mods[tgt] = mod.with_attr("runtime", runtime)
+            raise ValueError("inputs must be IRModule, " "or dict of str to IRModule.")
+        annotated_mods[tgt] = mod
 
-    # TODO(mbs): Both CompilationConfig and TIRToRuntime implement the same host target
-    #  defaulting logic, but there's currently no way to get back the decided host.
-    if target_host is not None:
-        warnings.warn(
-            "target_host parameter is going to be deprecated. "
-            "Please pass in tvm.target.Target(target, host=target_host) instead."
-        )
-
-    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
+    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods)
     if not target_host:
         for tar, mod in annotated_mods.items():
             device_type = ndarray.device(tar.kind.name, 0).device_type
@@ -296,41 +145,4 @@ def build(
 
     rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
 
-    annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host)
-
-    if not isinstance(target_host, Target):
-        target_host = Target(target_host)
-
-    if str(runtime) == "crt" and runtime["system-lib"]:
-        if target_host.kind.name == "c":
-            create_csource_crt_metadata_module = tvm._ffi.get_global_func(
-                "runtime.CreateCSourceCrtMetadataModule"
-            )
-            to_return = create_csource_crt_metadata_module([rt_mod_host], target_host, runtime)
-        elif target_host.kind.name == "llvm":
-            create_llvm_crt_metadata_module = tvm._ffi.get_global_func(
-                "runtime.CreateLLVMCrtMetadataModule"
-            )
-            to_return = create_llvm_crt_metadata_module([rt_mod_host], target_host, runtime)
-    else:
-        to_return = rt_mod_host
-
-    return OperatorModule.from_module(to_return, ir_module_by_target=annotated_mods, name=name)
-
-
-class OperatorModule(Module):
-    """Wraps the Module returned by tvm.build() and captures additional outputs of that function."""
-
-    @classmethod
-    def from_module(cls, mod, **kwargs):
-        # NOTE(areusch): It is generally unsafe to continue using `mod` from this point forward.
-        # If an exception occurs in cls.__init__, handle will be deleted. For this reason,
-        # set mod.handle to None.
-        handle = mod.handle
-        mod.handle = None
-        return cls(handle, **kwargs)
-
-    def __init__(self, handle, ir_module_by_target=None, name=None):
-        super(OperatorModule, self).__init__(handle)
-        self.ir_module_by_target = ir_module_by_target
-        self.name = name
+    return rt_mod_host
diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py
deleted file mode 100644
index 178e60089245..000000000000
--- a/python/tvm/exec/measure_peak.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""measure bandwidth and compute peak
-
-e.g.
-python3 -m tvm.exec.measure_peak --target cuda --rpc-host 127.0.0.1 --rpc-port 9090
-python3 -m tvm.exec.measure_peak --target opencl --target-host "llvm -mtriple=aarch64-linux-gnu" \
-        --rpc-host $TVM_OPENCL_DEVICE_HOST --rpc-port 9090
-"""
-
-import argparse
-import logging
-
-from tvm.target import Target
-from ..contrib.peak import measure_peak_all
-
-
-def main():
-    """Main function"""
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--target", type=str, default="llvm", help="The build target")
-    parser.add_argument(
-        "--target-host", type=str, default=None, help="The host code compilation target"
-    )
-    parser.add_argument(
-        "--rpc-host", type=str, default="127.0.0.1", help="the hostname of the server"
-    )
-    parser.add_argument("--rpc-port", type=int, default=9090, help="The port of the RPC")
-
-    args = parser.parse_args()
-    logging.basicConfig(level=logging.INFO)
-
-    args.target, args.target_host = Target.canon_target_and_host(args.target, args.target_host)
-    measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py
index ce1f284be6bc..d57d24bf2f77 100644
--- a/python/tvm/relax/frontend/torch/fx_translator.py
+++ b/python/tvm/relax/frontend/torch/fx_translator.py
@@ -99,7 +99,6 @@ def convert(node: fx.Node) -> relax.Var:
     ########## Neural Network ##########
 
     def _adaptive_avg_pool2d_module(self, node: fx.Node) -> relax.Var:
-
         module = self.named_modules[node.target]
         x = self.env[node.args[0]]
         output_size = module.output_size
diff --git a/python/tvm/relax/vm_build.py b/python/tvm/relax/vm_build.py
index cfa4143b66c3..ac4d9698a072 100644
--- a/python/tvm/relax/vm_build.py
+++ b/python/tvm/relax/vm_build.py
@@ -179,10 +179,12 @@ def _vmcodegen(
     raise ValueError(f"Unknown exec_mode {exec_mode}")
 
 
-def _autodetect_system_lib_req(
-    target: Optional[tvm.target.Target] = None, system_lib: Optional[bool] = None
+def _auto_attach_system_lib_prefix(
+    tir_mod: tvm.IRModule,
+    target: Optional[tvm.target.Target] = None,
+    system_lib: Optional[bool] = None,
 ):
-    """Automatically detect system lib requirement"""
+    """Automatically detect system lib req and attach prefix attr"""
     if target is not None:
         host = target if target.host is None else target.host
         if system_lib is None:
@@ -191,9 +193,9 @@ def _autodetect_system_lib_req(
                 system_lib = True
 
     if system_lib:
-        # use packed-func to avoid relay dep.
-        return tvm.get_global_func("relay.backend.CreateRuntime")("cpp", {"system-lib": system_lib})
-    return None
+        if tir_mod.get_attr("system_lib_prefix") is None:
+            return tir_mod.with_attr("system_lib_prefix", "")
+    return tir_mod
 
 
 def _vmlink(
@@ -246,11 +248,8 @@ def _vmlink(
     relax_ext_libs = []
     tir_ext_libs = []
     if tir_mod is not None and len(tir_mod.get_global_vars()) > 0:
-        lib = tvm.build(
-            tir_mod,
-            target=target,
-            runtime=_autodetect_system_lib_req(target, system_lib),
-        )
+        tir_mod = _auto_attach_system_lib_prefix(tir_mod, target, system_lib)
+        lib = tvm.build(tir_mod, target=target)
     for ext_mod in ext_libs:
         if ext_mod.is_device_module:
             tir_ext_libs.append(ext_mod)
diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py
index 0907ea2ebf85..e7b394ebf76c 100644
--- a/python/tvm/te/__init__.py
+++ b/python/tvm/te/__init__.py
@@ -28,21 +28,11 @@
 from tvm.tir import comm_reducer, min, max, sum
 from tvm.tir import add, subtract, multiply
 
-from .schedule import (
-    Schedule,
-    Stage,
-    create_schedule,
-    SpecializedCondition,
-    AXIS_SEPARATOR,
-)
 from .tensor import TensorSlice, Tensor
-from .tensor_intrin import decl_tensor_intrin
 from .tag import tag_scope
 from .operation import placeholder, compute, scan, extern, var, size_var, const
-from .operation import thread_axis, reduce_axis
+from .operation import thread_axis, reduce_axis, AXIS_SEPARATOR
 from .operation import create_prim_func
 from .operation import extern_primfunc
 
-from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp, HybridOp
-from .autodiff import gradient
-from . import hybrid
+from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp
diff --git a/python/tvm/te/autodiff.py b/python/tvm/te/autodiff.py
deleted file mode 100644
index f8650839948d..000000000000
--- a/python/tvm/te/autodiff.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Automatic differentiation of tensor expressions."""
-from . import _ffi_api
-
-
-def gradient(output, inputs, head=None):
-    """Perform reverse-mode automatic differentiation.
-
-    Parameters
-    ----------
-    output : Tensor
-        The tensor to differentiate.
-
-    inputs : List[Tensor]
-        The list of input tensors to be differentiated wrt.
-
-    head : Tensor
-        The adjoint of the output, in other words, some tensor, by which the Jacobians
-        will be multiplied. Its shape must be of the form `prefix + output.shape`.
-        If `None` is passed, the identity tensor of shape `output.shape + output.shape`
-        will be used.
-
-    Returns
-    -------
-    tensors: List[Tensor]
-        The result gradient, in the same order as the inputs
-
-    Example
-    -------
-    .. code-block:: python
-
-        x = tvm.placeholder((32, 3, 28, 28), name='x')
-        w1 = tvm.placeholder((10, 3, 3, 3), name='w1')
-        w2 = tvm.placeholder((10, 10, 3, 3), name='w2')
-        z1 = topi.nn.conv2d(x, w1, 1, 1, 1)
-        z2 = topi.nn.conv2d(z1, w2, 1, 1, 1)
-        y = topi.sum(z2)
-
-        # produce gradients
-        [dw1, dw2] = tvm.gradient(y, [w1, w2])
-
-        # produce Jacobians
-        [jw1, jw2] = tvm.gradient(z2, [w1, w2])
-
-        # produce gradients, the head adjoint for z2 is provided manually
-        [dw1, dw2] = tvm.gradient(z2, [w1, w2], topi.full_like(z2, 1.0))
-
-    """
-    if not isinstance(inputs, list):
-        inputs = [inputs]
-    return _ffi_api.Gradient(output, inputs, head)
diff --git a/python/tvm/te/hybrid/__init__.py b/python/tvm/te/hybrid/__init__.py
deleted file mode 100644
index cd320c6b209c..000000000000
--- a/python/tvm/te/hybrid/__init__.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hybrid Programming APIs of TVM Python Package.
-
-This package maps a subset of python to HalideIR so that:
-1. Users can write some preliminary versions of the computation patterns
-have not been supported yet and verify it across the real execution and
-python semantic emulation.
-2. So far, it is a text format dedicated to HalideIR Phase 0. Refer tvm.lower
-for more details. A larger ambition of this module is to support all levels of
-HalideIR.
-"""
-
-# TODO(@were): Make this module more complete.
-# 1. Support HalideIR dumping to Hybrid Script
-# 2. Support multi-level HalideIR
-import inspect
-import tvm._ffi
-import tvm.te.schedule
-from tvm._ffi.base import decorate
-
-from .module import HybridModule
-from .parser import source_to_op
-from .utils import _pruned_source
-
-
-def script(pyfunc):
-    """Decorate a python function as hybrid script.
-
-    The hybrid function support emulation mode and parsing to
-    the internal language IR.
-
-    Returns
-    -------
-    hybrid_func : function
-        A decorated hybrid script function.
-    """
-    # pylint: disable=import-outside-toplevel, missing-docstring
-    def wrapped_func(func, *args, **kwargs):
-        from .utils import _is_tvm_arg_types
-
-        if _is_tvm_arg_types(args):
-            src = _pruned_source(func)
-            closure_vars = inspect.getclosurevars(func).nonlocals
-            closure_vars.update(inspect.getclosurevars(func).globals)
-            return source_to_op(src, args, func.__globals__, closure_vars)
-
-        from .runtime import _enter_hybrid_runtime, _restore_runtime
-
-        intersect = _enter_hybrid_runtime(func)
-        value = func(*args, **kwargs)
-        _restore_runtime(func, intersect)
-        return value
-
-    return decorate(pyfunc, wrapped_func)
-
-
-def build(sch, inputs, outputs, name="hybrid_func"):
-    """Dump the current schedule to hybrid module
-
-    Parameters
-    ----------
-    sch: tvm.te.Schedule
-        The schedule to be dumped
-
-    inputs: An array of Tensors or Vars
-        The inputs of the function body
-
-    outputs: An array of Tensors
-        The outputs of the function body
-
-    Returns
-    -------
-    module: HybridModule
-        The built results is wrapped in a HybridModule.
-        The usage of HybridModule is roughly the same as normal TVM-built modules.
-    """
-    sch = sch.normalize()
-    bounds = tvm.te.schedule.InferBound(sch)
-    stmt = tvm.te.schedule.ScheduleOps(sch, bounds)
-
-    src = _Dump(stmt, inputs, outputs, name)
-
-    return HybridModule(src, name)
-
-
-tvm._ffi._init_api("tvm.hybrid", __name__)
diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py
deleted file mode 100644
index 948a0d7665ff..000000000000
--- a/python/tvm/te/hybrid/calls.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Intrinsics of TVM-Python Hybrid Script for Python compilation time
-semantic support."""
-
-from tvm.runtime import const, convert
-import tvm.te
-from tvm.ir.container import Array
-from tvm.target import Target
-from tvm.tir import expr as _expr
-from tvm.tir import call_intrin
-from tvm.tir.stmt import ForKind
-
-from .utils import _internal_assert
-
-# pylint: disable=redefined-builtin,invalid-name
-
-LOOP_INTRIN = {
-    "range": ForKind.SERIAL,
-    "unroll": ForKind.UNROLLED,
-    "parallel": ForKind.PARALLEL,
-    "vectorize": ForKind.VECTORIZED,
-    "const_range": (ForKind.UNROLLED,),
-}
-
-
-def _range(annotation, args):
-    """Handling TVM loop types"""
-    n = args.__len__()
-    if n == 1:
-        low, ext = const(0, dtype="int32"), args[0]
-    else:
-        _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
-        low, ext = args[0], args[1]
-    if not tvm.tir.analysis.expr_deep_equal(low, const(0, dtype="int32")):
-        ext = ext - low
-    kind = LOOP_INTRIN[annotation]
-    iter_var = None
-    return iter_var, low, ext, kind
-
-
-range = unroll = vectorize = parallel = const_range = _range  # pylint: disable=invalid-name
-
-
-def bind(func_id, args):
-    """Handling TVM thread binding"""
-    _internal_assert(func_id == "bind", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!")
-    _internal_assert(isinstance(args[0], str), "A loop bind's first argument should be a string!")
-    low, ext = const(0, "int32"), args[1]
-    iter_var = tvm.te.thread_axis((low, ext), args[0])
-    kind = None
-    return iter_var, low, ext, kind
-
-
-def _math_intrin(func_id, args):
-    # pylint: disable=import-outside-toplevel
-    from tvm.tir import op
-
-    return getattr(op, func_id)(*args)
-
-
-sqrt = (
-    log
-) = exp = tanh = sigmoid = power = popcount = round = _math_intrin  # pylint: disable=invalid-name
-
-
-def _min_max(func_id, args):
-    _internal_assert(args.__len__() == 2, "Max/Min function should have 2 elements")
-    return getattr(_expr, func_id.title())(args[0], args[1])
-
-
-min = max = _min_max  # pylint: disable=invalid-name
-
-
-def _allocate_tensor(func_id, args):
-    """Handling TVM tensor allocation.
-    You may refer hybrid.intrin.allocate for more details."""
-    n = args.__len__()
-    _internal_assert(
-        isinstance(convert(args[0]), Array), "allocate's first argument should be a tuple of shape!"
-    )
-    shape = args[0]
-    for i in shape:
-        _internal_assert(isinstance(i, (_expr.PrimExpr, int)), "The shape should be an expression")
-    if n > 1:
-        _internal_assert(isinstance(args[1], str), "The data type should be an str")
-        _internal_assert(
-            args[1].startswith("int") or args[1].startswith("float"),
-            "The data type should be either int or float!",
-        )
-        dtype = args[1]
-    else:
-        dtype = "float32"
-    if n > 2:
-        _internal_assert(isinstance(args[2], str), "The data scope should be an string")
-        _internal_assert(func_id != "output_tensor", "Output tensor cannot specify scope")
-        scope = args[2]
-    else:
-        scope = "global" if func_id != "output_tensor" else "output"
-    return (shape, dtype, scope)
-
-
-output_tensor = allocate = _allocate_tensor  # pylint: disable=invalid-name
-
-
-def len(func_id, args):
-    """Iterpret the len function"""
-    _internal_assert(args.__len__() == 1, "Only 1 argument is expected!")
-    _internal_assert(func_id == "len", "This function cannot be directly invoked!")
-    try:
-        return convert(args[0].__len__())
-    except:  # pylint: disable=bare-except
-        _internal_assert(args[0].shape.__len__() == 1, "Only one-dimension array can get len")
-        return convert(args[0].shape[0])
-
-
-def _cast(func_id, args):
-    _internal_assert(
-        args.__len__() == 1,
-        f"Casting to {func_id} only supports a single argument",
-    )
-    # The FFI can handle any conversion of `args[0]` into PrimExpr, if
-    # required.
-    return _expr.Cast(func_id, args[0])
-
-
-float16 = float32 = float64 = _cast  # pylint: disable=invalid-name
-int8 = int16 = int32 = int64 = _cast  # pylint: disable=invalid-name
-uint8 = uint16 = uint32 = uint64 = _cast  # pylint: disable=invalid-name
-
-
-def ceil_div(func_id, args):
-    _internal_assert(func_id == "ceil_div", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 2, "2 arguments expected for division!")
-    a, b = args
-    return (a + b - 1) // b
-
-
-def likely(func_id, args):
-    _internal_assert(args.__len__() == 1, "Only one expression can be likely")
-    _internal_assert(func_id == "likely", "This function cannot be directly invoked!")
-    return call_intrin(args[0].dtype, "tir.likely", *args)
-
-
-def max_num_threads(func_id, args):
-    """Set the maximum number of threads."""
-    _internal_assert(func_id == "max_num_threads", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() <= 1, "At most one argument accepted!")
-    if args.__len__() == 0:
-        res = Target.current().max_num_threads
-    else:
-        _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint")
-        res = Target.current(args[0].value).max_num_threads
-    return convert(res)
-
-
-def inf(func_id, args):
-    """Infinity"""
-    _internal_assert(func_id == "inf", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 1, "One argument accepted!")
-    return tvm.tir.max_value(args[0])
-
-
-def ninf(func_id, args):
-    """Negative infinity"""
-    _internal_assert(func_id == "ninf", "This function cannot be directly invoked!")
-    _internal_assert(args.__len__() == 1, "One argument accepted!")
-    return tvm.tir.min_value(args[0])
diff --git a/python/tvm/te/hybrid/module.py b/python/tvm/te/hybrid/module.py
deleted file mode 100644
index 729805b31b6b..000000000000
--- a/python/tvm/te/hybrid/module.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Methods and data structures to support dumping HalideIR to Hybrid Script.
-This allows users to do quick hack to generated HalideIR and cast it back to
-TVM modules.
-
-To enable this feature, you need to build with -DUSE_HYBRID_DUMP=ON.
-"""
-
-import ast
-
-from tvm.contrib import utils
-from .utils import _internal_assert
-from .utils import _is_tvm_arg_types
-from .parser import source_to_op
-
-
-class HybridModule(object):
-    """The usage of Hybrid Module is very similar to conventional TVM module,
-    but conventional TVM module requires a function body which is already fully
-    lowered. This contradicts to the fact that Hybrid Module is originally a text
-    format for Phase 0 HalideIR. Thus, a totally separated module is defined."""
-
-    def __init__(self, src=None, name=None):
-        """The constructor of this a hybrid module
-
-        Parameters
-        ----------
-        src : str
-            The source code of this module
-
-        name : str
-            The name of this module
-        """
-        self.src_ = self.name = self.func_ = self.root_ = None
-        if src is not None:
-            temp = utils.tempdir()
-            dst = temp.relpath("script.py")
-            with open(dst, "w") as f:
-                f.write(f"import tvm\n@tvm.te.hybrid.script\n{src}")
-
-            if name is not None:
-                self.name = name
-            self.load(dst)
-
-    def __call__(self, *args):
-        if _is_tvm_arg_types(args):
-            return source_to_op(self.root_, args, globals(), {})
-        return self.func_(*args)
-
-    def get_source(self):
-        return self.src_
-
-    def save(self, path):
-        if not path.endswith(".py"):
-            path = path + ".py"
-        with open(path, "w") as f:
-            f.write(self.src_)
-
-    def load(self, path):
-        """Load the module from a python file
-
-        Parameters
-        ----------
-        path : str
-            Path to the given python file
-        """
-        with open(path, "r") as f:
-            self.src_ = f.read()
-
-        src = self.src_
-
-        class FindFunc(ast.NodeVisitor):
-            """Find the function in module to be loaded module."""
-
-            # pylint: disable=invalid-name
-            def __init__(self):
-                self.name = None
-                self.root = None
-
-            def visit_FunctionDef(self, node):
-                _internal_assert(self.name is None, "For now, only one function supported!")
-                self.name = node.name
-                _internal_assert(self.root is None, "For now, only one function supported!")
-                self.root = node
-
-        root = ast.parse(src)
-        finder = FindFunc()
-        finder.visit(root)
-        _internal_assert(finder.name is not None and finder.root is not None, "No function found!")
-        if self.name is None:
-            self.name = finder.name
-        self.root_ = finder.root
-
-        _, local_ = {}, {}
-        exec(self.src_, _, local_)  # pylint: disable=exec-used
-        local_.pop("tvm")
-        assert len(local_) == 1
-        self.func_ = list(local_.values())[0]
diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py
deleted file mode 100644
index bd5a060cd01c..000000000000
--- a/python/tvm/te/hybrid/parser.py
+++ /dev/null
@@ -1,658 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hybrid Script Parser"""
-
-import ast
-import operator
-import logging
-import sys
-import numbers
-
-from enum import Enum
-from tvm.ir import Array, Range
-import tvm.runtime
-import tvm.tir
-import tvm.te
-import tvm.te._ffi_api
-import tvm.arith
-
-from tvm.tir import expr as _expr
-from tvm.tir import stmt as _stmt
-from tvm.te.tensor import Tensor, Operation
-from tvm.tir import all as _all
-from tvm.tir import any as _any
-
-from .utils import _internal_assert
-from . import calls
-from . import utils
-from .preprocessor import determine_variable_usage
-
-
-def concat_list_to_block(lst):
-    """Concatenate a list of Python IR nodes to HalideIR Block"""
-    if not lst:
-        return utils.make_nop()
-    n = len(lst)
-    if n == 1:
-        return lst[0]
-    return _stmt.SeqStmt(lst)
-
-
-def visit_list_to_block(visit, lst):
-    """Visit and concatenate a list of Python IR nodes to HalideIR Block"""
-    lst = [visit(stmt) for stmt in lst if not utils.is_docstring(stmt)]
-    lst = [stmt for stmt in lst if not tvm.ir.structural_equal(stmt, utils.make_nop())]
-    if not lst:
-        return utils.make_nop()
-    return concat_list_to_block(lst)
-
-
-class Symbol(Enum):
-    """Enumerates types in the symbol table"""
-
-    Callable = 0
-    Input = 1
-    OutputBuffer = 2
-    GlobalBuffer = 3
-    LocalBuffer = 4
-    SharedBuffer = 5
-    ConstVar = 6
-    BufferVar = 7
-    LoopVar = 8
-    ConstLoopVar = 9
-    ThreadBind = 10
-
-
-def _floordiv(x, y):
-    if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp):
-        return tvm.tir.floordiv(x, y)
-    return operator.floordiv(x, y)
-
-
-def _floormod(x, y):
-    if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp):
-        return tvm.tir.floormod(x, y)
-    return operator.mod(x, y)
-
-
-class HybridParser(ast.NodeVisitor):
-    """Python AST visitor pass which finally lowers it to HalideIR"""
-
-    _binop_maker = {
-        ast.Add: operator.add,
-        ast.Sub: operator.sub,
-        ast.Mult: operator.mul,
-        ast.Div: operator.div if sys.version_info[0] == 2 else operator.truediv,
-        ast.FloorDiv: _floordiv,
-        ast.Mod: _floormod,
-        ast.BitOr: operator.or_,
-        ast.BitAnd: operator.and_,
-        ast.BitXor: operator.xor,
-        ast.Gt: operator.gt,
-        ast.GtE: operator.ge,
-        ast.Lt: operator.lt,
-        ast.LtE: operator.le,
-        ast.Eq: operator.eq,
-        ast.NotEq: operator.ne,
-        ast.And: _all,
-        ast.Or: _any,
-    }
-
-    _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: operator.not_}
-
-    def __init__(self, args, usage, symbols, closure_vars, func_name=None):
-        """
-        Parameters
-        ----------
-        args: A list of tvm.te.placeholder or te.var
-            Provided by the user, the argument list of the function to be lowered.
-
-        usage: A dict of variables used in last in this function
-            Provided by last lower pass, which collects this information
-
-        symbols : list of str
-            The symbol list of the global context of the function.
-
-        closure_vars: dict
-            A dict of external name reference captured by this function.
-
-        Returns
-        -------
-        func_name: str
-            The name of the function to be lowered; if not provided,
-            the compiler will use the name in the AST
-        """
-        self.args = list(args)
-        self.usage = usage.copy()
-
-        self.symbols = {}  # Symbol table
-        for k, v in symbols.items():
-            if callable(v):
-                self.add_symbol(k, Symbol.Callable, v)
-
-        self.closure_vars = closure_vars
-
-        self.binds = {}  # Thread binds
-        self.device = 0  # Is it generating device
-
-        self.func_name = func_name  # The name of the function to be lowered
-        self.outputs = []  # Output tensors' name
-        self.side_effect = set()  # Tensors with side effects
-        self.parsed_body = None  # The parsed HalideIR body
-        self.analyzer = tvm.arith.Analyzer()
-        self.returned = False  # If this function has a valid return
-
-    def add_symbol(self, key, ty, val):  # pylint: disable=invalid-name
-        """Add value to the symbol table context"""
-        if key in self.symbols.keys():
-            old = str(self.symbols[key])
-            new = str((ty, val))
-            _internal_assert(False, f"Name conflict in symbol table! [{key}] {old} -> {new}")
-
-        self.symbols[key] = ty, val
-
-        if ty == Symbol.ThreadBind:
-            if val.var.name not in self.binds.keys():
-                self.binds[val.var.name] = val
-                return
-            val_ = self.binds[val.var.name]
-            _internal_assert(
-                tvm.tir.analysis.expr_deep_equal(val_.dom.extent, val.dom.extent),
-                "Thread extents should be uniform!",
-            )
-            self.symbols[key] = ty, val_
-
-    def wrap_up_realize(self, node, body):
-        """Wrap up all the variables which will no longer be used"""
-        to_pop = []
-        for key, val in self.usage.items():
-            _, level, _ = val
-            if key not in self.symbols:
-                # don't realize the symbols that are never visited
-                continue
-            if level != node:
-                continue
-            _internal_assert(key in self.symbols.keys(), f"Unknown symbol {key}!")
-
-            ty, entry = self.symbols[key]  # pylint: disable=invalid-name
-            if ty in [Symbol.Input, Symbol.OutputBuffer]:
-                continue
-            if "Buffer" in ty.name:
-                _buf = entry
-                _scope = "global" if ty is Symbol.BufferVar else ty.name[:-6].lower()
-                to_pop.append(key)
-            else:
-                continue
-
-            if _scope == "global":
-                body = self.wrap_up_binds(body)
-
-            _domain = [Range.from_min_extent(0, i) for i in _buf.shape]
-            _dtype = _buf.dtype
-            _true = tvm.runtime.convert(True)
-            body = tvm.tir.ProducerRealize(_buf, _domain, _true, body, tvm.runtime.convert(_scope))
-
-        for elem in to_pop:
-            self.symbols.pop(elem)
-
-        return body
-
-    def wrap_up_binds(self, body):
-        for _, iter_var in self.binds.items():
-            ext = iter_var.dom.extent
-            body = tvm.tir.AttrStmt(iter_var, "thread_extent", ext, body)
-        self.binds = {}
-        return body
-
-    # pylint: disable=invalid-name, missing-docstring
-    def visit_Module(self, node):
-        _internal_assert(
-            len(node.body) == 1, "Only one-function source code will be fed to this parser!"
-        )
-        return self.visit(node.body[0])
-
-    def visit_FunctionDef(self, node):
-        _internal_assert(
-            len(node.args.args) == len(self.args),
-            "The number of arguments passed to the \
-                         function should be the same as it is defined!",
-        )
-        if self.func_name is None:
-            self.func_name = node.name
-        for idx, arg in enumerate(node.args.args):
-            _attr = "id" if sys.version_info[0] < 3 else "arg"  # To make py2 and 3 compatible
-            self.add_symbol(getattr(arg, _attr), Symbol.Input, self.args[idx])
-        res = visit_list_to_block(self.visit, node.body)
-        res = self.wrap_up_realize(node, res)
-        return self.wrap_up_binds(res)
-
-    def visit_Expr(self, node):
-        return self.visit(node.value)
-
-    def visit_Name(self, node):
-        name = node.id
-        if sys.version_info[0] == 2 and name in ["True", "False"]:
-            return tvm.runtime.convert(ast.literal_eval(name))
-
-        if name in self.closure_vars:
-            return tvm.runtime.convert(self.closure_vars[name])
-
-        ty, entry = self.symbols[name]
-        _internal_assert(name in self.symbols, f"Unknown symbol {name}!")
-        if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]:
-            return entry
-        if ty is Symbol.ThreadBind:
-            return entry.var
-        if ty is Symbol.ConstVar:
-            return entry if isinstance(node.ctx, ast.Load) else None
-        if ty is Symbol.BufferVar:
-            if isinstance(node.ctx, ast.Load):
-                return tvm.tir.ProducerLoad(entry, [tvm.runtime.const(0, "int32")])
-            return entry, [tvm.runtime.const(0, "int32")]
-        # Do I need any assertion here?
-        return entry
-
-    def visit_Num(self, node):
-        if isinstance(node.n, numbers.Integral):
-            dtype = "int32"
-        elif isinstance(node.n, float):
-            dtype = "float32"
-        else:
-            _internal_assert(
-                isinstance(node.n, bool), "The data type should be one of (int, float, bool)"
-            )
-            dtype = "bool"
-        return tvm.runtime.const(node.n, dtype)
-
-    def visit_NameConstant(self, node):
-        return tvm.tir.const(node.value)
-
-    def visit_AugAssign(self, node):
-        buf = self.visit(node.target)
-        rhs = self.visit(node.value)
-        if isinstance(buf, tuple):
-            _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!")
-            buf, args = buf
-        else:
-            args = [tvm.runtime.const(0, "int32")]
-        _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!")
-
-        read = tvm.tir.ProducerLoad(buf, args)
-        value = HybridParser._binop_maker[type(node.op)](read, rhs)
-
-        return tvm.tir.ProducerStore(buf, value, args)
-
-    def visit_Assign(self, node):
-        rhs = self.visit(node.value)
-        if isinstance(rhs, Operation):
-            rmap = {}
-            _internal_assert(
-                len(node.targets) == rhs.num_outputs, "Unable to detuple the outs to targets"
-            )
-            for i in range(rhs.num_outputs):
-                _internal_assert(
-                    isinstance(node.targets[i], ast.Name),
-                    "You should bind a pure name to the tensors",
-                )
-                self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i))
-                rmap[rhs.outputs[i].op] = rhs.output(i)
-            return utils.replace_io(rhs.body, rmap)
-
-        _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
-        lhs = node.targets[0]
-        if isinstance(rhs, _expr.PrimExpr):
-            rhs = self.analyzer.simplify(rhs)
-        if isinstance(lhs, ast.Name):
-            # TODO: support defined intermediate buffer later
-            lhs_ = lhs
-            lhs = lhs.id
-            if lhs in self.symbols.keys():
-                ty, _ = self.symbols[lhs]
-                _internal_assert(ty != Symbol.LoopVar, "Loop variable cannot be overwritten!")
-            decl, _, rw = self.usage[lhs]
-            if decl == lhs_:
-                _internal_assert(
-                    lhs not in self.symbols.keys(),
-                    "This value should not be defined before this point!",
-                )
-                if isinstance(rhs, tuple):
-                    shape, dtype, scope = rhs
-                    ph = tvm.te.placeholder(shape, dtype=dtype, name=lhs)
-                    self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph)
-                    if scope == "output":
-                        self.outputs.append(lhs)
-                    return utils.make_nop()
-                if isinstance(rhs, utils.halide_imm_types) and ast.Store not in rw:
-                    self.add_symbol(lhs, Symbol.ConstVar, rhs)
-                else:
-                    _internal_assert(
-                        self.device == 0,
-                        "Single variable not supported in devices' side!\n"
-                        + "If you are using GPU, please allocate a 'local' spad "
-                        + "outside the bind body",
-                    )
-                    ph = tvm.te.placeholder((1,), dtype=rhs.dtype, name=lhs)
-                    self.add_symbol(lhs, Symbol.BufferVar, ph)
-            lhs = self.visit(lhs_)
-            if lhs is not None:
-                buf, args = lhs
-                return tvm.tir.ProducerStore(buf, rhs, args)
-            return utils.make_nop()
-
-        lhs, args = self.visit(lhs)
-        _internal_assert(
-            isinstance(lhs, Tensor), "An array access's LHS is expected to be a expr.Call!"
-        )
-        res = tvm.tir.ProducerStore(lhs, rhs, args)
-        return res
-
-    def visit_Index(self, node):
-        if isinstance(node.value, ast.Tuple):
-            return self.visit(node.value)
-        return [self.visit(node.value)]
-
-    def visit_Attribute(self, node):
-        buf = self.visit(node.value)
-        return getattr(buf, node.attr)
-
-    def visit_Subscript(self, node):
-        args = self.visit(node.slice)
-        if sys.version_info >= (3, 9):
-            if not isinstance(node.slice, ast.Tuple):
-                args = [args]
-
-        arr = self.visit(node.value)
-        if isinstance(arr, (Array, list, tuple)):
-            for i in args:
-                if isinstance(i, numbers.Integral):
-                    arr = arr[i]
-                else:
-                    _internal_assert(
-                        isinstance(i, (_expr.IntImm,)), "All indices are supposed to be constants"
-                    )
-                    arr = arr[i.value]
-            return arr
-        if isinstance(node.ctx, ast.Load):
-            return tvm.tir.ProducerLoad(arr, args)
-        return arr, args
-
-    def visit_With(self, node):
-        if sys.version_info[0] < 3:
-            context = node.context_expr
-            option = node.optional_vars
-        else:
-            _internal_assert(len(node.items) == 1, "Only one with element is supported so far!")
-            context = node.items[0].context_expr
-            option = node.items[0].optional_vars
-        _internal_assert(isinstance(context, ast.Call), "The object must be a Python func call!")
-        _internal_assert(isinstance(option, ast.Name), "The object after 'as' must be an id!")
-        self.annotation[option.id] = context.func.id
-        return visit_list_to_block(self.visit, node.body)
-
-    def visit_If(self, node):
-        cond = self.analyzer.simplify(self.visit(node.test))
-
-        # Return no IfThenElse if proven
-        if isinstance(cond, _expr.IntImm):
-            if cond.value:
-                return visit_list_to_block(self.visit, node.body)
-            if node.orelse:
-                return visit_list_to_block(self.visit, node.orelse)
-            return utils.make_nop()
-
-        if_body = visit_list_to_block(self.visit, node.body)
-
-        if node.orelse:
-            else_body = visit_list_to_block(self.visit, node.orelse)
-        else:
-            else_body = None
-        return tvm.tir.IfThenElse(cond, if_body, else_body)
-
-    def visit_IfExp(self, node):
-        cond = self.visit(node.test)
-        if_body = self.visit(node.body)
-        else_body = self.visit(node.orelse)
-        return tvm.tir.Select(cond, if_body, else_body)
-
-    def visit_Compare(self, node):
-        _internal_assert(len(node.ops) == len(node.comparators), "#compare ops != #comparators")
-        ops = [self.visit(node.left)]
-        ops += [self.visit(i) for i in node.comparators]
-        res = []
-        for i in range(len(node.ops)):
-            lhs = ops[i]
-            rhs = ops[i + 1]
-            res.append(HybridParser._binop_maker[type(node.ops[i])](lhs, rhs))
-        return _all(*res)
-
-    def visit_BoolOp(self, node):
-        n = len(node.values)
-        if n == 1:
-            _internal_assert(isinstance(node.op, ast.Not), "Unary is supposed to be not!")
-            return operator.not_(self.visit(node.values[0]))
-        _internal_assert(isinstance(node.op, (ast.And, ast.Or)), "Binary is supposed to be and/or!")
-        values = [self.visit(i) for i in node.values]
-        return HybridParser._binop_maker[type(node.op)](*values)
-
-    def visit_UnaryOp(self, node):
-        operand = self.visit(node.operand)
-        return HybridParser._unaryop_maker[type(node.op)](operand)
-
-    def visit_BinOp(self, node):
-        lhs = self.visit(node.left)
-        rhs = self.visit(node.right)
-        return HybridParser._binop_maker[type(node.op)](lhs, rhs)
-
-    def visit_Call(self, node):
-        # Yet, no function pointer supported
-        _internal_assert(
-            isinstance(node.func, ast.Name), "Only id-function function call is supported so far!"
-        )
-
-        func_id = node.func.id
-        args = [self.visit(i) for i in node.args]
-        # Intrinsics'
-        if hasattr(calls, func_id):
-            return getattr(calls, func_id)(func_id, args)
-        # Contexts'
-        _internal_assert(
-            func_id in self.symbols.keys(),
-            f"The function called ({func_id}) is not in the context either!",
-        )
-        ty, entry = self.symbols[func_id]
-        _internal_assert(ty is Symbol.Callable, "Are you sure what you call is a function?!")
-        outs = entry(*args)
-        op = outs.op if isinstance(outs, Tensor) else outs[0].op
-        return op
-
-    def visit_For(self, node):
-        iter_var, low, ext, kind = self.visit(node.iter)
-        _internal_assert(
-            isinstance(node.target, ast.Name), "The loop iterator should be a variable!"
-        )
-
-        _name = node.target.id
-
-        if isinstance(kind, tuple):
-            low = self.analyzer.simplify(low)
-            ext = self.analyzer.simplify(ext)
-            _internal_assert(
-                isinstance(low, _expr.ConstExpr) and isinstance(ext, _expr.ConstExpr),
-                "Const range should start from a const " + "and iterate const times",
-            )
-
-            low, ext = low.value, ext.value
-            if ext > 114514:
-                logging.log(
-                    logging.CRITICAL, "[Warning] Are you sure to unroll a large loop in Python?"
-                )
-
-            bodies = []
-            for i in range(low, low + ext):
-                self.add_symbol(_name, Symbol.ConstLoopVar, i)
-                body = visit_list_to_block(self.visit, node.body)
-                body = self.wrap_up_realize(node, body)
-                bodies.append(body)
-                self.symbols.pop(_name)
-            return concat_list_to_block(bodies)
-
-        if iter_var is None:
-            _internal_assert(kind is not None, "The loop iterating function parse error!")
-            if isinstance(ext, _expr.PrimExpr):
-                dtype = ext.dtype
-            elif isinstance(ext, int):
-                dtype = "int32"
-            else:
-                raise NotImplementedError(f"Unsupported type of ext: {type(ext)}")
-            offset = iter_var = tvm.te.var(_name, dtype=dtype)
-            if not tvm.tir.analysis.expr_deep_equal(low, tvm.runtime.const(0, "int32")):
-                offset = iter_var + low
-            self.add_symbol(_name, Symbol.LoopVar, offset)
-            _body = visit_list_to_block(self.visit, node.body)
-        else:
-            _internal_assert(kind is None, "The loop bind function parse error!")
-            self.add_symbol(_name, Symbol.ThreadBind, iter_var)
-            self.device += 1
-            _body = visit_list_to_block(self.visit, node.body)
-            self.device -= 1
-
-        _body = self.wrap_up_realize(node, _body)
-
-        if kind is None:
-            res = _body
-        else:
-            _internal_assert(
-                not isinstance(kind, tuple), "Micro expansion should be handled before!"
-            )
-            res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, kind, _body)
-
-        self.symbols.pop(_name)
-        return res
-
-    def visit_Return(self, node):
-        _internal_assert(
-            all(ty != Symbol.LoopVar for ty, _ in self.symbols.values()),
-            "Return should not be in a loop body!",
-        )
-        ids = []
-        if isinstance(node.value, ast.Name):
-            ids = [node.value.id]
-        else:
-            _internal_assert(
-                isinstance(node.value, ast.Tuple),
-                "You should return either a single tensor or a tuple",
-            )
-            _internal_assert(
-                all(isinstance(i, ast.Name) for i in node.value.elts), "What do you return?"
-            )
-            ids = [i.id for i in node.value.elts]
-        _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples")
-        if len(ids) < len(self.outputs):
-            logging.log(logging.CRITICAL, "[Warning] Not all the output buffers returned!")
-        self.outputs = [self.symbols[i][1] for i in ids]
-        self.returned = True
-        return utils.make_nop()
-
-    def visit_Tuple(self, node):
-        return tuple(self.visit(i) for i in node.elts)
-
-    def visit_Str(self, node):
-        return node.s
-
-    def visit_Assert(self, node):
-        test = self.visit(node.test)
-        mesg = tvm.runtime.convert(self.visit(node.msg))
-        return tvm.tir.AssertStmt(test, mesg, utils.make_nop())
-
-
-def parse_python(src, args, symbols, closure_vars):
-    """The helper function of calling the AST visitor
-
-    Parameters
-    ----------
-    src : ast.node or str
-        If an ast.node, then directly lower it.
-        If a str, then parse it to ast and lower it.
-
-    args : list of Tensors or Vars
-        The argument lists to the function.
-        It is NOT encouraged to write a function without arguments.
-        It is NOT encouraged to write a function with side effect.
-
-    symbols : list of str
-        The symbol list of the global context of the function.
-
-    closure_vars: dict
-        A dict of external name reference captured by this function.
-
-    Returns
-    -------
-    root : Stmt
-        The result Halide IR and the parser class instance.
-    """
-    root = ast.parse(src) if isinstance(src, str) else src
-    _internal_assert(root, ast.AST)
-    var_usage = determine_variable_usage(root, args, symbols, closure_vars)
-    parser = HybridParser(args, var_usage, symbols, closure_vars)
-    parser.parsed_body = parser.visit(root)
-    _internal_assert(parser.returned, "No valid return found in the function body!")
-    return parser
-
-
-def source_to_op(src, args, symbols, closure_vars):
-    """Another level of wrapper
-
-    Parameters
-    ----------
-    src : ast.node or str
-        If an ast.node, then directly lower it.
-        If a str, then parse it to ast and lower it.
-
-    args : list of Tensors or Vars
-        The argument lists to the function.
-        It is NOT encouraged to write a function without arguments.
-        It is NOT encouraged to write a function with side effect.
-
-    symbols : list of str
-        The symbol list of the global context of the function.
-
-    closure_vars: dict
-        A dict of external name reference captured by this function.
-
-    Returns
-    -------
-    res : list of output tensors
-        The result of output tensors of the formed OpNode.
-    """
-    parser = parse_python(src, args, symbols, closure_vars)
-
-    input_tensors = []
-
-    def get_input_tensors(arg):
-        if isinstance(arg, Tensor):
-            input_tensors.append(arg)
-        elif isinstance(arg, Array):
-            for i in arg:
-                get_input_tensors(i)
-
-    for i in args:
-        get_input_tensors(i)
-    op = tvm.te._ffi_api.HybridOp(
-        parser.func_name, "HybridOp", None, input_tensors, parser.outputs, parser.parsed_body
-    )
-    res = [op.output(i) for i in range(len(parser.outputs))]
-    return res[0] if len(res) == 1 else res
diff --git a/python/tvm/te/hybrid/preprocessor.py b/python/tvm/te/hybrid/preprocessor.py
deleted file mode 100644
index 6af584060e9b..000000000000
--- a/python/tvm/te/hybrid/preprocessor.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Determines the declaration, r/w status, and last use of each variable"""
-
-import ast
-import sys
-from .runtime import HYBRID_GLOBALS
-from .utils import _internal_assert
-
-
-class PyVariableUsage(ast.NodeVisitor):
-    """The vistor class to determine the declaration, r/w status, and last use of each variable"""
-
-    # pylint: disable=invalid-name
-    # pylint: disable=missing-docstring
-    def __init__(self, args, symbols, closure_vars):
-        self.status = {}
-        self.scope_level = []
-        self._args = {}
-        self.args = args
-        self.aug_assign_ = False
-        self.symbols = symbols
-        self.closure_vars = closure_vars
-
-    def visit_FunctionDef(self, node):
-        self.scope_level.append(node)
-        _internal_assert(
-            len(node.args.args) == len(self.args),
-            "#arguments passed should be the same as #arguments defined",
-        )
-        for idx, arg in enumerate(node.args.args):
-            _attr = "id" if sys.version_info[0] < 3 else "arg"  # To make py2 and 3 compatible
-            self._args[getattr(arg, _attr)] = self.args[idx]
-        for i in node.body:
-            self.visit(i)
-
-    def visit_For(self, node):
-        _internal_assert(isinstance(node.target, ast.Name), "For's iterator should be an id")
-        self.visit(node.iter)
-        self.scope_level.append(node)
-        for i in node.body:
-            self.visit(i)
-        self.scope_level.pop()
-
-    def visit_Call(self, node):
-        # No function pointer supported so far
-        _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id")
-        func_id = node.func.id
-        _internal_assert(
-            func_id
-            in list(HYBRID_GLOBALS.keys())
-            + ["range", "max", "min", "len"]
-            + list(self.symbols.keys()),
-            "Function call id " + func_id + " not in intrinsics' list",
-        )
-        for elem in node.args:
-            self.visit(elem)
-
-    def visit_AugAssign(self, node):
-        self.aug_assign_ = True
-        self.generic_visit(node)
-        self.aug_assign_ = False
-
-    def visit_Name(self, node):
-        # If it is True or False, we do not worry about it!
-        if sys.version_info[0] == 2 and node.id in ["True", "False"]:
-            return
-        # If it is from the argument list or loop variable, we do not worry about it!
-        if node.id in self._args.keys():
-            return
-        fors = [loop.target.id for loop in self.scope_level if isinstance(loop, ast.For)]
-        if node.id in fors:
-            return
-        # The loop variable cannot be overwritten when iteration
-        _internal_assert(
-            not isinstance(node.ctx, ast.Store) or node.id not in fors,
-            "Iter var cannot be overwritten",
-        )
-
-        if node.id not in self.status.keys():
-            # It is a captured value in closure
-            if node.id in self.closure_vars:
-                try:
-                    ast.literal_eval(str(self.closure_vars[node.id]))
-                except ValueError:
-                    raise ValueError("Only support capturing constant values in closure")
-                return
-
-            _internal_assert(isinstance(node.ctx, ast.Store), f"Undeclared variable {node.id}")
-            if self.aug_assign_:
-                raise ValueError('"First store" cannot be an AugAssign')
-            self.status[node.id] = (node, self.scope_level[-1], set())
-        else:
-            decl, loop, usage = self.status[node.id]
-            usage.add(type(node.ctx))
-            _internal_assert(
-                loop in self.scope_level, f"{node.id} is used out of the scope it is defined!"
-            )
-            self.status[node.id] = (decl, loop, usage)
-
-
-def determine_variable_usage(root, args, symbols, closure_vars):
-    """The helper function for calling the dedicated visitor."""
-    visitor = PyVariableUsage(args, symbols, closure_vars)
-    visitor.visit(root)
-    return visitor.status
diff --git a/python/tvm/te/hybrid/runtime.py b/python/tvm/te/hybrid/runtime.py
deleted file mode 100644
index 615bd7e43a7d..000000000000
--- a/python/tvm/te/hybrid/runtime.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Intrinsics of TVM-Python Hybrid Script for Python emulation runtime"""
-
-import numpy
-from tvm.target import Target
-
-
-class bind(object):  # pylint: disable=invalid-name
-    """GPU bind software emulataion runtime."""
-
-    def __init__(self, _, ext):
-        self.ext = ext
-
-    def __iter__(self):
-        i = 0
-        while i < self.ext:
-            yield i
-            i += 1
-
-
-def allocate(shape, dtype="float32", scope="global"):  # pylint: disable=unused-argument
-    """Allocate a buffer with given shape
-
-    Parameters
-    ----------
-    shape: Tuple
-        The shape of the tensor to be allocated
-    dtype: string
-        The data type of the tensor
-    scope: string
-        The storage scope of the tensor
-
-    Returns
-    -------
-    tensor: numpy.array
-        The tensor allocated
-    """
-    return numpy.zeros(shape).astype(dtype)
-
-
-def rsqrt(x):
-    """
-    Computes reciprocal of square root of x element-wise
-
-    Parameters
-    ----------
-    x: Tensor
-
-    Returns
-    -------
-    res: Tensor
-        The result of reciprocal of square root of x
-    """
-    return numpy.ones_like(x) / numpy.sqrt(x)
-
-
-def popcount(x):
-    """
-    Count ones in the binary representation of number x
-
-    Parameters
-    ----------
-    x: Integer
-        The number to be counted
-
-    Returns
-    -------
-    cnt: Integer
-        The number of ones in the binary representation of number x
-    """
-    cnt = 0
-    while x:
-        x -= x & -x
-        cnt += 1
-    return cnt
-
-
-def sigmoid(x):
-    """
-    Sigmoid function of x, aka 1/(1+exp(-x)).
-
-    Parameters
-    ----------
-    x: a real number
-
-    Returns
-    -------
-    res: a real number
-        The result of sigmoid function
-    """
-    return 1 / (1 + numpy.exp(-x))
-
-
-def max_num_threads(allow_none=True):
-    """Get max number of threads for GPU targets."""
-    return Target.current(allow_none).max_num_threads
-
-
-def inf(dtype):
-    return numpy.iinfo(dtype).max
-
-
-def ninf(dtype):
-    return numpy.iinfo(dtype).min
-
-
-HYBRID_GLOBALS = {
-    "unroll": range,
-    "vectorize": range,
-    "parallel": range,
-    "const_range": range,
-    "bind": bind,
-    "allocate": allocate,
-    "output_tensor": allocate,
-    "sqrt": numpy.sqrt,
-    "rsqrt": rsqrt,
-    "log": numpy.log,
-    "tanh": numpy.tanh,
-    "power": numpy.power,
-    "exp": numpy.exp,
-    "sigmoid": sigmoid,
-    "popcount": popcount,
-    "round": round,
-    "likely": lambda cond: cond,
-    "uint8": numpy.uint8,
-    "uint16": numpy.uint16,
-    "uint32": numpy.uint32,
-    "uint64": numpy.uint64,
-    "int8": numpy.int8,
-    "int16": numpy.int16,
-    "int32": numpy.int32,
-    "int64": numpy.int64,
-    "float16": numpy.float16,
-    "float32": numpy.float32,
-    "float64": numpy.float64,
-    "ceil_div": lambda a, b: (a + b - 1) // b,
-    "max_num_threads": max_num_threads,
-    "inf": inf,
-    "ninf": inf,
-}
-
-
-def _enter_hybrid_runtime(func):
-    """Put hybrid runtime variables into the global scope"""
-    _globals = func.__globals__
-    intersect = []
-    for elem in list(HYBRID_GLOBALS.keys()):
-        if elem in _globals.keys():
-            intersect.append((elem, _globals[elem]))
-        _globals[elem] = HYBRID_GLOBALS[elem]
-    return intersect
-
-
-def _restore_runtime(func, intersect):
-    """Rollback the modification caused by hybrid runtime"""
-    _globals = func.__globals__
-    for elem in list(HYBRID_GLOBALS.keys()):
-        _globals.pop(elem)
-    for k, v in intersect:
-        _globals[k] = v
diff --git a/python/tvm/te/hybrid/utils.py b/python/tvm/te/hybrid/utils.py
deleted file mode 100644
index a515938fa524..000000000000
--- a/python/tvm/te/hybrid/utils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=inconsistent-return-statements
-"""Internal utilities for parsing Python subset to TIR"""
-
-import ast
-import inspect
-import logging
-import sys
-import numpy
-
-import tvm.runtime
-from tvm._ffi.base import numeric_types
-from tvm.ir.container import Array
-
-from tvm.tir import expr as _expr
-from tvm.tir import stmt as _stmt
-from tvm.te.tensor import Tensor
-
-
-# pylint: disable=invalid-name
-np_arg_types = (numpy.ndarray, *numeric_types)
-tvm_arg_types = (Tensor, Array, _expr.Var, _expr.ConstExpr, *numeric_types, list, tuple, str)
-halide_imm_types = (_expr.IntImm, _expr.FloatImm, *numeric_types)
-
-
-def _internal_assert(cond, err):
-    """Simplify the code segment like if not XXX then raise an error"""
-    if not cond:
-        raise ValueError(err)
-
-
-# Useful constants. In avoid of runtime dependences, we use function calls to return them.
-def make_nop():
-    """Returns a 'no operation' node in HalideIR."""
-    return _stmt.Evaluate(tvm.runtime.const(0, dtype="int32"))
-
-
-def is_docstring(node):
-    """Checks if a Python AST node is a docstring"""
-    return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str)
-
-
-def _pruned_source(func):
-    """Prune source code's extra leading spaces"""
-    try:
-        lines = inspect.getsource(func).split("\n")
-        leading_space = len(lines[0]) - len(lines[0].lstrip(" "))
-        lines = [line[leading_space:] for line in lines]
-        return "\n".join(lines)
-    except IOError as err:
-        if sys.version_info[0] == 2 and str(err) == "could not get source code":
-            logging.log(
-                logging.CRITICAL,
-                "This module is not fully operated under Python2... " "Please move to Python3!",
-            )
-            raise err
-
-
-def replace_io(body, rmap):
-    """Replacing tensors usage according to the dict given"""
-    # pylint: disable=import-outside-toplevel
-    from tvm.tir import stmt_functor
-
-    def replace(op):
-        if isinstance(op, _stmt.ProducerStore) and op.producer.op in rmap.keys():
-            buf = rmap[op.producer.op]
-            return _stmt.ProducerStore(buf, op.value, op.indices)
-        if isinstance(op, _expr.ProducerLoad) and op.producer.op in rmap.keys():
-            buf = rmap[op.producer.op]
-            return _expr.ProducerLoad(buf, op.indices)
-        return None
-
-    return stmt_functor.ir_transform(body, None, replace, ["tir.ProducerStore", "tir.ProducerLoad"])
-
-
-def _is_tvm_arg_types(args):
-    """Determine a list of element is either a list of tvm arguments of a list of numpy arguments.
-    If neither is true, raise a value error."""
-    if all(isinstance(elem, tvm_arg_types) for elem in args):
-        return True
-    elif all(isinstance(elem, np_arg_types) for elem in args):
-        return False
-    else:
-        raise ValueError(
-            f"Expected arguments to be entirely TVM types, "
-            f"or entirely numpy types, "
-            f"but received {[type(elem) for elem in args]}"
-        )
diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py
index 63a3ecd57b1c..a9681c6df040 100644
--- a/python/tvm/te/operation.py
+++ b/python/tvm/te/operation.py
@@ -620,3 +620,6 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
     if not isinstance(ops, (list, tuple, Array)):
         ops = [ops]
     return _ffi_api.CreatePrimFunc(ops, index_dtype_override)
+
+
+AXIS_SEPARATOR = tvm.tir.IndexMap.AXIS_SEPARATOR
diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py
deleted file mode 100644
index 87a4eda728df..000000000000
--- a/python/tvm/te/schedule.py
+++ /dev/null
@@ -1,665 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import
-"""The computation schedule api of TVM."""
-import collections
-import inspect
-from typing import Callable, List
-
-import tvm._ffi
-from tvm._ffi.base import string_types
-from tvm.ir import container as _container
-from tvm.runtime import Object, convert
-from tvm.tir import Buffer, IndexMap, IterVar, Var
-
-from . import _ffi_api
-from . import tensor as _tensor
-
-
-@tvm._ffi.register_object
-class Split(Object):
-    """Split operation on axis."""
-
-
-@tvm._ffi.register_object
-class Fuse(Object):
-    """Fuse operation on axis."""
-
-
-@tvm._ffi.register_object
-class Singleton(Object):
-    """Singleton axis."""
-
-
-def create_schedule(ops):
-    """Create a schedule for list of ops
-
-    Parameters
-    ----------
-    ops : list of Operations
-        The source expression.
-
-    Returns
-    -------
-    sch : schedule.Schedule
-        The created schedule.
-    """
-    if not isinstance(ops, (list, _container.Array)):
-        ops = [ops]
-    return _ffi_api.CreateSchedule(ops)
-
-
-@tvm._ffi.register_object
-class Schedule(Object):
-    """Schedule for all the stages."""
-
-    def __getitem__(self, k):
-        if isinstance(k, _tensor.Tensor):
-            k = k.op
-        if not isinstance(k, _tensor.Operation):
-            raise ValueError("Expect schedule key to be Tensor or Operation")
-        if k not in self.stage_map:
-            raise ValueError(f"Cannot find the operation {k} in schedule")
-        return self.stage_map[k]
-
-    def normalize(self):
-        """Build a normalized schedule from the current schedule.
-
-        Insert necessary rebase to make certain iter var to start from 0.
-        This is needed before bound inference and followup step.
-
-        Returns
-        -------
-        sch : Schedule
-            The normalized schedule.
-        """
-        return _ffi_api.ScheduleNormalize(self)
-
-    def create_group(self, outputs, inputs, include_inputs=False):
-        """Create stage group by giving output and input boundary.
-
-        The operators between outputs and inputs are placed as member of group.
-        outputs are include in the group, while inputs are not included.
-
-        Parameters
-        ----------
-        outputs : list of Tensors
-            The outputs of the group.
-
-        inputs : list of Tensors
-            The inputs of the group.
-
-        include_inputs : boolean, optional
-            Whether include input operations in the group if they are used by outputs.
-
-        Returns
-        -------
-        group : Stage
-            A virtual stage represents the group, user can use compute_at to move
-            the attachment point of the group.
-        """
-        if isinstance(outputs, _tensor.Tensor):
-            outputs = [outputs]
-        if isinstance(inputs, _tensor.Tensor):
-            inputs = [inputs]
-        return _ffi_api.ScheduleCreateGroup(self, outputs, inputs, include_inputs)
-
-    def cache_read(self, tensor, scope, readers):
-        """Create a cache read of original tensor for readers.
-
-        This will mutate the body of the readers.
-        A new cache stage will be created for the tensor.
-        Call this before doing any split/fuse schedule.
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be cached.
-        scope : str
-            The scope of cached
-        readers : list of Tensor or Operation
-            The readers to read the cache.
-
-        Returns
-        -------
-        cache : Tensor
-            The created cache tensor.
-        """
-        if isinstance(readers, (_tensor.Tensor, _tensor.Operation)):
-            readers = [readers]
-        readers = [t.op if isinstance(t, _tensor.Tensor) else t for t in readers]
-        return _ffi_api.ScheduleCacheRead(self, tensor, scope, readers)
-
-    def cache_write(self, tensor, scope):
-        """Create a cache write of original tensor, before storing into tensor.
-
-        This will mutate the body of the tensor.
-        A new cache stage will created before feed into the tensor.
-
-        This function can be used to support data layout transformation.
-        If there is a split/fuse/reorder on the data parallel axis of tensor
-        before cache_write is called. The intermediate cache stores
-        the data in the layout as the iteration order of leave axis.
-        The data will be transformed back to the original layout in the original tensor.
-        User can further call compute_inline to inline the original layout and keep
-        the data stored in the transformed layout.
-
-        Parameters
-        ----------
-        tensor : Tensor, list or tuple
-            The tensors to be feed to. All the tensors must be produced by one computeOp
-        scope : str
-            The scope of cached
-
-        Returns
-        -------
-        cache : Tensor
-            The created cache tensor.
-        """
-        return _ffi_api.ScheduleCacheWrite(self, tensor, scope)
-
-    def rfactor(self, tensor, axis, factor_axis=0):
-        """Factor a reduction axis in tensor's schedule to be an explicit axis.
-
-        This will create a new stage that generated the new tensor with axis
-        as the first dimension. The tensor's body will be rewritten as a reduction
-        over the factored tensor.
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be factored.
-        axis : IterVar
-            The reduction axis in the schedule to be factored.
-        factor_axis : int
-            The position where the new axis is placed.
-
-        Returns
-        -------
-        tfactor : Tensor or Array of Tensor
-            The created factored tensor.
-        """
-        factored = _ffi_api.ScheduleRFactor(self, tensor, axis, factor_axis)
-        return factored[0] if len(factored) == 1 else factored
-
-
-@tvm._ffi.register_object
-class Stage(Object):
-    """A Stage represents schedule for one operation."""
-
-    def split(self, parent, factor=None, nparts=None, disable_predication=False):
-        """Split the stage either by factor providing outer scope, or both
-
-        Parameters
-        ----------
-        parent : IterVar
-             The parent iter var.
-
-        factor : Expr, optional
-             The splitting factor
-
-        nparts : Expr, optional
-             The number of outer parts.
-
-        disable_predication : bool, optional
-            If enabled, don't create a predicate for guarding the loop. This can
-            be useful when splitting with scalable factors that the schedule writer
-            knows are divisible by the loop bound.
-
-            Warning: enabling this feature may result in incorrect code generation
-            if not used carefully.
-
-        Returns
-        -------
-        outer : IterVar
-            The outer variable of iteration.
-
-        inner : IterVar
-            The inner variable of iteration.
-        """
-        if nparts is not None:
-            if factor is not None:
-                raise ValueError("Do not need to provide both outer and nparts")
-            outer, inner = _ffi_api.StageSplitByNParts(self, parent, nparts, disable_predication)
-        else:
-            if factor is None:
-                raise ValueError("Either nparts or factor need to be provided")
-            outer, inner = _ffi_api.StageSplitByFactor(self, parent, factor, disable_predication)
-        return outer, inner
-
-    def fuse(self, *args):
-        """Fuse multiple consecutive iteration variables into a single iteration variable.
-
-        fused = fuse(...fuse(fuse(args[0], args[1]), args[2]),..., args[-1])
-        The order is from outer to inner.
-
-        Parameters
-        ----------
-        args : list of IterVars
-            Itervars that proceeds each other
-
-        Returns
-        -------
-        fused : IterVar
-            The fused variable of iteration.
-        """
-        fused = _ffi_api.StageFuse(self, args)
-        return fused
-
-    def set_scope(self, scope):
-        """Set the thread scope of this stage
-
-        Parameters
-        ----------
-        scope : str
-            The thread scope of this stage
-        """
-        return _ffi_api.StageSetScope(self, scope)
-
-    def bind(self, ivar, thread_ivar):
-        """Bind ivar to thread index thread_ivar
-
-        Parameters
-        ----------
-        ivar : IterVar
-            The iteration to be binded to thread.
-
-        thread_ivar : IterVar
-            The thread to be binded.
-        """
-        _ffi_api.StageBind(self, ivar, thread_ivar)
-
-    def env_threads(self, threads):
-        """Mark threads to be launched at the outer scope of composed op.
-
-        Parameters
-        ----------
-        threads : list of threads
-            The threads to be launched.
-        """
-        if isinstance(threads, IterVar):
-            threads = [threads]
-        _ffi_api.StageEnvThreads(self, threads)
-
-    def set_store_predicate(self, predicate):
-        """Set predicate under which store to the array can be performed.
-
-        Use this when there are duplicated threads doing the same store and we only
-        need one of them to do the store.
-
-        Parameters
-        ----------
-        predicate : Expr
-            The guard condition fo store.
-        """
-        _ffi_api.StageSetStorePredicate(self, predicate)
-
-    def compute_at(self, parent, scope):
-        """Attach the stage at parent's scope
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-
-        scope : IterVar
-            The loop scope t be attached to.
-        """
-        _ffi_api.StageComputeAt(self, parent, scope)
-
-    def compute_inline(self):
-        """Mark stage as inline
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-        """
-        _ffi_api.StageComputeInline(self)
-
-    def compute_root(self):
-        """Attach the stage at parent, and mark it as root
-
-        Parameters
-        ----------
-        parent : Stage
-            The parent stage
-        """
-        _ffi_api.StageComputeRoot(self)
-
-    def reorder(self, *args):
-        """reorder the arguments in the specified order.
-
-        Parameters
-        ----------
-        args : list of IterVar
-            The order to be ordered
-        """
-        _ffi_api.StageReorder(self, args)
-
-    def tile(self, x_parent, y_parent, x_factor, y_factor):
-        """Perform tiling on two dimensions
-
-        The final loop order from outmost to inner most are
-        [x_outer, y_outer, x_inner, y_inner]
-
-        Parameters
-        ----------
-        x_parent : IterVar
-            The original x dimension
-        y_parent : IterVar
-            The original y dimension
-        x_factor : Expr
-            The stride factor on x axis
-        y_factor : Expr
-            The stride factor on y axis
-
-        Returns
-        -------
-        x_outer : IterVar
-            Outer axis of x dimension
-        y_outer : IterVar
-            Outer axis of y dimension
-        x_inner : IterVar
-            Inner axis of x dimension
-        p_y_inner : IterVar
-            Inner axis of y dimension
-        """
-        x_outer, y_outer, x_inner, y_inner = _ffi_api.StageTile(
-            self, x_parent, y_parent, x_factor, y_factor
-        )
-        return x_outer, y_outer, x_inner, y_inner
-
-    def vectorize(self, var):
-        """Vectorize the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be vectorize
-        """
-        _ffi_api.StageVectorize(self, var)
-
-    def tensorize(self, var, tensor_intrin):
-        """Tensorize the computation enclosed by var with tensor_intrin
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration boundary of tensorization.
-
-        tensor_intrin : TensorIntrin
-            The tensor intrinsic used for computation.
-        """
-        _ffi_api.StageTensorize(self, var, tensor_intrin)
-
-    def unroll(self, var):
-        """Unroll the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be unrolled.
-        """
-        _ffi_api.StageUnroll(self, var)
-
-    def parallel(self, var):
-        """Parallelize the iteration.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be parallelized.
-        """
-        _ffi_api.StageParallel(self, var)
-
-    def pragma(self, var, pragma_type, pragma_value=None):
-        """Annotate the iteration with pragma
-
-        This will translate to a pragma_scope surrounding
-        the corresponding loop generated.
-        Useful to support experimental features and extensions.
-
-        Parameters
-        ----------
-        var : IterVar
-            The iteration to be anotated
-
-        pragma_type : str
-             The pragma string to be annotated
-
-        pragma_value : Expr, optional
-             The pragma value to pass along the pragma
-
-        Note
-        ----
-        Most pragmas are advanced/experimental features
-        and may subject to change. List of supported pragmas:
-
-        - **debug_skip_region**
-
-          Force skip the region marked by the axis and turn it into no-op.
-          This is useful for debug purposes.
-
-        - **parallel_launch_point**
-
-          Specify to launch parallel threads outside the
-          specified iteration loop. By default the threads
-          launch at the point of parallel construct.
-          This pragma moves the launching point to even outer scope.
-          The threads are launched once and reused across multiple
-          parallel constructs as BSP style program.
-
-        - **parallel_barrier_when_finish**
-
-          Insert a synchronization barrier between working threads
-          after the specified loop iteration finishes.
-
-        - **parallel_stride_pattern**
-
-          Hint parallel loop to execute in strided pattern.
-          :code:`for (int i = task_id; i < end; i += num_task)`
-
-        """
-        if isinstance(pragma_value, string_types):
-            pragma_value = convert(pragma_value)
-        _ffi_api.StagePragma(self, var, pragma_type, pragma_value)
-
-    def prefetch(self, tensor, var, offset):
-        """Prefetch the specified variable
-
-        Parameters
-        ----------
-        tensor : Tensor
-            The tensor to be prefetched
-        var : IterVar
-            The loop point at which the prefetching is applied
-        offset : Expr
-            The number of iterations to be prefetched before actual execution
-        """
-        _ffi_api.StagePrefetch(self, tensor, var, offset)
-
-    def storage_align(self, axis, factor, offset):
-        """Set alignment requirement for specific axis
-
-        This ensures that stride[axis] == k * factor + offset for some k.
-        This is useful to set memory layout to for more friendly memory
-        access pattern. For example, we can set alignment to be
-        factor=2, offset=1 to avoid bank conflict for thread access on
-        higher dimension in GPU shared memory.
-
-        Parameters
-        ----------
-        axis : IterVar
-            The axis dimension to be aligned.
-        factor : int
-            The factor in alignment specification.
-        offset : int
-            The offset in the alignment specification.
-        """
-        _ffi_api.StageStorageAlign(self, axis, factor, offset)
-
-    def double_buffer(self):
-        """Compute the current stage via double buffering.
-
-        This can only be applied to intermediate stage.
-        This will double the storage cost of the current stage.
-        Can be useful to hide load latency.
-        """
-        _ffi_api.StageDoubleBuffer(self)
-
-    def rolling_buffer(self):
-        """Compute the current stage via rolling buffering.
-
-        This can only be applied to intermediate stage.
-        This will change the storage cost of the current stage.
-        """
-        _ffi_api.StageRollingBuffer(self)
-
-    def transform_layout(self, mapping_function: Callable[..., List[tvm.tir.PrimExpr]]):
-        """Defines the layout transformation for the current stage's tensor.
-
-        The map from initial_indices to final_indices must be an
-        invertible affine transformation.  This method may be called
-        more than once for a given tensor, in which case each
-        transformation is applied sequentially.
-
-        If the stage is a ComputeOp, then the iteration order of the
-        compute stage is rewritten to be a row-major traversal of the
-        tensor, and the new loop iteration variables are returned.
-        For all other stages, the loop iteration order is unmodified,
-        and the return value is None.
-
-        Parameters
-        ----------
-        mapping_function : Callable[..., List[tvm.tir.PrimExpr]]
-
-            A callable that accepts N arguments of type tvm.tir.Var,
-            and outputs a list of PrimExpr.  The input arguments
-            represent the location of a value in the current stage's
-            tensor, using the pre-transformation layout.  The return
-            value of the function gives the location of that value in
-            the current stage's tensor, using the post-transformation
-            layout.
-
-        Returns
-        -------
-        new_iter_vars : Optional[List[tvm.tir.IterVar]]
-
-            If the stage is a ComputeOp, then the return will be the
-            updated loop iteration variables over the data array, in
-            the same order as the output values from the
-            `mapping_function`.
-
-            Otherwise, the return value is None.
-
-        Examples
-        --------
-        .. code-block:: python
-
-            # ``A`` is a tensor whose compute definition is in NHWC
-            # format, and should be transformed into NCHWc format.
-
-            s[A].transform_layout(
-                lambda n,h,w,c: [n, c//4, h, w, c%4]
-            )
-
-
-        .. code-block:: python
-
-            # ``A`` is a tensor whose compute definition is in an
-            # arbitrary format, and should be transformed such that
-            # the last index is split, with the slower-changing index
-            # of the split placed at the slowest changing dimension.
-
-            s[A].transform_layout(
-                lambda *indices, i: [i//4, *indices, i%4]
-            )
-
-        .. code-block:: python
-
-            # ``B`` is a tensor defined by te.compute to be a copy of
-            # ``A`, and should be transformed such that ``B``'s layout
-            # is a transpose of ``A``'s layout.  The loop iteration
-            # that computes ``B`` will correspond to ``B``'s memory
-            # layout.
-
-            A = te.placeholder([n,m])
-            B = te.compute(A.shape, lambda i,j: A[i,j])
-            s = te.create_schedule(B.op)
-
-            s[B].transform_layout(lambda i,j: [j,i])
-
-        """
-
-        ndim = len(self.op.output(0).shape)
-        index_map, axis_separators = IndexMap.from_func_with_separators(
-            mapping_function, ndim=ndim, index_dtype="int32"
-        )
-
-        new_iter_vars = _ffi_api.StageTransformLayout(
-            self, index_map.initial_indices, index_map.final_indices
-        )
-        _ffi_api.StageSetAxisSeparators(self, axis_separators)
-
-        return new_iter_vars or None
-
-
-@tvm._ffi.register_object
-class SpecializedCondition(Object):
-
-    """Specialized condition to enable op specialization."""
-
-    def __init__(self, conditions):
-        """Create a specialized condition.
-
-        .. note::
-            Conditions are represented in conjunctive joint form (CNF).
-            Each condition should be a simple expression, e.g., n > 16,
-            m % 8 == 0, etc., where n, m are tvm.Var that represents a
-            dimension in the tensor shape.
-
-        Parameters
-        ----------
-        conditions : List of tvm.Expr
-            List of conditions in conjunctive joint form (CNF).
-        """
-        if not isinstance(conditions, (list, _container.Array)):
-            conditions = [conditions]
-        self.__init_handle_by_constructor__(_ffi_api.CreateSpecializedCondition, conditions)
-
-    @staticmethod
-    def current():
-        """Returns the current specialized condition"""
-        return _ffi_api.GetCurrentSpecialization()
-
-    def __enter__(self):
-        _ffi_api.EnterSpecializationScope(self)
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        _ffi_api.ExitSpecializationScope(self)
-
-
-# Sentinel value used to indicate which groups of pre-flattening axes
-# should be used to post-flattening axes.  Moved from
-# te.AXIS_SEPARATOR to tir.IndexMap.AXIS_SEPARATOR for general use,
-# maintained here for backwards compatibility.
-AXIS_SEPARATOR = IndexMap.AXIS_SEPARATOR
-
-
-tvm._ffi._init_api("schedule", __name__)
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 930667242e29..53ab9d0b5b59 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -190,13 +190,3 @@ def scan_axis(self):
 @tvm._ffi.register_object
 class ExternOp(Operation):
     """External operation."""
-
-
-@tvm._ffi.register_object
-class HybridOp(Operation):
-    """Hybrid operation."""
-
-    @property
-    def axis(self):
-        """Represent the IterVar axis, also defined when it is a HybridOp"""
-        return self.__getattr__("axis")
diff --git a/python/tvm/te/tensor_intrin.py b/python/tvm/te/tensor_intrin.py
deleted file mode 100644
index ff633af02d13..000000000000
--- a/python/tvm/te/tensor_intrin.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Tensor intrinsics"""
-import tvm._ffi
-import tvm.tir
-
-from tvm.runtime import Object, convert
-from tvm.ir import Range
-from .tensor import PlaceholderOp
-
-from . import tensor as _tensor
-from . import _ffi_api
-
-
-def _get_region(tslice):
-    region = []
-    for idx in tslice.indices:
-        if isinstance(idx, slice):
-            assert idx.step is None
-            region.append(Range(idx.start, idx.stop))
-        else:
-            if isinstance(idx, tvm.tir.IterVar):
-                begin = idx.var
-            else:
-                begin = idx
-            region.append(Range.from_min_extent(begin, 1))
-    return region
-
-
-@tvm._ffi.register_object
-class TensorIntrin(Object):
-    """Tensor intrinsic functions for certain computation.
-
-    See Also
-    --------
-    decl_tensor_intrin: Construct a TensorIntrin
-    """
-
-    def __call__(self, *args, **kwargs):
-        tensors = [x.tensor for x in args if isinstance(x, _tensor.TensorSlice)]
-        scalar_inputs = [x for x in args if not isinstance(x, _tensor.TensorSlice)]
-        regions = [_get_region(x) for x in args if isinstance(x, _tensor.TensorSlice)]
-        reduce_axis = []
-        if "reduce_axis" in kwargs:
-            reduce_axis = kwargs["reduce_axis"]
-            if not isinstance(reduce_axis, (list, tuple)):
-                reduce_axis = [reduce_axis]
-            reduce_axis = convert(reduce_axis)
-        if scalar_inputs:
-            scalar_inputs = convert(scalar_inputs)
-        return _ffi_api.TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs)
-
-
-def decl_tensor_intrin(
-    op, fcompute, name="tensor_intrin", binds=None, scalar_params=None, default_buffer_params=None
-):
-    """Declare a tensor intrinsic function.
-
-    Parameters
-    ----------
-    op: Operation
-        The symbolic description of the intrinsic operation
-
-    fcompute: lambda function of inputs, outputs-> stmt
-        Specifies the IR statement to do the computation.
-        See the following note for function signature of fcompute
-
-        .. note::
-             **Parameters**
-
-             - **ins** (list of :any:`tvm.tir.Buffer`) - Placeholder for each inputs
-             - **outs** (list of :any:`tvm.tir.Buffer`) - Placeholder for each outputs
-
-             **Returns**
-
-             - **stmt** (:any:`tvm.tir.Stmt`, or tuple of three stmts)
-             - If a single stmt is returned, it represents the body
-             - If tuple of three stmts are returned they corresponds to body,
-               reduce_init, reduce_update
-
-    name: str, optional
-        The name of the intrinsic.
-
-    binds: dict of :any:`Tensor` to :any:`tvm.tir.Buffer`, optional
-        Dictionary that maps the Tensor to Buffer which specified the data layout
-        requirement of the function. By default, a new compact buffer is created
-        for each tensor in the argument.
-
-    scalar_params: a list of variables used by op, whose values will be passed
-                   as scalar_inputs when the tensor intrinsic is called.
-
-    default_buffer_params: Optional[dict]
-        Dictionary of buffer arguments to be passed when constructing a buffer.
-
-    Returns
-    -------
-    intrin: TensorIntrin
-        A TensorIntrin that can be used in tensorize schedule.
-    """
-    if not isinstance(op, _tensor.Operation):
-        raise TypeError("expect Operation")
-    inputs = op.input_tensors
-    binds = binds if binds else {}
-    tensors = list(inputs)
-    for i in range(op.num_outputs):
-        tensors.append(op.output(i))
-
-    binds_list = []
-    for t in inputs:
-        if not isinstance(t.op, PlaceholderOp):
-            raise ValueError("Do not yet support composition op")
-
-    default_buffer_params = {} if default_buffer_params is None else default_buffer_params
-    for t in tensors:
-        buf = (
-            binds[t]
-            if t in binds
-            else tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name, **default_buffer_params)
-        )
-        binds_list.append(buf)
-
-    if scalar_params:
-        body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :], scalar_params)
-    else:
-        body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :])
-        scalar_params = []
-    if isinstance(body, (tvm.tir.PrimExpr, tvm.tir.Stmt)):
-        body = [body]
-    body = [tvm.tir.Evaluate(x) if isinstance(x, tvm.tir.PrimExpr) else x for x in body]
-    if len(body) < 3:
-        body += [None] * (3 - len(body))
-    return _ffi_api.TensorIntrin(name, op, inputs, binds_list, scalar_params, *body)
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 8df32c810543..b3123a20d3e9 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -327,8 +327,7 @@ def _compute_body(*us):
 
         A = tvm.te.compute([r.extent.value for v, r in vranges.items()], _compute_body)
         args = [tvm.nd.empty(A.shape, A.dtype)]
-        sch = tvm.te.create_schedule(A.op)
-        mod = tvm.build(sch, [A])
+        mod = tvm.build(tvm.IRModule.from_expr(tvm.te.create_prim_func([A])))
         mod(*args)
         return args[0].numpy()
 
diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py
index 1109cc3d66d6..72c2a40fedd2 100644
--- a/python/tvm/tir/buffer.py
+++ b/python/tvm/tir/buffer.py
@@ -304,29 +304,6 @@ def decl_buffer(
     buffer : tvm.tir.Buffer
         The created buffer
 
-    Example
-    -------
-    Here's an example of how broadcast buffer can be used to define a symbolic broadcast operation,
-
-    .. code-block:: python
-
-        m0, m1, m2 = te.var("m0"), te.var("m1"), te.var("m2")
-        n0, n1, n2 = te.var("n0"), te.var("n1"), te.var("n2")
-        o0, o1, o2 = te.var("o0"), te.var("o1"), te.var("o2")
-        A = te.placeholder((m0, m1, m2), name='A')
-        B = te.placeholder((n0, n1, n2), name='B')
-        C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C')
-        Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-        Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-        s = te.create_schedule(C.op)
-        fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb})
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
     Note
     ----
     Buffer data structure reflects the DLTensor structure in dlpack.
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index 3588c04d8fa2..1de6941c9923 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -39,9 +39,7 @@
 from .sort import *
 from .scatter import *
 from .scatter_elements import *
-from .sparse_fill_empty_rows import *
 from .sparse_reshape import *
-from .argwhere import *
 from .scan import *
 from .einsum import *
 from .unique import *
@@ -49,9 +47,7 @@
 from .signal import *
 from . import nn
 from . import utils
-from . import vision
 from . import image
-from . import random
 from . import gpu
 
 # error reporting
diff --git a/python/tvm/topi/argwhere.py b/python/tvm/topi/argwhere.py
deleted file mode 100644
index c2b658a4e92f..000000000000
--- a/python/tvm/topi/argwhere.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks
-"""Argwhere operator"""
-import tvm
-from tvm.te import hybrid
-
-
-@hybrid.script
-def hybrid_argwhere_1d(output_shape, condition):
-    """Find the indices of elements of a 1-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        1-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    valid_index = 0
-    for i1 in range(a1):
-        if condition[i1] != 0:
-            a[valid_index, 0] = i1
-            valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_2d(output_shape, condition):
-    """Find the indices of elements of a 2-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        2-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            if condition[i1, i2] != 0:
-                a[valid_index, 0] = i1
-                a[valid_index, 1] = i2
-                valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_3d(output_shape, condition):
-    """Find the indices of elements of a 3-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        3-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                if condition[i1, i2, i3] != 0:
-                    a[valid_index, 0] = i1
-                    a[valid_index, 1] = i2
-                    a[valid_index, 2] = i3
-                    valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_4d(output_shape, condition):
-    """Find the indices of elements of a 4-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        4-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    a4 = condition.shape[3]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                for i4 in range(a4):
-                    if condition[i1, i2, i3, i4] != 0:
-                        a[valid_index, 0] = i1
-                        a[valid_index, 1] = i2
-                        a[valid_index, 2] = i3
-                        a[valid_index, 3] = i4
-                        valid_index += 1
-    return a
-
-
-@hybrid.script
-def hybrid_argwhere_5d(output_shape, condition):
-    """Find the indices of elements of a 5-D tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        5-D tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    a = output_tensor(output_shape, "int32")
-    a1 = condition.shape[0]
-    a2 = condition.shape[1]
-    a3 = condition.shape[2]
-    a4 = condition.shape[3]
-    a5 = condition.shape[4]
-    valid_index = 0
-    for i1 in range(a1):
-        for i2 in range(a2):
-            for i3 in range(a3):
-                for i4 in range(a4):
-                    for i5 in range(a5):
-                        if condition[i1, i2, i3, i4, i5] != 0:
-                            a[valid_index, 0] = i1
-                            a[valid_index, 1] = i2
-                            a[valid_index, 2] = i3
-                            a[valid_index, 3] = i4
-                            a[valid_index, 4] = i5
-                            valid_index += 1
-    return a
-
-
-@tvm.target.generic_func
-def argwhere(output_shape, condition):
-    """Find the indices of elements of a tensor that are non-zero.
-
-    Parameters
-    ----------
-    condition : tvm.te.Tensor
-        Tensor with boolean values.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        Indices of non-zero elements.
-    """
-    if len(condition.shape) == 1:
-        return hybrid_argwhere_1d(output_shape.shape, condition)
-    if len(condition.shape) == 2:
-        return hybrid_argwhere_2d(output_shape.shape, condition)
-    if len(condition.shape) == 3:
-        return hybrid_argwhere_3d(output_shape.shape, condition)
-    if len(condition.shape) == 4:
-        return hybrid_argwhere_4d(output_shape.shape, condition)
-    if len(condition.shape) == 5:
-        return hybrid_argwhere_5d(output_shape.shape, condition)
-    raise ValueError("Does not support rank higher than 5 in argwhere")
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 5ee625577e38..e145add5f01b 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -615,68 +615,6 @@ def conv2d_NCHWc_int8(
     )
 
 
-def conv2d_gemm_weight_transform(kernel, tile_N, tile_K, use_scalable_vectors=False, use_sme=False):
-    """Weight transformation for winograd
-
-    Parameters
-    ----------
-    kernel: Tensor
-        The raw kernel tensor with layout "NHWC".
-    tile_N: int
-        Tile size across N axis of the weight transformation for ConvGemm. (N = OC)
-    tile_K: int
-        Tile size across K axis of the weight transformation for ConvGemm. (K = KW * KH * IC)
-    use_scalable_vectors : bool
-        determines if operations on scalable vectors are expected
-    use_sme : bool
-        determines if SME operations on scalable vectors are expected
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        2-D with shape [CI*KH*KW,CO]
-    """
-    KH, KW, IC, OC = get_const_tuple(kernel.shape)
-    K = KH * KW * IC
-    N = OC
-
-    kernel_flat = te.compute(
-        (K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], "weight_flatten"
-    )
-
-    pad_N, pad_K = tvm.topi.arm_cpu.arm_utils.get_conv2d_weights_padding(N, K, tile_N, tile_K)
-
-    N_padded = N + pad_N
-    K_padded = K + pad_K
-
-    if pad_K != 0 or pad_N != 0:
-        kernel_flat = pad(
-            kernel_flat, pad_before=(0, 0), pad_after=(pad_K, pad_N), name="weight_padding"
-        )
-
-    if use_sme and kernel.dtype == "float16":
-        return te.compute(
-            (N_padded, K_padded), lambda x, y: kernel_flat[y, x], name="weight_transpose"
-        )
-
-    if use_scalable_vectors or use_sme:
-        return kernel_flat
-
-    if kernel.dtype in ["int8", "uint8"]:
-        B_inter_t = te.compute(
-            (N_padded // tile_N, K_padded // tile_K, tile_N, tile_K),
-            lambda x, y, z, w: kernel_flat[w + tile_K * y, z + tile_N * x],
-            name="weight_block_reshape",
-        )
-    else:
-        B_inter_t = te.compute(
-            (N_padded // tile_N, K_padded // tile_K, tile_K, tile_N),
-            lambda x, y, z, w: kernel_flat[z + tile_K * y, w + tile_N * x],
-            name="weight_block_reshape",
-        )
-    return B_inter_t
-
-
 def conv2d_winograd_weight_transform(kernel, tile_size):
     """Weight transformation for winograd
 
@@ -712,29 +650,6 @@ def conv2d_winograd_weight_transform(kernel, tile_size):
     )
 
 
-def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype):
-    """Weight transformation for winograd
-
-    Parameters
-    ----------
-    kernel: Tensor
-        The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now.
-    convolution_algorithm: int
-        The convolution algorithm for Winograd NNPACK.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [alpha, alpha, CO, CI]
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm.contrib import nnpack
-
-    return nnpack.convolution_inference_weight_transform(
-        kernel, algorithm=convolution_algorithm, dtype=out_dtype
-    )
-
-
 def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
     """Group convolution operator in NCHW layout.
 
diff --git a/python/tvm/topi/random/__init__.py b/python/tvm/topi/random/__init__.py
deleted file mode 100644
index ee8d1d6385b7..000000000000
--- a/python/tvm/topi/random/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Pseudorandom generator kernels and operators."""
-from __future__ import absolute_import
-
-from .kernel import *
diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py
deleted file mode 100644
index 464ea9634ab5..000000000000
--- a/python/tvm/topi/random/kernel.py
+++ /dev/null
@@ -1,657 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Pseudorandom number kernels."""
-import math
-import numpy as np
-
-import tvm
-import tvm.topi
-
-from ... import tir
-from ...tir import ir_builder
-
-
-# Threefry PRNG with splitting based on
-# - J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1,
-#   2, 3," SC '11: Proceedings of 2011 International Conference for High Performance Computing,
-#   Networking, Storage and Analysis, Seattle, WA, 2011, pp. 1-12, doi: 10.1145/2063384.2063405.
-# - Claessen, K. ; Palka, M. (2013) "Splittable Pseudorandom Number Generators using Cryptographic
-#   Hashing". Proceedings of Haskell Symposium 2013 pp. 47-58.  MLA
-# - Ferguson, Niels, et al. "The Skein hash function family." Submission to NIST (round 3) 7.7.5
-#   (2010): 3.
-
-
-# Threefry is a counter based PRNG: given a unique input, it generates a unique random number. As
-# there is no state to maintain, we can apply it to a sequence of numbers (0..N) to generate a
-# sequence of random numbers in parallel. In order to make the PRNG splittable (that is we can
-# generate a sequence of random numbers in one place, and another sequence in another), we add a
-# path and key in addition to the counter. The path allows us to encode a sequence of splits (a 0 in
-# the path indicates the left result of a split, a 1 indicates the right). To avoid continuously
-# growing the path, we can compress an existing path into the key portion of the generator by
-# hashing the current key, path, and counter to create the new key (this same technique is used if
-# we run out of room for the counter). They key is initialized with a unique initial state.
-#
-# Random numbers are generated by applying the Threefry hash to the current key, path, and counter.
-
-# This module use encoding e4 from the appendix of "Splittable Pseudorandom Number Generators using
-# Cryptographic Hashing" (confusingly, the definition in the paper uses e3 to define the encoding
-# function). This encoding uses a 10 element uint64 tensor where each byte means the following:
-
-# .. code-block:
-
-#     gen:
-#     words: 0 1 2 3 | 4 5  | 6 7     | 8 9
-#     usage: key     | path | counter | position of next step in path encoded in binary
-#                                       ex: 0b00010 -> next path entry goes one from the right
-
-# Right now, counter only uses the rightmost word.
-
-# Threefry rotation constants from the Skein paper ("The Skein Hash Function Family"
-# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
-_ROTATIONS = {
-    4: [[14, 16], [52, 57], [23, 40], [5, 37], [25, 33], [46, 12], [58, 22], [32, 32]],
-    8: [
-        [46, 36, 19, 37],
-        [33, 27, 14, 42],
-        [17, 49, 36, 39],
-        [44, 9, 54, 56],
-        [39, 30, 34, 24],
-        [13, 50, 10, 17],
-        [25, 29, 39, 43],
-        [8, 35, 56, 22],
-    ],
-    16: [
-        [24, 13, 8, 47, 8, 17, 22, 37],
-        [38, 19, 10, 55, 49, 18, 23, 52],
-        [33, 4, 51, 13, 34, 41, 59, 17],
-        [5, 20, 48, 41, 47, 28, 16, 25],
-        [41, 9, 37, 31, 12, 47, 44, 30],
-        [16, 34, 56, 51, 4, 53, 42, 41],
-        [31, 44, 47, 46, 19, 42, 44, 25],
-        [9, 48, 35, 52, 23, 31, 37, 20],
-    ],
-}
-
-# Threefry permutation constants from the Skein paper ("The Skein Hash Function Family"
-# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf)
-_PERMUTATIONS = {
-    4: [0, 3, 2, 1],
-    8: [2, 1, 4, 7, 6, 5, 0, 3],
-    16: [0, 9, 2, 13, 6, 11, 4, 15, 10, 7, 12, 3, 14, 5, 8, 1],
-}
-
-
-def _threefry(
-    irb, key_buf, key_offset, counter_buf, counter_offset, out_buf, out_offset, out_shape
-):
-    """IRBuilder code for running Threefry
-
-    Parameters
-    ----------
-    irb: IRBuilder
-        IRBuilder that this code will be generated for.
-
-    key_buf: BufferVar
-        Buffer to read the key from.
-
-    key_offset: number
-        Threefry will write to :code:`key_buf[key_offset:key_offset+4]`
-
-    counter_buf: BufferVar
-        Buffer to read the counter from.
-
-    counter_offset: number
-        Threefry will write to :code:`counter_buf[counter_offset:counter_offset+4]`
-
-    out_buf: BufferVar
-        Buffer to read the counter from.
-
-    out_offset: number
-        Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]`
-
-    out_shape: number
-        Determines the number of output states to generate. :code:`state[i]` will correspond to
-        counter+i.
-    """
-    nrounds = 20
-    nwords = 4
-    iwidth = 64
-    assert nrounds % 4 == 0
-    assert nwords in [4, 8, 16]
-
-    # The paper has constants for 32 bit threefry, but we keep the implementation simple by only
-    # using 64-bit words.
-    assert key_buf.dtype == "uint64", "threefry only supports 64-bit keys"
-    assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype"
-
-    def mix(a, b, rotation):
-        x = a + b  # wrapping
-        y = x ^ ((b << rotation) | (b >> (iwidth - rotation)))
-        return [x, y]
-
-    # temporary buffer for holding the results of _PERMUTATIONS
-    tmp = irb.allocate(out_buf.dtype, out_shape * nwords, name="tmp", scope="global")
-    tmp_offset = 0
-
-    # Initialize entire key. It is composed of the original key with one
-    # element appended. The appended element is the xor of all key words plus a
-    # constant.
-    full_key = irb.allocate("uint64", nwords + 1, name="full_key", scope="global")
-    for i in range(nwords):
-        full_key[i] = key_buf[key_offset + i]
-    # initial key constant, full_key[nwords] is equivalent to k_{N_W} in the Skein paper.
-    full_key[nwords] = tvm.tir.const(0x1BD11BDAA9FC1A22, dtype="uint64")
-    for i in range(nwords):
-        full_key[nwords] ^= key_buf[key_offset + i]
-
-    with irb.for_range(0, out_shape, dtype="uint64", name="i") as i:
-        for j in range(nwords):
-            out_buf[out_offset + i * nwords + j] = counter_buf[counter_offset + j] + i
-
-    def key_schedule(s, i):
-        # Threefry uses no tweak, so the key schedule is simple
-        if i == nwords - 1:
-            return full_key[(s + i) % (nwords + 1)] + tvm.tir.const(s, dtype="uint64")
-        return full_key[(s + i) % (nwords + 1)]
-
-    with irb.for_range(0, out_shape, name="l") as l:  # pylint: disable=invalid-name
-        for i in range(nrounds // 4):
-            for j in range(nwords):
-                out_buf[out_offset + l * nwords + j] += key_schedule(i, j)  # wrapping
-            for k in range(4):
-                for j in range(nwords // 2):
-                    (
-                        out_buf[out_offset + l * nwords + j * 2 + 0],
-                        out_buf[out_offset + l * nwords + j * 2 + 1],
-                    ) = mix(
-                        out_buf[out_offset + l * nwords + j * 2 + 0],
-                        out_buf[out_offset + l * nwords + j * 2 + 1],
-                        _ROTATIONS[nwords][(i * 4 + k) % 8][j],
-                    )
-                for j in range(nwords):
-                    tmp[tmp_offset + l * nwords + j] = out_buf[
-                        out_offset + l * nwords + _PERMUTATIONS[nwords][j]
-                    ]
-                # number of rounds is even, so out always contains the result
-                (out_buf, tmp) = (tmp, out_buf)
-                (out_offset, tmp_offset) = (tmp_offset, out_offset)
-
-
-def threefry_generate(gen, out_shape):
-    """Generate a series of random values
-
-    Notes
-    -----
-    This function uses the counter portion of the generator state to generate a series of random
-    numbers in parallel. Random number `i` is generated by applying Threefry to the current
-    generator state with the counter portion incremented by `i`. This means that each random number
-    is generated independently from each other random number, so we can compute them in parallel.
-
-    If there is not enough room left in the counter to generate the desired shape of random values,
-    then a new generator is created by applying Threefry to the current key, path, and counter.
-    This new generator will have a reset counter.
-
-    Warning
-    -------
-    Threeyfry requires that unsigned integer arithmetic wraps on overflow. Currently TVM has no
-    guarantee of this, so threefry contains an internal assert to check wrapping behavior. This
-    assert may or may not run depending on your platform, so it is recommended you run
-    :py:func:`threefry_test_wrapping` to verify wrapping behavior.
-
-    Parameters
-    ----------
-    gen : Tensor[10, uint64]
-        Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should
-        not be reused in another function, otherwise random numbers will be repeated.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    Returns
-    -------
-    new_gen : Tensor[10, uint64]
-        The new generator state to be used in subsequent calls.
-
-    rand : Tensor[out_shape, uint64]
-        Tensor of random numbers with shape `out_shape`.
-    """
-    out_len = tir.const(1)
-    for s in out_shape:
-        out_len *= s
-    assert (
-        out_len.value <= 2**64 - 1
-    ), f"Can only generate up to 2^64 random numbers, but {out_len} were requested."
-
-    def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr):
-        irb = ir_builder.create()
-        gen = irb.buffer_ptr(gen_ptr)
-        out_gen = irb.buffer_ptr(out_gen_ptr)
-        out_array = irb.buffer_ptr(out_array_ptr)
-
-        # Check that unsigned arithmetic wraps, as it is required to implement threefry correctly.
-        irb.emit(
-            tvm.tir.AssertStmt(
-                tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
-                == tvm.tir.const(0, "uint64"),
-                tvm.tir.StringImm(
-                    "Unsigned integer arithmetic is not wrapping, but threefry requires wrapping."
-                ),
-                tvm.tir.Evaluate(0),
-            )
-        )
-
-        # Create a temporary array to hold the generator state we will use to create the random
-        # numbers. We cannot use gen because we may need to update the key + path if there is not
-        # enough room in the counter.
-        tmp = irb.allocate(gen.dtype, 10, name="tmp", scope="global")
-
-        # TODO(tkonolige): for now we only use the last word of the counter for counting. It is too
-        # much work to figure out how to do 128 bit addition.
-
-        # Max value for counter should be 2**64-2 because we need to reserve a special value to
-        # indicate the counter is used up.
-        with irb.if_scope(gen[7] < tir.const(2**64 - 1, dtype=gen.dtype) - out_len):
-            for i in range(10):
-                tmp[i] = gen[i]
-        with irb.else_scope():
-            # no room left in the counter, we have to change the path or key
-            with irb.if_scope(gen[8] == 0 and gen[9] == 0):
-                # out of room in the path, have to generate new key
-
-                # The paper says the counter that we will be hashing should be a special value of
-                # all ones. We need to allocate some space for it because we cannot overwrite gen.
-                tmp_counter = irb.allocate(gen.dtype, 2, name="tmp_counter", scope="global")
-                tmp_counter[0] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
-                tmp_counter[1] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype)
-                _threefry(irb, gen, 0, tmp_counter, 0, tmp, 0, 1)
-                tmp[4] = tir.const(0, dtype=gen.dtype)  # zero path, i.e. no path
-                tmp[5] = tir.const(0, dtype=gen.dtype)
-                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
-                tmp[7] = tir.const(0, dtype=gen.dtype)
-                tmp[8] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
-                tmp[9] = tir.const(0, dtype=gen.dtype)
-            with irb.else_scope():
-                tmp[0] = gen[0]
-                tmp[1] = gen[1]
-                tmp[2] = gen[2]
-                tmp[3] = gen[3]
-                tmp[4] = gen[4] | gen[8]  # add a 1 to the path
-                tmp[5] = gen[5] | gen[9]
-                tmp[6] = tir.const(0, dtype=gen.dtype)  # zero counter
-                tmp[7] = tir.const(0, dtype=gen.dtype)
-                _shift_right(irb, gen[8], gen[9], tmp, 8, tmp, 9)
-
-        # Compute random values
-        if out_len.value >= 4:
-            _threefry(irb, tmp, 0, tmp, 4, out_array, 0, out_len // 4)
-        if out_len.value % 4 != 0:
-            remaining = irb.allocate(gen.dtype, 4, name="remaining", scope="global")
-            tmp[7] = tmp[7] + tir.Cast(gen.dtype, out_len // 4 * 4)  # increment counter
-            _threefry(irb, tmp, 0, tmp, 4, remaining, 0, 1)
-            with irb.for_range(0, out_len % 4, dtype="uint64", name="i") as i:
-                out_array[out_len // 4 * 4 + i] = remaining[i]
-
-        # Update generator state
-        out_gen[0] = tmp[0]  # key stays the same
-        out_gen[1] = tmp[1]
-        out_gen[2] = tmp[2]
-        out_gen[3] = tmp[3]
-        out_gen[4] = tmp[4]  # path stays the same
-        out_gen[5] = tmp[5]
-        out_gen[6] = tir.const(0, dtype=gen.dtype)  # unused, leave it as 0
-        if out_len.value % 4 != 0:
-            # increment counter for the remaining
-            # as we will generate 4 random numbers for the remaining, increase 4 here.
-            # the main increment was done before the second _threefry.
-            out_gen[7] = tmp[7] + tir.Cast(gen.dtype, 4)
-        else:
-            out_gen[7] = tmp[7] + tir.Cast(gen.dtype, out_len)  # increment counter
-        out_gen[8] = tmp[8]  # path unchanged, so no update here
-        out_gen[9] = tmp[9]
-
-        return irb.get()
-
-    out_gen = tvm.tir.decl_buffer((10,), name="out_gen", dtype="uint64")
-    out_array = tvm.tir.decl_buffer(out_shape, name="out_array", dtype="uint64")
-    return tvm.te.extern(
-        [out_gen.shape, out_array.shape],
-        [gen],
-        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
-        out_buffers=[out_gen, out_array],
-        name="threefry_generate",
-        tag="threefry_generate",
-    )
-
-
-def _shift_right(irb, a, b, out_a, a_off, out_b, b_off):
-    """Binary shift a 128bit number composed of two 64 bit words right by one."""
-    with irb.if_scope(a == 1):
-        out_a[a_off] = tir.const(0, dtype=a.dtype)
-        out_b[b_off] = tir.const(0x8000000000000000, dtype=a.dtype)
-    with irb.else_scope():
-        with irb.if_scope(a == 0):
-            out_a[a_off] = tir.const(0, dtype=a.dtype)
-            out_b[b_off] = b >> 1
-        with irb.else_scope():
-            out_a[a_off] = a >> 1
-            out_b[b_off] = tir.const(0, dtype=a.dtype)
-
-
-def threefry_split(gen):
-    """Split a single generator state into two new ones
-
-    Notes
-    -----
-    The new generator is created by appending a one (for the right output) or a zero (for the left
-    output) to the end of the path portion of the generator If there is no longer and room in the
-    path, then we create a new key portion of the generator by applying Threefry to the old state,
-    path, and counter. i.e. :code:`new_key = threefry(old_key, [old_path, old_counter])`. This
-    resets the path portion of the new generator.
-
-    Parameters
-    ----------
-    gen : Tensor[10, uint64]
-        Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should
-        not be reused in another function, otherwise random numbers will be repeated.
-
-    Returns
-    -------
-    out_gen_left : Tensor[10, uint64]
-        New generator state that is distinct from `out_gen_right`.
-
-    out_gen_right : Tensor[10, uint64]
-        New generator state that is distinct from `out_gen_left`.
-    """
-
-    def gen_ir(gen_ptr, out_left_ptr, out_right_ptr):
-        irb = ir_builder.create()
-        gen = irb.buffer_ptr(gen_ptr)
-        out_left = irb.buffer_ptr(out_left_ptr)
-        out_right = irb.buffer_ptr(out_right_ptr)
-
-        with irb.if_scope(gen[8] == 0 and gen[9] == 0):
-            # Generate new key because we have run out of room to extend the path
-            _threefry(irb, gen, 0, gen, 4, out_left, 0, 1)
-            out_left[4] = tir.const(0, dtype=gen.dtype)
-            out_left[5] = tir.const(0, dtype=gen.dtype)
-            out_left[6] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
-            out_left[7] = tir.const(0, dtype=gen.dtype)  # counter gets zeroed
-            out_left[8] = tir.const(
-                1 << 62, dtype=gen.dtype
-            )  # one in the second from the leftmost position
-            out_left[9] = tir.const(0, dtype=gen.dtype)
-
-            out_right[0] = out_left[0]
-            out_right[1] = out_left[1]
-            out_right[2] = out_left[2]
-            out_right[3] = out_left[3]
-            out_right[4] = tir.const(1 << 63, dtype=gen.dtype)  # one in the leftmost position
-            out_right[5] = tir.const(0, dtype=gen.dtype)
-            out_right[6] = tir.const(0, dtype=gen.dtype)
-            out_right[7] = tir.const(0, dtype=gen.dtype)
-            out_right[8] = tir.const(
-                1 << 62, dtype=gen.dtype
-            )  # one in the second from the leftmost position
-            out_right[9] = tir.const(0, dtype=gen.dtype)
-        with irb.else_scope():
-            out_left[0] = gen[0]
-            out_left[1] = gen[1]
-            out_left[2] = gen[2]
-            out_left[3] = gen[3]
-            out_left[4] = gen[4]  # adding a zero here, but its already zero padded
-            out_left[5] = gen[5]
-            out_left[6] = gen[6]
-            out_left[7] = gen[7]
-            # move path position over one bit
-            _shift_right(irb, gen[8], gen[9], out_left, 8, out_left, 9)
-
-            out_right[0] = gen[0]
-            out_right[1] = gen[1]
-            out_right[2] = gen[2]
-            out_right[3] = gen[3]
-            out_right[4] = gen[4] | gen[8]  # add a one to the path
-            out_right[5] = gen[5] | gen[9]
-            out_right[6] = gen[6]
-            out_right[7] = gen[7]
-            _shift_right(irb, gen[8], gen[9], out_right, 8, out_right, 9)
-
-        return irb.get()
-
-    out_left = tvm.tir.decl_buffer((10,), name="out_left", dtype="uint64")
-    out_right = tvm.tir.decl_buffer((10,), name="out_right", dtype="uint64")
-    return tvm.te.extern(
-        [out_left.shape, out_right.shape],
-        [gen],
-        lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]),
-        out_buffers=[out_left, out_right],
-        name="threefry_split",
-        tag="threefry_split",
-    )
-
-
-def threefry_test_wrapping(target, device):
-    """Test that unsigned arithmetic wraps on overflow.
-
-    Parameters
-    ----------
-    target : tvm.target.Target
-        Target to run against
-    device : tvm.runtime.Device
-        Context to run the test on
-
-    Returns
-    -------
-    is_wrapping : bool
-        Whether or not unsigned integer arithmetic is wrapping for this target, context pair. True
-        indicates that threefry will work on this platform.
-    """
-    if isinstance(target, str):
-        target = tvm.target.Target(target)
-
-    def gen_ir(out_ptr):
-        irb = ir_builder.create()
-        out = irb.buffer_ptr(out_ptr)
-        if "gpu" in target.keys:
-            thread_x = tvm.te.thread_axis("threadIdx.x")
-            irb.scope_attr(thread_x, "thread_extent", 1)
-        out[0] = tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64")
-        return irb.get()
-
-    out = tvm.tir.decl_buffer((1,), dtype="uint64")
-    f = tvm.te.extern(
-        [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out]
-    )
-    s = tvm.te.create_schedule([f.op])
-    out_ary = tvm.nd.array(np.ones((1,), "uint64"), device)
-    tvm.build(s, [f], target=target)(out_ary)
-    return out_ary.numpy()[0] == 0
-
-
-def uniform(gen, low, high, out_shape, out_dtype):
-    """Draw samples from a uniform distribution.
-
-    Samples are uniformly distributed over the half-open interval [low, high)
-    (includes low, but excludes high). In other words, any value within the
-    given interval is equally likely to be drawn by uniform.
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    low : Tensor[(), out_dtype]
-        Lower boundary of the output interval. All values generated will be
-        greater than or equal to low.
-
-    high : Tensor[(), out_dtype]
-        Upper boundary of the output interval. All values generated will be
-        less than high.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    out_dtype : str
-        The output dtype.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[out_shape, out_dtype]
-        Tensor of random numbers with shape `out_shape` and type `out_dtype`.
-    """
-    new_gen, random_bits = threefry_generate(gen, out_shape)
-    assert out_dtype in (
-        "float32",
-        "float64",
-    ), f"Only support float32 or float64 for now, got {out_dtype}"
-    if out_dtype == "float32":
-        random_dtype = "uint32"
-        nbits = 32
-        nfraction = 23
-    elif out_dtype == "float64":
-        random_dtype = "uint64"
-        nbits = 64
-        nfraction = 52
-    nexp = nbits - nfraction - 1
-    random_bits = random_bits.astype(random_dtype)
-
-    fraction = tvm.topi.right_shift(
-        random_bits, tvm.tir.const(nbits - nfraction, dtype=random_dtype)
-    )
-    exponent = tvm.topi.left_shift(
-        tvm.topi.full(out_shape, random_dtype, (1 << (nexp - 1)) - 1),
-        tvm.tir.const(nfraction, dtype=random_dtype),
-    )
-    mantissa = tvm.topi.bitwise_or(fraction, exponent).astype(random_dtype)
-    standard_uniform_values = tvm.topi.reinterpret(mantissa, out_dtype) - tvm.tir.const(
-        1, dtype=out_dtype
-    )
-    uniform_values = tvm.topi.add(tvm.topi.multiply(standard_uniform_values, high - low), low)
-
-    return new_gen, uniform_values
-
-
-def normal(gen, mean, scale, out_shape, out_dtype):
-    """Draw samples from a normal distribution.
-    The algorithm is based on Box-Muller transform
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    mean : Tensor[(), out_dtype]
-        The mean of the normal distribution.
-
-    scale : Tensor[(), out_dtype]
-        The standard deviation of the normal distribution.
-
-    out_shape : Sequence[int]
-        Output shape of the random numbers.
-
-    out_dtype : str
-        The output dtype.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[out_shape, out_dtype]
-        Tensor of random numbers with shape `out_shape` and type `out_dtype`.
-    """
-    out_shape = list(out_shape)
-    # Box-Muller transform need two pieces of original uniform data
-    out_shape.insert(0, 2)
-    new_gen, uniform_values = uniform(
-        gen, tvm.tir.const(0.0, out_dtype), tvm.tir.const(1.0, out_dtype), out_shape, out_dtype
-    )
-    two_pi = tvm.tir.const(2.0 * math.pi, out_dtype)
-    uniform_values_1 = tvm.topi.strided_slice(uniform_values, [0], [1], strides=[1], axes=[0])
-    uniform_values_1 = tvm.topi.squeeze(uniform_values_1, axis=0)
-    uniform_values_2 = tvm.topi.strided_slice(uniform_values, [1], [2], strides=[1], axes=[0])
-    uniform_values_2 = tvm.topi.squeeze(uniform_values_2, axis=0)
-    uniform_values_1 = tvm.topi.subtract(tvm.tir.const(1.0, out_dtype), uniform_values_1)
-    sqrt_values = tvm.topi.sqrt(
-        tvm.topi.multiply(tvm.tir.const(-2.0, out_dtype), tvm.topi.log(uniform_values_1))
-    )
-    sin_values = tvm.topi.sin(tvm.topi.multiply(two_pi, uniform_values_2))
-    random_values = tvm.topi.add(
-        tvm.topi.multiply(tvm.topi.multiply(sqrt_values, sin_values), scale), mean
-    )
-
-    return new_gen, random_values
-
-
-def multinomial(gen, probs, num_samples):
-    """Draw samples from a multinomial distribution defined by the input tensor.
-
-    Parameters
-    ----------
-    gen : ThreefryKey
-        Generator state. Can be created with :py:func:`tvm.relay.threefry_key`. This should not be
-        reused in another function, otherwise random numbers will be repeated.
-
-    probs: Tensor[(input_rows, indices), float]
-        A tensor containing the probabilities to sample from. Each value represents the
-        probability of choosing its corresponding index. If a tensor is provided, the last dimension
-        is treated independently. Negative values in this tensor will be clipped to zero to
-        represent they have no chance of being selected.
-
-    num_samples: int
-        Number of samples to draw from each row.
-
-    Returns
-    -------
-    new_gen : ThreefryKey
-        New generator state that is distinct from `gen`.
-
-    out : Tensor[(input_rows, num_samples), int64]
-        Tensor of sampled indices with shape `input_rows x num_samples` and type `out_dtype`.
-    """
-    # Convert to float for consistent behavior.
-    probs = tvm.topi.cast(probs, "float32")
-    # Clip negative values to 0.
-    probs = tvm.topi.maximum(probs, 0)
-    # Normalize input probabilities.
-    probs = tvm.topi.divide(probs, tvm.topi.expand_dims(tvm.topi.sum(probs, axis=-1), -1))
-    # Convert probability to cumulative sum.
-    cumulative_probs = tvm.topi.cumsum(probs, axis=-1)
-    # Sample a set of uniform values.
-    new_gen, uniform_values = uniform(
-        gen,
-        tvm.tir.const(0.0, "float32"),
-        tvm.tir.const(1.0, "float32"),
-        [*probs.shape[:-1], num_samples],
-        "float32",
-    )
-    # Find index corresponding to sampled values.
-    closest_prob = tvm.topi.subtract(
-        tvm.topi.expand_dims(cumulative_probs, axis=-1),
-        tvm.topi.expand_dims(uniform_values, axis=-2),
-    )
-    zeros = tvm.topi.full_like(closest_prob, 0)
-    ones = tvm.topi.full_like(closest_prob, 1)
-    # Find the smallest positive index for each sample.
-    cond = tvm.topi.greater(closest_prob, zeros)
-    closest_non_neg = tvm.topi.where(cond, closest_prob, ones)
-    sampled_indices = tvm.topi.argmin(closest_non_neg, axis=-2)
-    return new_gen, sampled_indices
diff --git a/python/tvm/topi/sparse_fill_empty_rows.py b/python/tvm/topi/sparse_fill_empty_rows.py
deleted file mode 100644
index 10dc6ee3bfa3..000000000000
--- a/python/tvm/topi/sparse_fill_empty_rows.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHnew_sparse_indices WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=no-else-return, too-many-locals, too-many-arguments, too-many-branches
-# pylint: disable=undefined-variable, invalid-name
-"""SparseFillEmptyRows operator"""
-from ..te import hybrid
-
-
-@hybrid.script
-def _sparse_fill_empty_rows(
-    sparse_indices,
-    sparse_values,
-    dense_shape,
-    default_value,
-    new_sparse_indices_shape,
-    new_sparse_values_shape,
-    empty_row_indicator_shape,
-):
-    default_value_ = int64(default_value[0])
-    new_sparse_indices = output_tensor(new_sparse_indices_shape, "int64")
-    new_sparse_values = output_tensor(new_sparse_values_shape, "int64")
-    empty_row_indicator = output_tensor(empty_row_indicator_shape, "int64")
-    new_sparse_indices_row_id = 0
-
-    if int64(sparse_indices.shape[0]) == int64(0):  # Handle Empty Case
-        #  Fill all rows with default values
-        for i in range(0, new_sparse_indices_shape[0]):
-            new_sparse_indices[i, 0] = int64(i)
-            new_sparse_values[i] = default_value_
-            empty_row_indicator[i] = int64(1)
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[i, k] = int64(0)
-
-        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
-
-    else:
-        # Iterate through sparse_indices and add rows if/when required
-        for i in range(0, int64(sparse_indices.shape[0])):
-            if i == 0:
-                prev_row_id = int64(0)
-            else:
-                prev_row_id = int64(sparse_indices[i - 1, 0] + 1)
-            row_id = int64(sparse_indices[i, 0])
-
-            # Since input is in row-major order, add rows between prev_row_id and row_id
-            for j in range(prev_row_id, row_id):
-                new_sparse_indices[new_sparse_indices_row_id, 0] = int64(j)
-                for k in range(1, int64(new_sparse_indices_shape[1])):
-                    new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
-                empty_row_indicator[prev_row_id] = int64(1)
-                new_sparse_values[new_sparse_indices_row_id] = default_value_
-                new_sparse_indices_row_id += 1
-
-            # Add current element to output
-            new_sparse_indices[new_sparse_indices_row_id, 0] = row_id
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[new_sparse_indices_row_id, k] = int64(sparse_indices[i, k])
-            new_sparse_values[new_sparse_indices_row_id] = int64(sparse_values[i])
-            empty_row_indicator[row_id] = int64(0)
-            new_sparse_indices_row_id += 1
-
-        # Add rows with default value if last row id of sparse_indices is not dense_shape[0] - 1
-        for i in range(
-            int64(sparse_indices[sparse_indices.shape[0] - 1, 0] + 1), int64(dense_shape[0])
-        ):
-
-            new_sparse_indices[new_sparse_indices_row_id, 0] = int64(i)
-            for k in range(1, int64(new_sparse_indices_shape[1])):
-                new_sparse_indices[new_sparse_indices_row_id, k] = int64(0)
-            empty_row_indicator[i] = int64(1)
-            new_sparse_values[new_sparse_indices_row_id] = default_value_
-            new_sparse_indices_row_id += 1
-
-        return (new_sparse_indices, new_sparse_values, empty_row_indicator)
-
-
-def sparse_fill_empty_rows(
-    sparse_indices,
-    sparse_values,
-    dense_shape,
-    default_value,
-    new_sparse_indices_shape,
-    new_sparse_values_shape,
-    empty_row_indicator_shape,
-):
-    return _sparse_fill_empty_rows(
-        sparse_indices,
-        sparse_values,
-        dense_shape,
-        default_value,
-        new_sparse_indices_shape,
-        new_sparse_values_shape,
-        empty_row_indicator_shape,
-    )
diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py
index 2844825a4a73..c1f5bce94870 100644
--- a/python/tvm/topi/transform.py
+++ b/python/tvm/topi/transform.py
@@ -20,7 +20,6 @@
 
 import tvm
 from tvm import te, topi
-from tvm.te import hybrid
 
 from . import cpp, tag
 from .utils import const_vector, make_idx, within_index
@@ -982,35 +981,6 @@ def adv_index(data, indices):
     return cpp.adv_index(data, indices)
 
 
-@hybrid.script
-def invert_permutation(data):
-    """Computes the inverse permutation of data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input data
-
-    Returns
-    -------
-    result : tvm.te.Tensor
-        Output tensor
-
-    Examples
-    --------
-    .. code-block:: python
-
-        data = [3, 4, 0, 2, 1]
-        topi.invert_permutation(data) = [2, 4, 3, 0, 1]
-    """
-    result = output_tensor(data.shape, data.dtype)
-    nums = data.shape[0]
-    for ind in range(nums):
-        r_ind = data[ind]
-        result[r_ind] = ind
-    return result
-
-
 def sliding_window(data, axis, window_shape, strides):
     """Slide a window over the data tensor.
 
diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py
index 983c48615334..9c9732013413 100644
--- a/python/tvm/topi/unique.py
+++ b/python/tvm/topi/unique.py
@@ -17,9 +17,6 @@
 # pylint: disable=invalid-name
 """Unique operator"""
 from tvm import te, tir
-from ..te import hybrid
-from .scan import cumsum
-from .sort import sort, argsort
 
 
 def _calc_adjacent_diff_ir(data, output, binop=tir.Sub):
@@ -82,234 +79,3 @@ def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
         name="_calc_adjacent_diff",
         tag="_calc_adjacent_diff_cpu",
     )
-
-
-@hybrid.script
-def _calc_num_unique(inc_scan):
-    """Helper function to get the number of unique elements fron inc_scan tensor"""
-    output = output_tensor((1,), "int32")
-    output[0] = inc_scan[inc_scan.shape[0] - 1] + int32(1)
-    return output
-
-
-def _calc_unique_ir(
-    data, argsorted_indices, inc_scan, index_converter, unique_elements, inverse_indices, counts
-):
-    """Low level IR to calculate unique elements, inverse indices, and counts (optional) of
-    unique elements of 1-D array.
-
-    Parameters
-    ----------
-    data : Buffer
-        Input 1-D Buffer.
-
-    argsorted_indices : Buffer
-        A buffer that stores the argsorted indices of the input data.
-
-    inc_scan : Buffer
-        A buffer that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    index_converter (optional) : Buffer
-        An optional index converter that transforms the unique element index
-        such that new_idx = index_converter[old_idx].
-
-    unique_elements : Buffer
-        A buffer that stores the unique elements.
-
-    inverse_indices : Buffer
-        A buffer that stores the index of each input data element in the unique element array.
-
-    counts (optional) : Buffer
-        A buffer that stores the count of each unique element.
-    """
-    ib = tir.ir_builder.create()
-    data_ptr = ib.buffer_ptr(data)
-    argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices)
-    inc_scan_ptr = ib.buffer_ptr(inc_scan)
-    unique_elements_ptr = ib.buffer_ptr(unique_elements)
-    inverse_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    index_converter_ptr = None
-    if isinstance(index_converter, tir.Buffer):
-        index_converter_ptr = ib.buffer_ptr(index_converter)
-
-    if isinstance(counts, tir.Buffer):
-        counts_ptr = ib.buffer_ptr(counts)
-        # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1]
-        unique_seq_indices_ptr = ib.buffer_ptr(inverse_indices)
-
-    data_length = data.shape[0]
-
-    # if need to return counts
-    if isinstance(counts, tir.Buffer):
-        num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1
-        num_elements = data.shape[0]
-        unique_seq_indices_ptr[num_unique - 1] = num_elements
-        with ib.new_scope():
-            with ib.for_range(0, data_length, kind="parallel") as i:
-                with ib.if_scope(i > 0):
-                    with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
-                        unique_seq_indices_ptr[inc_scan_ptr[i] - 1] = i
-        with ib.new_scope():
-            with ib.for_range(0, num_unique, kind="parallel") as i:
-                unique_idx = i if not index_converter_ptr else index_converter_ptr[i]
-                with ib.if_scope(i == 0):
-                    counts_ptr[unique_idx] = unique_seq_indices_ptr[i]
-                with ib.else_scope():
-                    counts_ptr[unique_idx] = (
-                        unique_seq_indices_ptr[i] - unique_seq_indices_ptr[i - 1]
-                    )
-    # calculate unique elements and inverse indices
-    with ib.new_scope():
-        with ib.for_range(0, data_length, kind="parallel") as i:
-            data_idx = argsorted_indices_ptr[i]
-            unique_idx = (
-                inc_scan_ptr[i] if not index_converter_ptr else index_converter_ptr[inc_scan_ptr[i]]
-            )
-            inverse_indices_ptr[data_idx] = unique_idx
-            with ib.if_scope(i == 0):
-                unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-            with ib.else_scope():
-                with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]):
-                    unique_elements_ptr[unique_idx] = data_ptr[data_idx]
-    return ib.get()
-
-
-@hybrid.script
-def _calc_first_occurence(argsorted_indices, inc_scan):
-    """Hybrid script to calculate the first occurence of each unique element in the input data.
-
-    Parameters
-    ----------
-    argsorted_indices : tvm.te.Tensor
-        A tensor that stores the argsorted indices of the input data.
-
-    inc_scan : tvm.te.Tensor
-        A tensor that stores the inclusive scan of the binary tir.NE adjacent difference
-        of the sorted data.
-
-    first_occurence : tvm.te.Tensor
-        A tensor that stores the first occurence of each unique element in the input data.
-    """
-    first_occurence = output_tensor(argsorted_indices.shape, "int32")
-    for i in parallel(argsorted_indices.shape[0]):
-        first_occurence[i] = argsorted_indices.shape[0]
-    for i in parallel(argsorted_indices.shape[0]):
-        if i == 0 or inc_scan[i] != inc_scan[i - 1]:
-            first_occurence[inc_scan[i]] = argsorted_indices[i]
-    return first_occurence
-
-
-def unique(data, is_sorted=True, return_counts=False):
-    """
-    Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
-    have the same length of `data` and element with index >= num_unique[0] has undefined value.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        A 1-D tensor of integers.
-
-    sorted : bool
-        Whether to sort the unique elements in ascending order before returning as output.
-
-    return_counts : bool
-        Whether to return the count of each unique element.
-
-    Returns
-    -------
-    unique : tvm.te.Tensor
-        A 1-D tensor containing the unique elements of the input data tensor. The same size as
-        the input data. If there are less unique elements than input data, the end of the tensor
-        is padded with zeros.
-
-    indices : tvm.te.Tensor
-        A 1-D tensor. The same size as output. For each entry in output, it contains
-        the index of its first occurence in the input data. The end of the tensor is padded
-        with the length of the input data.
-
-    inverse_indices : tvm.te.Tensor
-        A 1-D tensor. For each entry in data, it contains the index of that data element in
-        the unique array. (Note that inverse_indices is very similar to indices if output is not
-        sorted.)
-
-    num_unique : tvm.te.Tensor
-        A 1-D tensor with size=1 containing the number of unique elements in the input data tensor.
-
-    counts (optional) : tvm.te.Tensor
-        A 1-D tensor containing the count of each unique element in the output.
-
-    Examples
-    --------
-    .. code-block:: python
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-
-        [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True)
-        output          =  [4, 5, 1, 2, 3, _, _, _]
-        indices         =  [0, 1, 2, 3, 4, _, _, _]
-        inverse_indices =  [0, 1, 2, 3, 4, 4, 0, 1]
-        num_unique      =  [5]
-        counts          =  [2, 2, 1, 1, 2, _, _, _]
-
-        [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True)
-        output          =  [1, 2, 3, 4, 5, _, _, _]
-        indices         =  [2, 3, 4, 0, 1, _, _, _]
-        inverse_indices =  [3, 4, 0, 1, 2, 2, 3, 4]
-        num_unique      =  [5]
-    """
-    sorted_data = sort(data)
-    argsorted_indices = argsort(data, dtype="int32")
-    # adjacent difference
-    adjacent_diff = _calc_adjacent_diff(sorted_data, "int32", tir.NE)
-    # inclusive scan
-    inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0)
-    # total number of unique elements
-    num_unique_elements = _calc_num_unique(inc_scan)
-    # prepare outputs
-    if return_counts:
-        out_data_shape = [data.shape] * 3
-        out_dtypes = [data.dtype, "int32", "int32"]
-    else:
-        out_data_shape = [data.shape] * 2
-        out_dtypes = [data.dtype, "int32"]
-    # prepare inputs and fcompute
-
-    first_occurence = _calc_first_occurence(argsorted_indices, inc_scan)
-    if is_sorted:
-        in_data = [data, argsorted_indices, inc_scan]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None)
-
-        indices = first_occurence
-    else:
-        # calculate index converter by sorting unique elements by their first occurence
-        argsorted_first_occurence = argsort(first_occurence, dtype="int32")
-        index_converter = argsort(argsorted_first_occurence, dtype="int32")
-        in_data = [data, argsorted_indices, inc_scan, index_converter]
-        if return_counts:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs)
-        else:
-            fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None)
-        # First occurence is in order of sorted unique output, if we sort the first_occurence array
-        # we get the correct result
-        indices = sort(first_occurence)
-
-    outs = te.extern(
-        out_data_shape,
-        in_data,
-        fcompute,
-        dtype=out_dtypes,
-        name="_calc_unique",
-        tag="_calc_unique_cpu",
-    )
-    if return_counts:
-        return [outs[0], indices, outs[1], num_unique_elements, outs[2]]
-    return [outs[0], indices, outs[1], num_unique_elements]
diff --git a/python/tvm/topi/vision/__init__.py b/python/tvm/topi/vision/__init__.py
deleted file mode 100644
index 2861d31de0f4..000000000000
--- a/python/tvm/topi/vision/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""VISION network operators"""
-from __future__ import absolute_import as _abs
-
-from . import ssd
-from .reorg import *
-from .nms import *
-from .rcnn import *
diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py
deleted file mode 100644
index 7bd94745e226..000000000000
--- a/python/tvm/topi/vision/nms.py
+++ /dev/null
@@ -1,1183 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args
-"""Non-maximum suppression operator"""
-import tvm
-from tvm import te
-
-from tvm.te import hybrid
-from tvm.tir import if_then_else
-
-from ..sort import argsort
-from ..math import cast
-from ..transform import reshape, gather
-from .. import reduction
-from ..scan import cumsum
-from .nms_util import (
-    binary_search,
-    collect_selected_indices,
-    collect_selected_indices_and_scores,
-    run_all_class_nms,
-)
-
-
-@hybrid.script
-def hybrid_rearrange_box_out(data, one, batch_size, num_anchors):
-    """Hybrid routine to rearrange nms output to
-    move all valid entries to top.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6].
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        Transformed NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6].
-    """
-    elem_length = data.shape[2]
-    output = output_tensor((batch_size, num_anchors, elem_length), data.dtype)
-    valid_indices = allocate((batch_size,), "int32")
-
-    for i in parallel(batch_size):
-        valid_indices[i] = 0
-        for j in range(num_anchors):
-            if data[i, j, 0] >= 0:
-                for k in range(elem_length):
-                    output[i, valid_indices[i], k] = data[i, j, k]
-                valid_indices[i] += 1
-            if j >= valid_indices[i]:
-                for k in range(elem_length):
-                    output[i, j, k] = -one
-    return output
-
-
-@hybrid.script
-def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors):
-    """Hybrid routine to rearrange nms output to
-    move all valid entries to top.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        NMS output. 3-D tensor with shape
-        [batch_size, num_anchors, 6] or
-        [batch_size, num_anchors, 5], or 2-D
-        tensor with shape [batch_size, num_anchors].
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        2-D tensor with shape [batch_size, num_anchors].
-
-    valid_box_count : tvm.te.Tensor or numpy NDArray
-        Tensor with shape [batch_size, 1], indicates
-        the valid number of boxes.
-    """
-    valid_box_count = output_tensor((batch_size, 1), "int32")
-    output = output_tensor((batch_size, num_anchors), data.dtype)
-    valid_indices = allocate((batch_size,), "int32")
-
-    for i in parallel(batch_size):
-        valid_indices[i] = 0
-        for j in range(num_anchors):
-            if data[i, j] >= 0:
-                output[i, valid_indices[i]] = data[i, j]
-                valid_indices[i] += 1
-            if data[i, j] > num_anchors or data[i, j] < -num_anchors:
-                output[i, valid_indices[i]] = 0
-                valid_indices[i] += 1
-            if j >= valid_indices[i]:
-                output[i, j] = -one
-        valid_box_count[i, 0] = valid_indices[i]
-
-    return output, valid_box_count
-
-
-@hybrid.script
-def hybrid_get_valid_counts(
-    data, score_threshold, id_index, score_index, one, batch_size, num_anchors
-):
-    """Hybrid routine to get valid count of bounding boxes
-    given a score threshold. Also moves valid boxes to the
-    top of input data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        Input data. 3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    score_threshold : tvm.te.Tensor
-        Lower limit of score for valid bounding boxes.
-
-    id_index : tvm.tir.const
-        index of the class categories, -1 to disable.
-
-    score_index: tvm.tir.const
-        Index of the scores/confidence of boxes.
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        Number of anchors.
-
-    Returns
-    -------
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1-D tensor for valid number of boxes.
-
-    out_tensor : tvm.te.Tensor or numpy NDArray
-        Rearranged data tensor.
-
-    out_indices: tvm.te.Tensor or numpy NDArray
-        Related index in input data.
-    """
-    box_data_length = data.shape[2]
-    valid_count = output_tensor((batch_size,), "int32")
-    out_tensor = output_tensor((batch_size, num_anchors, box_data_length), data.dtype)
-    out_indices = output_tensor((batch_size, num_anchors), "int32")
-    for i in parallel(batch_size):
-        valid_count[i] = 0
-        for j in range(num_anchors):
-            score = data[i, j, score_index]
-            if score > score_threshold and (id_index < 0 or data[i, j, id_index] >= 0):
-                for k in range(box_data_length):
-                    out_tensor[i, valid_count[i], k] = data[i, j, k]
-                out_indices[i, valid_count[i]] = j
-                valid_count[i] += 1
-            if j >= valid_count[i]:
-                for k in range(box_data_length):
-                    out_tensor[i, j, k] = -one
-                out_indices[i, j] = -1
-    return valid_count, out_tensor, out_indices
-
-
-def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1):
-    """Get valid count of bounding boxes given a score threshold.
-    Also moves valid boxes to the top of input data.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        Input data. 3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    score_threshold : optional, float
-        Lower limit of score for valid bounding boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    Returns
-    -------
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
-
-    out_tensor : tvm.te.Tensor
-        Rearranged data tensor.
-
-    out_indices: tvm.te.Tensor or numpy NDArray
-        Related index in input data.
-    """
-    if isinstance(score_threshold, (float, int)):
-        score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype)
-    id_index_const = tvm.tir.const(id_index, "int32")
-    score_index_const = tvm.tir.const(score_index, "int32")
-    return hybrid_get_valid_counts(
-        data,
-        score_threshold,
-        id_index_const,
-        score_index_const,
-        tvm.tir.const(1, data.dtype),
-        data.shape[0],
-        data.shape[1],
-    )
-
-
-@hybrid.script
-def hybrid_nms(
-    data,
-    sorted_index,
-    valid_count,
-    indices,
-    batch_size,
-    num_anchors,
-    max_output_size,
-    iou_threshold,
-    force_suppress,
-    top_k,
-    coord_start,
-    score_index,
-    id_index,
-    return_indices,
-    zero,
-    one,
-):
-    """Hybrid routing for non-maximum suppression.
-
-    Parameters
-    ----------
-    data: tvm.te.Tensor or numpy NDArray
-        Bounding boxes with class and score. 3-D tensor with shape
-        [batch_size, num_anchors, 6]. It could be the second output
-        out_tensor of get_valid_counts.
-
-    sorted_index : tvm.te.Tensor or numpy NDArray
-        Bounding box indexes sorted by score, with shape
-        [batch_size, num_anchors].
-
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1-D tensor for valid number of boxes. It could be the output
-        valid_count of get_valid_counts.
-
-    indices : tvm.te.Tensor or numpy.NDArray
-        indices in original tensor, with shape [batch_size, num_anchors],
-        represents the index of box in original data. It could be the third
-        output out_indices of get_valid_counts. The values in the second
-        dimension are like the output of arange(num_anchors) if get_valid_counts
-        is not used before non_max_suppression.
-
-    batch_size: tvm.tir.IntImm or tvm.tir.Var
-        Batch size. We need to pass it in since hybrid script doesn't support
-        binding variable to symbolic dim.
-
-    num_anchors: tvm.tir.IntImm or tvm.tir.Var
-        The number of anchors.
-
-    max_output_size : tvm.te.Tensor
-        Max number of output valid boxes for each instance.
-        Return all valid boxes if max_output_size < 0.
-
-    iou_threshold : tvm.te.Tensor
-        Overlapping(IoU) threshold to suppress object with smaller score.
-
-    force_suppress : tvm.tir.const
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : tvm.tir.const
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : tvm.tir.const
-        Start index of the consecutive 4 coordinates.
-
-    score_index: tvm.tir.const
-        Index of the scores/confidence of boxes.
-
-    id_index : tvm.tir.const
-        index of the class categories, -1 to disable.
-
-    return_indices : tvm.tir.const
-        Whether to return box indices in input data.
-
-    zero: tvm.tir.const
-        Constant zero with the same dtype as data.
-
-    one: tvm.tir.const
-        Constant one with the same dtype as data.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5].
-
-    box_indices: tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors].
-    """
-
-    box_data_length = data.shape[2]
-
-    # box_indices is the expected indices of boxes
-    box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype)
-    output = output_tensor(
-        (
-            batch_size,
-            num_anchors,
-            box_data_length,
-        ),
-        data.dtype,
-    )
-
-    for i in range(batch_size):
-        if iou_threshold > 0:
-            if valid_count[i] > 0:
-                # Reorder output
-                nkeep = valid_count[i]
-                if 0 < top_k < nkeep:
-                    nkeep = top_k
-                for j in parallel(nkeep):
-                    for k in range(box_data_length):
-                        output[i, j, k] = data[i, sorted_index[i, j], k]
-                    box_indices[i, j] = sorted_index[i, j]
-                if 0 < top_k < valid_count[i]:
-                    for j in parallel(valid_count[i] - nkeep):
-                        for k in range(box_data_length):
-                            output[i, j + nkeep, k] = -one
-                        box_indices[i, j + nkeep] = -1
-
-            # Apply nms
-            box_start_idx = coord_start
-            batch_idx = i
-            num_valid_boxes = 0
-
-            for j in range(valid_count[i]):
-                if num_valid_boxes == max_output_size:
-                    for k in range(box_data_length):
-                        output[i, j, k] = -one
-                    box_indices[i, j] = -1
-
-                elif output[i, j, score_index] > 0:
-                    box_a_idx = j
-                    is_valid_box = 1
-
-                    # a_l: left, a_t: top, a_r: right, a_b: bottom
-                    a_l = min(
-                        output[batch_idx, box_a_idx, box_start_idx],
-                        output[batch_idx, box_a_idx, box_start_idx + 2],
-                    )
-                    a_t = min(
-                        output[batch_idx, box_a_idx, box_start_idx + 1],
-                        output[batch_idx, box_a_idx, box_start_idx + 3],
-                    )
-                    a_r = max(
-                        output[batch_idx, box_a_idx, box_start_idx],
-                        output[batch_idx, box_a_idx, box_start_idx + 2],
-                    )
-                    a_b = max(
-                        output[batch_idx, box_a_idx, box_start_idx + 1],
-                        output[batch_idx, box_a_idx, box_start_idx + 3],
-                    )
-
-                    # check if current box j is valid by calculating iou with
-                    # all existing valid boxes
-                    for k in range(j):
-                        check_iou = 0
-                        if (
-                            is_valid_box == 1
-                            and k < j
-                            and output[i, k, score_index] > 0
-                            and (id_index < 0 or output[i, k, id_index] >= 0)
-                        ):
-                            if force_suppress:
-                                check_iou = 1
-                            elif id_index < 0 or output[i, j, id_index] == output[i, k, id_index]:
-                                check_iou = 1
-
-                        if check_iou > 0:
-                            box_b_idx = k
-
-                            # b_l: left, b_t: top, b_r: right, b_b: bottom
-                            b_l = min(
-                                output[batch_idx, box_b_idx, box_start_idx],
-                                output[batch_idx, box_b_idx, box_start_idx + 2],
-                            )
-                            b_t = min(
-                                output[batch_idx, box_b_idx, box_start_idx + 1],
-                                output[batch_idx, box_b_idx, box_start_idx + 3],
-                            )
-                            b_r = max(
-                                output[batch_idx, box_b_idx, box_start_idx],
-                                output[batch_idx, box_b_idx, box_start_idx + 2],
-                            )
-                            b_b = max(
-                                output[batch_idx, box_b_idx, box_start_idx + 1],
-                                output[batch_idx, box_b_idx, box_start_idx + 3],
-                            )
-
-                            # Overlapping width and height
-                            w = max(zero, min(a_r, b_r) - max(a_l, b_l))
-                            h = max(zero, min(a_b, b_b) - max(a_t, b_t))
-
-                            # Overlapping area
-                            area = h * w
-
-                            # total area of the figure formed by box a and box b
-                            # except for overlapping area
-                            u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
-
-                            # get the iou
-                            iou = zero if u <= zero else area / u
-
-                            if iou >= iou_threshold:
-                                is_valid_box = 0
-
-                    if is_valid_box == 0:
-                        for k in range(box_data_length):
-                            output[i, j, k] = -one
-                        box_indices[i, j] = -1
-                    else:
-                        num_valid_boxes += 1
-
-        else:
-            for j in parallel(valid_count[i]):
-                for k in range(box_data_length):
-                    output[i, j, k] = data[i, j, k]
-                box_indices[i, j] = j
-
-        # Set invalid entry to be -1
-        for j in parallel(num_anchors - valid_count[i]):
-            for k in range(box_data_length):
-                output[i, j + valid_count[i], k] = -one
-            box_indices[i, j + valid_count[i]] = -1
-
-        if return_indices:
-            for j in range(valid_count[i]):
-                idx = box_indices[i, j]
-                if box_indices[i, j] >= 0:
-                    box_indices[i, j] = indices[i, idx]
-
-    return output, box_indices
-
-
-@tvm.target.generic_func
-def non_max_suppression(
-    data,
-    valid_count,
-    indices,
-    max_output_size=-1,
-    iou_threshold=0.5,
-    force_suppress=False,
-    top_k=-1,
-    coord_start=2,
-    score_index=1,
-    id_index=0,
-    return_indices=True,
-    invalid_to_bottom=False,
-):
-    """Non-maximum suppression operator for object detection.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5].
-
-    valid_count : tvm.te.Tensor
-        1-D tensor for valid number of boxes.
-
-    indices : tvm.te.Tensor
-        2-D tensor with shape [batch_size, num_anchors].
-
-    max_output_size : optional, int or tvm.te.Tensor
-        Max number of output valid boxes for each instance.
-        Return all valid boxes if the value of max_output_size is less than 0.
-
-    iou_threshold : optional, float or tvm.te.Tensor
-        Non-maximum suppression threshold.
-
-    force_suppress : optional, boolean
-        Whether to suppress all detections regardless of class_id.
-
-    top_k : optional, int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    coord_start : required, int
-        Start index of the consecutive 4 coordinates.
-
-    score_index: optional, int
-        Index of the scores/confidence of boxes.
-
-    id_index : optional, int
-        index of the class categories, -1 to disable.
-
-    return_indices : optional, boolean
-        Whether to return box indices in input data.
-
-    invalid_to_bottom : optional, boolean
-        Whether to move all valid bounding boxes to the top.
-
-    Returns
-    -------
-    out : tvm.te.Tensor or tuple of tvm.te.Tensor
-        3-D tensor with shape [batch_size, num_anchors, 6]
-        or [batch_size, num_anchors, 5]. Out is a tuple of tvm.te.Tensor
-        if return_indices is True, the Tensor in the tuple is 2-D tensor
-        with shape [batch_size, num_anchors] and shape
-        [batch_size, num_valid_anchors] respectively.
-
-    Example
-    --------
-    .. code-block:: python
-
-        # An example to use non_max_suppression
-        dshape = (1, 5, 6)
-        data = te.placeholder(dshape, name="data")
-        valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count")
-        iou_threshold = 0.7
-        force_suppress = True
-        top_k = -1
-        out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold,
-                                  force_suppress=force_suppress, top_k=top_k)
-        np_data = np.random.uniform(dshape)
-        np_valid_count = np.array([4])
-        s = topi.generic.schedule_nms(out)
-        f = tvm.build(s, [data, valid_count, out], "llvm")
-        dev = tvm.cpu()
-        tvm_data = tvm.nd.array(np_data, dev)
-        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
-        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
-        f(tvm_data, tvm_valid_count, tvm_out)
-    """
-    batch_size = data.shape[0]
-    num_anchors = data.shape[1]
-    if isinstance(max_output_size, int):
-        max_output_size = tvm.tir.const(max_output_size, dtype="int32")
-    if isinstance(iou_threshold, float):
-        iou_threshold = tvm.tir.const(iou_threshold, dtype=data.dtype)
-    score_axis = score_index
-    score_shape = (batch_size, num_anchors)
-    score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis])
-    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False)
-
-    out, box_indices = hybrid_nms(
-        data,
-        sort_tensor,
-        valid_count,
-        indices,
-        batch_size,
-        num_anchors,
-        max_output_size,
-        iou_threshold,
-        tvm.tir.const(force_suppress, dtype="bool"),
-        tvm.tir.const(top_k, dtype="int32"),
-        tvm.tir.const(coord_start, dtype="int32"),
-        tvm.tir.const(score_index, dtype="int32"),
-        tvm.tir.const(id_index, dtype="int32"),
-        tvm.tir.const(return_indices, dtype="bool"),
-        zero=tvm.tir.const(0, dtype=data.dtype),
-        one=tvm.tir.const(1, dtype=data.dtype),
-    )
-
-    if return_indices:
-        return hybrid_rearrange_indices_out(
-            box_indices,
-            one=tvm.tir.const(1, dtype="int32"),
-            batch_size=batch_size,
-            num_anchors=num_anchors,
-        )
-
-    if invalid_to_bottom:
-        out = hybrid_rearrange_box_out(
-            out,
-            one=tvm.tir.const(1, dtype=data.dtype),
-            batch_size=batch_size,
-            num_anchors=num_anchors,
-        )
-    return out
-
-
-def _nms_loop(
-    ib,
-    batch_size,
-    top_k,
-    iou_threshold,
-    max_output_size,
-    valid_count,
-    on_new_valid_box_func,
-    on_new_invalidated_box_func,
-    needs_bbox_check_func,
-    calc_overlap_func,
-    out_scores,
-    num_valid_boxes,
-):
-    def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local):
-        # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
-        on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j)
-        num_valid_boxes_local[0] += 1
-
-        num_boxes_to_check = nkeep - (j + 1)
-
-        with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k:
-            k = j + 1 + _k
-
-            with ib.if_scope(
-                tvm.tir.all(
-                    k < nkeep,
-                    out_scores[i, k] > 0,  # is the box k still valid?
-                    needs_bbox_check_func(i, j, k),
-                )
-            ):
-                iou = calc_overlap_func(i, j, k)
-
-                with ib.if_scope(iou >= iou_threshold):
-                    # invalidate the box k
-                    out_scores[i, k] = -1.0
-                    on_new_invalidated_box_func(i, k)
-
-    with ib.for_range(0, batch_size, name="i") as i:
-        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i])
-        max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep)
-
-        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
-            num_valid_boxes_local = ib.allocate(
-                "int32", (1,), name="num_valid_boxes_local", scope="local"
-            )
-            box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local")
-            num_valid_boxes_local[0] = 0
-            box_idx[0] = 0
-
-            # Apply nms
-            # No need to do more iteration if we have already reached max_output_size boxes
-            with ib.while_loop(
-                tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)
-            ):
-                # Proceed to the inner loop if the box with id box_idx is still valid
-                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
-                    nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local)
-                box_idx[0] += 1
-
-            num_valid_boxes[i] = num_valid_boxes_local[0]
-
-        with ib.else_scope():
-            num_valid_boxes[i] = 0
-
-    return ib.get()
-
-
-def _get_valid_box_count(scores, score_threshold):
-    batch_classes, num_boxes = scores.shape
-
-    def searchsorted_ir(scores, valid_count):
-        ib = tvm.tir.ir_builder.create()
-        scores = ib.buffer_ptr(scores)
-        valid_count = ib.buffer_ptr(valid_count)
-
-        with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
-            binary_search(ib, i, num_boxes, scores, score_threshold, valid_count)
-
-        return ib.get()
-
-    scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8)
-
-    return te.extern(
-        [(batch_classes,)],
-        [scores],
-        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
-        dtype=["int32"],
-        in_buffers=[scores_buf],
-        name="searchsorted",
-        tag="searchsorted",
-    )
-
-
-def _collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out):
-    batch_classes, _ = selected_indices.shape
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    out = ib.buffer_ptr(out)
-
-    with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
-        i = cast(i, "int64")
-        batch_id = i // num_class
-        class_id = i % num_class
-
-        with ib.for_range(0, num_detections[i], name="j") as j:
-            out[row_offsets[i] + j, 0] = batch_id
-            out[row_offsets[i] + j, 1] = class_id
-            out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64")
-
-    return ib.get()
-
-
-def _collect_selected_indices_and_scores_ir(
-    selected_indices,
-    selected_scores,
-    num_detections,
-    row_offsets,
-    num_total_detections,
-    collected_indices,
-    collected_scores,
-):
-    batch_size, num_class = row_offsets.shape
-    num_boxes = selected_indices.shape[1]
-
-    ib = tvm.tir.ir_builder.create()
-
-    selected_indices = ib.buffer_ptr(selected_indices)
-    selected_scores = ib.buffer_ptr(selected_scores)
-    num_detections = ib.buffer_ptr(num_detections)
-    row_offsets = ib.buffer_ptr(row_offsets)
-    num_total_detections = ib.buffer_ptr(num_total_detections)
-    collected_indices = ib.buffer_ptr(collected_indices)
-    collected_scores = ib.buffer_ptr(collected_scores)
-    zero = cast(0, "int64")
-
-    with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i:
-        i = cast(i, "int64")
-        batch_id = i // num_class
-        class_id = i % num_class
-
-        with ib.for_range(0, num_boxes, name="j") as j:
-            with ib.if_scope(j < num_detections[batch_id, class_id]):
-                offset = row_offsets[batch_id, class_id] + j
-                collected_indices[batch_id, offset, 0] = class_id
-                collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64")
-                collected_scores[batch_id, offset] = selected_scores[i, j]
-            with ib.else_scope():
-                offset = (
-                    num_total_detections[batch_id]
-                    + class_id * num_boxes
-                    - row_offsets[batch_id, class_id]
-                    + j
-                    - num_detections[batch_id, class_id]
-                )
-                collected_indices[batch_id, offset, 0] = zero
-                collected_indices[batch_id, offset, 1] = zero
-                collected_scores[batch_id, offset] = 0.0
-
-    return ib.get()
-
-
-def all_class_non_max_suppression(
-    boxes,
-    scores,
-    max_output_boxes_per_class,
-    iou_threshold,
-    score_threshold,
-    output_format="onnx",
-):
-    """Non-maximum suppression operator for object detection, corresponding to ONNX
-    NonMaxSuppression and TensorFlow combined_non_max_suppression.
-    NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_classes, num_boxes)
-
-    max_output_boxes_per_class : int or tvm.te.Tensor, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or tvm.te.Tensor, optionaIl
-        IoU test threshold
-
-    score_threshold : float or tvm.te.Tensor, optional
-        Score threshold to filter out low score boxes early
-
-    output_format : str, optional
-        "onnx" or "tensorflow", see below.
-
-    Returns
-    -------
-    out : list of tvm.te.Tensor
-        If `output_format` is "onnx", the output is two tensors. The first is `indices` of size
-        `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor
-        `num_total_detection` of shape `(1,)` representing the total number of selected
-        boxes. The three values in `indices` encode batch, class, and box indices.
-        Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come
-        first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of
-        `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection`
-        rows are valid.
-
-        If `output_format` is "tensorflow", the output is three tensors, the first
-        is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of
-        size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size
-        `(batch_size,)` representing the total number of selected boxes per batch. The two values
-        in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at
-        batch b, only the first `num_total_detection[b]` entries are valid. The second axis of
-        `indices` and `scores` are sorted within each class by box scores, but not across classes.
-        So the box indices and scores for the class 0 come first in a sorted order, followed by
-        the class 1 etc.
-    """
-    batch, num_class, num_boxes = scores.shape
-    scores = reshape(scores, (batch * num_class, num_boxes))
-
-    sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32")
-    sorted_scores = gather(scores, 1, sorted_indices)
-
-    valid_count = _get_valid_box_count(sorted_scores, score_threshold)
-
-    selected_indices, selected_scores, num_detections = run_all_class_nms(
-        boxes,
-        sorted_scores,
-        sorted_indices,
-        valid_count,
-        max_output_boxes_per_class,
-        iou_threshold,
-        _nms_loop,
-        return_scores=(output_format == "tensorflow"),
-    )
-
-    if output_format == "onnx":
-        row_offsets = cumsum(num_detections, exclusive=True, dtype="int64")
-        num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1)
-
-        selected_indices = collect_selected_indices(
-            num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir
-        )
-        return [selected_indices, num_total_detections]
-
-    num_detections_per_batch = reshape(num_detections, (batch, num_class))
-    row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1)
-    num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1)
-
-    selected_indices, selected_scores = collect_selected_indices_and_scores(
-        selected_indices,
-        selected_scores,
-        num_detections_per_batch,
-        row_offsets,
-        num_total_detections,
-        _collect_selected_indices_and_scores_ir,
-    )
-
-    return [selected_indices, selected_scores, num_total_detections]
-
-
-@hybrid.script
-def hybrid_regular_nms(
-    boxes,
-    scores,
-    max_detections_per_class,
-    max_detections,
-    batch_size,
-    num_boxes,
-    num_classes,
-    num_classes_with_background,
-    iou_threshold,
-    score_threshold,
-):
-    """Hybrid routing for regular non-maximum suppression.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, num_classes_with_background)
-
-    max_detections_per_class : tvm.tir.const
-        The maxinum number of output selected boxes per class
-
-    max_detections : tvm.tir.const
-        The maxinum number of output selected boxes
-
-    batch_size : tvm.tir.IntImm or tvm.tir.Var
-        The number of batches
-
-    num_boxes : tvm.tir.IntImm or tvm.tir.Var
-        The number of bounding boxes
-
-    num_classes : tvm.tir.const
-        The number of classes without background
-
-    num_classes_with_background : tvm.tir.IntImm or tvm.tir.Var
-        The number of classes including background ones
-
-    iou_threshold : tvm.tir.const
-        IoU test threshold
-
-    score_threshold : tvm.tir.const
-        Score threshold to filter out low score boxes early
-
-    Returns
-    -------
-    detection_boxes : tvm.te.Tensor
-        3-D tensor with shape [batch_size, max_detections, 4].
-
-    detection_classes : tvm.te.Tensor
-        2-D tensor with shape [batch_size, max_detections].
-
-    detection_scores : tvm.te.Tensor
-        2-D tensor with shape [batch_size, max_detections].
-
-    num_detections : tvm.te.Tensor
-        1-D tensor with shape [batch_size].
-    """
-    # output tensors
-    detection_boxes = output_tensor((batch_size, max_detections, 4), boxes.dtype)
-    detection_classes = output_tensor((batch_size, max_detections), "int32")
-    detection_scores = output_tensor((batch_size, max_detections), scores.dtype)
-    num_detections = output_tensor((batch_size,), "int32")
-
-    # scratch buffers
-    class_scores = allocate((num_boxes,), scores.dtype)
-    keep_indices = allocate((num_boxes,), "int32")
-    keep_scores = allocate((num_boxes,), scores.dtype)
-    sorted_indices = allocate((max_detections + num_boxes,), "int32")
-    sorted_scores = allocate((max_detections + num_boxes,), scores.dtype)
-    active_box_candidate = allocate((num_boxes,), "int32")
-    selected = allocate((num_boxes,), "int32")
-    box_indices_after_regular_nms = allocate((max_detections + num_boxes,), "int32")
-    scores_after_regular_nms = allocate((max_detections + num_boxes,), scores.dtype)
-
-    label_offset = num_classes_with_background - num_classes
-    tmp_idx = 0
-
-    for batch_idx in range(batch_size):
-        size_of_sorted_indices = 0
-
-        for class_id in range(num_classes):
-            for box_id in range(num_boxes):
-                # get scores of boxes corresponding to all anchors for single class
-                class_scores[box_id] = scores[batch_idx, box_id, class_id + label_offset]
-
-            # perform non-maximal suppression on single class
-
-            # select detections above score threshold
-            num_scores_kept = 0
-            for i in range(num_boxes):
-                if class_scores[i] >= score_threshold:
-                    keep_scores[num_scores_kept] = class_scores[i]
-                    keep_indices[num_scores_kept] = i
-                    num_scores_kept += 1
-
-            # iota
-            for i in range(num_scores_kept):
-                sorted_indices[i] = i
-            # decreasing sort of scores
-            for i in range(num_scores_kept):
-                for j in range(num_scores_kept - i - 1):
-                    if keep_scores[sorted_indices[j]] < keep_scores[sorted_indices[j + 1]]:
-                        tmp_idx = sorted_indices[j]
-                        sorted_indices[j] = sorted_indices[j + 1]
-                        sorted_indices[j + 1] = tmp_idx
-
-            selected_size = 0
-
-            for i in range(num_scores_kept):
-                active_box_candidate[i] = 1
-
-            num_active_candidate = num_scores_kept
-            for i in range(num_scores_kept):
-                if (
-                    num_active_candidate != 0
-                    and selected_size < min(num_scores_kept, max_detections_per_class)
-                    and active_box_candidate[i] == 1
-                ):
-                    selected[selected_size] = keep_indices[sorted_indices[i]]
-                    selected_size += 1
-
-                    active_box_candidate[i] = 0
-                    num_active_candidate -= 1
-
-                    for j in range(i + 1, num_scores_kept):
-                        if active_box_candidate[j] == 1:
-                            # compute IOU
-                            i_ymin = boxes[batch_idx, keep_indices[sorted_indices[i]], 0]
-                            i_xmin = boxes[batch_idx, keep_indices[sorted_indices[i]], 1]
-                            i_ymax = boxes[batch_idx, keep_indices[sorted_indices[i]], 2]
-                            i_xmax = boxes[batch_idx, keep_indices[sorted_indices[i]], 3]
-
-                            j_ymin = boxes[batch_idx, keep_indices[sorted_indices[j]], 0]
-                            j_xmin = boxes[batch_idx, keep_indices[sorted_indices[j]], 1]
-                            j_ymax = boxes[batch_idx, keep_indices[sorted_indices[j]], 2]
-                            j_xmax = boxes[batch_idx, keep_indices[sorted_indices[j]], 3]
-
-                            area_i = (i_ymax - i_ymin) * (i_xmax - i_xmin)
-                            area_j = (j_ymax - j_ymin) * (j_xmax - j_xmin)
-
-                            iou = 0.0
-                            if area_i > 0 and area_j > 0:
-                                intersection_ymin = max(i_ymin, j_ymin)
-                                intersection_xmin = max(i_xmin, j_xmin)
-                                intersection_ymax = min(i_ymax, j_ymax)
-                                intersection_xmax = min(i_xmax, j_xmax)
-                                intersection_area = max(
-                                    intersection_ymax - intersection_ymin, 0.0
-                                ) * max(intersection_xmax - intersection_xmin, 0.0)
-                                iou = intersection_area / (area_i + area_j - intersection_area)
-
-                            if iou > iou_threshold:
-                                active_box_candidate[j] = 0
-                                num_active_candidate -= 1
-
-            # end of non-maximal suppression on single class
-
-            # add selected indices from non-max suppression of boxes in this class
-            output_index = size_of_sorted_indices
-            for i in range(selected_size):
-                selected_index = selected[i]
-
-                box_indices_after_regular_nms[output_index] = (
-                    selected_index * num_classes_with_background + class_id + label_offset
-                )
-                scores_after_regular_nms[output_index] = class_scores[selected_index]
-
-                output_index += 1
-
-            # sort the max scores among the selected indices
-            # get the indices for top scores
-            num_indices_to_sort = min(output_index, max_detections)
-
-            # iota
-            for i in range(output_index):
-                sorted_indices[i] = i
-            # deacreasing sort of scores
-            for i in range(output_index):
-                for j in range(output_index - i - 1):
-                    if (
-                        scores_after_regular_nms[sorted_indices[j]]
-                        < scores_after_regular_nms[sorted_indices[j + 1]]
-                    ):
-                        tmp_idx = sorted_indices[j]
-                        sorted_indices[j] = sorted_indices[j + 1]
-                        sorted_indices[j + 1] = tmp_idx
-
-            # copy values to temporary vectors
-            for i in range(num_indices_to_sort):
-                sorted_scores[i] = scores_after_regular_nms[sorted_indices[i]]
-                sorted_indices[i] = box_indices_after_regular_nms[sorted_indices[i]]
-
-            # copy scores and indices from temporary vectors
-            for i in range(num_indices_to_sort):
-                box_indices_after_regular_nms[i] = sorted_indices[i]
-                scores_after_regular_nms[i] = sorted_scores[i]
-
-            size_of_sorted_indices = num_indices_to_sort
-
-        # fill output tensors
-        for output_box_index in range(max_detections):
-            box_ymin = 0.0
-            box_xmin = 0.0
-            box_ymax = 0.0
-            box_xmax = 0.0
-            class_idx = 0
-            selected_score = 0.0
-
-            if output_box_index < size_of_sorted_indices:
-                anchor_idx = (
-                    box_indices_after_regular_nms[output_box_index] // num_classes_with_background
-                )
-
-                box_ymin = boxes[batch_idx, anchor_idx, 0]
-                box_xmin = boxes[batch_idx, anchor_idx, 1]
-                box_ymax = boxes[batch_idx, anchor_idx, 2]
-                box_xmax = boxes[batch_idx, anchor_idx, 3]
-                class_idx = (
-                    box_indices_after_regular_nms[output_box_index]
-                    - anchor_idx * num_classes_with_background
-                    - label_offset
-                )
-                selected_score = scores_after_regular_nms[output_box_index]
-
-            detection_boxes[batch_idx, output_box_index, 0] = box_ymin
-            detection_boxes[batch_idx, output_box_index, 1] = box_xmin
-            detection_boxes[batch_idx, output_box_index, 2] = box_ymax
-            detection_boxes[batch_idx, output_box_index, 3] = box_xmax
-            detection_classes[batch_idx, output_box_index] = class_idx
-            detection_scores[batch_idx, output_box_index] = selected_score
-
-        num_detections[batch_idx] = size_of_sorted_indices
-
-    return detection_boxes, detection_classes, detection_scores, num_detections
-
-
-def regular_non_max_suppression(
-    boxes,
-    scores,
-    max_detections_per_class,
-    max_detections,
-    num_classes,
-    iou_threshold,
-    score_threshold,
-):
-    """Regular non-maximum suppression operator for object detection, corresponding to TFLite's
-    regular NMS. NMS is performed for each class separately.
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4). The four values in boxes
-        encode (ymin, xmin, ymax, xmax) coordinates of a box
-
-    scores: tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, num_classes_with_background)
-
-    max_detections_per_class : int
-        The maxinum number of output selected boxes per class
-
-    max_detections : int
-        The maxinum number of output selected boxes
-
-    num_classes : int
-        The number of classes without background
-
-    iou_threshold : float
-        IoU test threshold
-
-    score_threshold : float
-        Score threshold to filter out low score boxes early
-
-    Returns
-    -------
-    out : list of tvm.te.Tensor
-        The output is a list of four tensors. The first is `detection_boxes` of size
-        `(batch_size, max_detections , 4)`, the second is `detection_classes` of size
-        `(batch_size, max_detections)`, the third is `detection_scores` of size
-        `(batch_size, max_detections)`, and the fourth is `num_detections` of size `(batch_size,)`
-        representing the total number of selected boxes per batch.
-    """
-    batch_size, num_boxes, num_classes_with_background = scores.shape
-
-    detection_boxes, detection_classes, detection_scores, num_detections = hybrid_regular_nms(
-        boxes=boxes,
-        scores=scores,
-        max_detections_per_class=tvm.tir.const(max_detections_per_class, dtype="int32"),
-        max_detections=tvm.tir.const(max_detections, dtype="int32"),
-        batch_size=batch_size,
-        num_boxes=num_boxes,
-        num_classes=tvm.tir.const(num_classes, dtype="int32"),
-        num_classes_with_background=num_classes_with_background,
-        iou_threshold=tvm.tir.const(iou_threshold, dtype="float32"),
-        score_threshold=tvm.tir.const(score_threshold, dtype="float32"),
-    )
-
-    return [
-        detection_boxes,
-        cast(detection_classes, dtype="float32"),
-        detection_scores,
-        num_detections,
-    ]
diff --git a/python/tvm/topi/vision/nms_util.py b/python/tvm/topi/vision/nms_util.py
deleted file mode 100644
index d12592fd111a..000000000000
--- a/python/tvm/topi/vision/nms_util.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Common utilities used in Non-maximum suppression operators"""
-import tvm
-from tvm import te
-
-
-def _get_boundaries(output, box_idx):
-    l = tvm.te.min(
-        output[box_idx],
-        output[box_idx + 2],
-    )
-    t = tvm.te.min(
-        output[box_idx + 1],
-        output[box_idx + 3],
-    )
-    r = tvm.te.max(
-        output[box_idx],
-        output[box_idx + 2],
-    )
-    b = tvm.te.max(
-        output[box_idx + 1],
-        output[box_idx + 3],
-    )
-    return l, t, r, b
-
-
-def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
-    """Calculate overlap of two boxes."""
-    a_l, a_t, a_r, a_b = _get_boundaries(out_tensor, box_a_idx)
-    b_l, b_t, b_r, b_b = _get_boundaries(out_tensor, box_b_idx)
-
-    # Overlapping width and height
-    w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l))
-    h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t))
-
-    # Overlapping area
-    area = h * w
-
-    # total area of the figure formed by box a and box b
-    # except for overlapping area
-    u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
-    return tvm.tir.Select(u <= 0.0, 0.0, area / u)
-
-
-def binary_search(ib, y, num_boxes, scores, score_threshold, out):
-    """Binary search for score_threshold on scores sorted in descending order"""
-    lo = ib.allocate("int32", (1,), name="lo", scope="local")
-    hi = ib.allocate("int32", (1,), name="hi", scope="local")
-
-    lo[0] = 0
-    hi[0] = num_boxes
-
-    with ib.while_loop(lo[0] < hi[0]):
-        mid = (hi[0] + lo[0]) >> 1
-        with ib.if_scope(scores[y, mid] > score_threshold):
-            lo[0] = mid + 1
-        with ib.else_scope():
-            hi[0] = mid
-
-    out[y] = lo[0]
-
-
-def collect_selected_indices(num_class, selected_indices, num_detections, row_offsets, ir):
-    """Collect selected indices from the core NMS loop into one linear output
-
-    Parameters
-    ----------
-    num_class : int
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
-        of selected boxes by the core NMS loop.
-
-    num_detections tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), representing
-        the number of boxes selected by the core NMS loop, per batch and class
-
-    row_offsets tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), this should be the exclusive scan
-        of num_detections
-
-    ir : function
-        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        The output is indices of size (batch_size * num_class* num_boxes , 3).
-        Rows of indices are ordered such that selected boxes from batch 0, class 0 come
-        first, in descending of scores, followed by boxes from batch 0, class 1 etc.
-    """
-    batch_class, num_boxes = selected_indices.shape
-    return te.extern(
-        [(batch_class * num_boxes, 3)],
-        [selected_indices, num_detections, row_offsets],
-        lambda ins, outs: ir(num_class, ins[0], ins[1], ins[2], outs[0]),
-        dtype=["int64"],
-        name="collect_indices",
-        tag="collect_indices",
-    )
-
-
-def collect_selected_indices_and_scores(
-    selected_indices, selected_scores, num_detections, row_offsets, num_total_detections, ir
-):
-    """Collect selected indices and scores from the core NMS loop into one linear output
-
-    Parameters
-    ----------
-    num_class : int
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices
-        of selected boxes by the core NMS loop.
-
-    selected_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes), representing the scores
-        of selected boxes by the core NMS loop.
-
-    num_detections tvm.te.Tensor
-        2-D tensor with shape (batch_size, num_classes), representing
-        the number of boxes selected by the core NMS loop, per batch and class
-
-    row_offsets tvm.te.Tensor
-        2-D tensor with shape (batch_size, num_classes), this should be the exclusive scan
-        of num_detections along axis 1
-
-    ir : function
-        A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py
-
-    Returns
-    -------
-    out : [tvm.te.Tensor, tvm.te.Tensor]
-        The output is two tensors. The first is indices of size
-        (batch_size, num_class* num_boxes, 2), and the second is scores of size
-        (batch_size, num_class* num_boxes).
-    """
-    batch_size, num_class = row_offsets.shape
-    num_boxes = selected_indices.shape[1]
-    return te.extern(
-        [(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)],
-        [selected_indices, selected_scores, num_detections, row_offsets, num_total_detections],
-        lambda ins, outs: ir(ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], outs[1]),
-        dtype=["int64", "float32"],
-        name="collect_indices_and_scores",
-        tag="collect_indices_and_scores",
-    )
-
-
-def _all_class_nms_ir(
-    boxes,
-    sorted_scores,
-    sorted_indices,
-    valid_count,
-    batch_class,
-    num_class,
-    num_anchors,
-    iou_threshold,
-    max_output_size_per_class,
-    box_indices,
-    selected_scores,
-    num_valid_boxes,
-    nms_loop,
-):
-    ib = tvm.tir.ir_builder.create()
-    boxes = ib.buffer_ptr(boxes)
-    sorted_scores = ib.buffer_ptr(sorted_scores)
-    sorted_indices = ib.buffer_ptr(sorted_indices)
-    valid_count = ib.buffer_ptr(valid_count)
-    box_indices = ib.buffer_ptr(box_indices)
-    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)
-
-    if selected_scores is not None:
-        selected_scores = ib.buffer_ptr(selected_scores)
-
-    if isinstance(iou_threshold, float):
-        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
-
-    if isinstance(max_output_size_per_class, int):
-        max_output_size_per_class = tvm.tir.const(max_output_size_per_class)
-
-    def calc_overlap(i, j, k):
-        offset_j = sorted_indices[i, j] * 4
-        offset_k = sorted_indices[i, k] * 4
-        batch_id = i // num_class
-        base_bbox_idx = batch_id * num_anchors * 4
-        return calculate_overlap(
-            boxes,
-            base_bbox_idx + offset_j,
-            base_bbox_idx + offset_k,
-        )
-
-    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
-        with ib.if_scope(tid + 0 == 0):
-            box_indices[i, num_current_valid_box] = sorted_indices[i, j]
-
-            if selected_scores is not None:
-                selected_scores[i, num_current_valid_box] = sorted_scores[i, j]
-
-    def on_new_invalidated_box(*_):
-        pass
-
-    def needs_bbox_check(*_):
-        return tvm.tir.const(True)
-
-    return nms_loop(
-        ib,
-        batch_class,
-        tvm.tir.IntImm("int32", -1),  # top_k
-        iou_threshold,
-        max_output_size_per_class,
-        valid_count,
-        on_new_valid_box,
-        on_new_invalidated_box,
-        needs_bbox_check,
-        calc_overlap,
-        sorted_scores,
-        num_valid_boxes,
-    )
-
-
-def run_all_class_nms(
-    boxes,
-    sorted_scores,
-    sorted_indices,
-    valid_count,
-    max_output_size_per_class,
-    iou_threshold,
-    nms_loop,
-    return_scores=False,
-):
-    """The core all class NMS routine
-
-    Parameters
-    ----------
-    boxes : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_boxes, 4)
-
-    sorted_scores: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes)
-        One of the outputs from argsort
-
-    sorted_indices: tvm.te.Tensor
-        2-D tensor with shape (batch_size * num_classes, num_boxes)
-        The other output from argsort
-
-    valid_count: tvm.te.Tensor
-        1-D tensor with shape (batch_size * num_classes,), representing
-        the number of boxes whose score is above score_threshold, per batch and class
-
-    max_output_boxes_per_class : int or tvm.te.Tensor, optional
-        The maxinum number of output selected boxes per class
-
-    iou_threshold : float or tvm.te.Tensor, optionaIl
-        IoU test threshold
-
-    nms_loop : function
-        A core NMS loop, see its usage in vision/nms.py and cuda/nms.py
-
-    return_scores : bool, optional
-        Whether or not to return selected scores, needed by the tensorflow output format.
-
-    Returns
-    -------
-    out : a list of tvm.te.Tensor
-        The output is three tensors, the first and second are indices and scores of size
-        (batch_size * num_class, num_boxes), and the third is a tensor
-        num_selected_boxes of shape (batch_size * num_class,) representing the total number of
-        selected boxes per batch and class. If return_scores is False, the second output is
-        None.
-    """
-    batch, num_boxes, _ = boxes.shape
-    batch_class = sorted_scores.shape[0]
-    num_class = batch_class // batch
-
-    if return_scores is False:
-        selected_indices, num_detections = te.extern(
-            [(batch_class, num_boxes), (1, batch_class)],
-            [boxes, sorted_scores, sorted_indices, valid_count],
-            lambda ins, outs: _all_class_nms_ir(
-                ins[0],  # boxes
-                ins[1],  # sorted_scores
-                ins[2],  # sorted_indices
-                ins[3],  # valid_count
-                batch_class,
-                num_class,
-                num_boxes,
-                iou_threshold,
-                max_output_size_per_class,
-                outs[0],  # box_indices
-                None,  # scores
-                outs[1],  # num_selected_boxes
-                nms_loop,
-            ),
-            dtype=["int32", "int32"],
-            name="all_class_nms",
-            tag="all_class_nms",
-        )
-        return selected_indices, None, num_detections
-
-    return te.extern(
-        [(batch_class, num_boxes), (batch_class, num_boxes), (1, batch_class)],
-        [boxes, sorted_scores, sorted_indices, valid_count],
-        lambda ins, outs: _all_class_nms_ir(
-            ins[0],  # boxes
-            ins[1],  # sorted_scores
-            ins[2],  # sorted_indices
-            ins[3],  # valid_count
-            batch_class,
-            num_class,
-            num_boxes,
-            iou_threshold,
-            max_output_size_per_class,
-            outs[0],  # box_indices
-            outs[1],  # selected scores
-            outs[2],  # num_selected_boxes
-            nms_loop,
-        ),
-        dtype=["int32", "float32", "int32"],
-        name="all_class_nms",
-        tag="all_class_nms",
-    )
diff --git a/python/tvm/topi/vision/rcnn/__init__.py b/python/tvm/topi/vision/rcnn/__init__.py
deleted file mode 100644
index e5693e869445..000000000000
--- a/python/tvm/topi/vision/rcnn/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Faster R-CNN and Mask R-CNN operators"""
-from .roi_align import *
-from .roi_pool import *
-from .proposal import *
diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py
deleted file mode 100644
index 12a0d6bcf0a0..000000000000
--- a/python/tvm/topi/vision/rcnn/proposal.py
+++ /dev/null
@@ -1,448 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, singleton-comparison, bad-continuation
-"""Proposal operator"""
-import math
-import tvm
-from tvm import te
-from ...utils import get_const_tuple, get_const_int
-from ...sort import argsort
-
-
-def generate_anchor(ratio, scale, base_size):
-    """Generate anchor"""
-    w = h = float(base_size)
-    x_ctr = 0.5 * (w - 1.0)
-    y_ctr = 0.5 * (h - 1.0)
-    size = w * h
-    size_ratios = math.floor(size / ratio)
-    new_w = math.floor(math.sqrt(size_ratios) + 0.5) * scale
-    new_h = math.floor((new_w / scale * ratio) + 0.5) * scale
-    return (
-        x_ctr - 0.5 * (new_w - 1.0),
-        y_ctr - 0.5 * (new_h - 1.0),
-        x_ctr + 0.5 * (new_w - 1.0),
-        y_ctr + 0.5 * (new_h - 1.0),
-    )
-
-
-def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh):
-    """Bounding box regression function"""
-    bbox_w = x2 - x1 + 1.0
-    bbox_h = y2 - y1 + 1.0
-    ctr_x = x1 + 0.5 * (bbox_w - 1.0)
-    ctr_y = y1 + 0.5 * (bbox_h - 1.0)
-
-    pred_ctr_x = dx * bbox_w + ctr_x
-    pred_ctr_y = dy * bbox_h + ctr_y
-    pred_w = te.exp(dw) * bbox_w
-    pred_h = te.exp(dh) * bbox_h
-
-    pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0)
-    pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0)
-    pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0)
-    pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0)
-    return pred_x1, pred_y1, pred_x2, pred_y2
-
-
-def reg_iou(x1, y1, x2, y2, dx1, dy1, dx2, dy2):
-    """Bounding box regression function"""
-    pred_x1 = x1 + dx1
-    pred_y1 = y1 + dy1
-    pred_x2 = x2 + dx2
-    pred_y2 = y2 + dy2
-    return pred_x1, pred_y1, pred_x2, pred_y2
-
-
-def predict_bbox_ir(
-    cls_prob_buf,
-    bbox_pred_buf,
-    im_info_buf,
-    out_buf,
-    scales,
-    ratios,
-    feature_stride,
-    rpn_min_size,
-    iou_loss,
-):
-    """Predict bounding boxes based on anchors, scores and deltas.
-
-    Parameters
-    ----------
-    cls_prob_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred_buf : tvm.te.schedule.Buffer
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, 3]
-
-    out_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]
-        The last dimension is in format of [w_start, h_start, w_end, h_end, score]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape)
-    num_anchors //= 2
-    ib = tvm.tir.ir_builder.create()
-
-    p_score = ib.buffer_ptr(cls_prob_buf)
-    p_delta = ib.buffer_ptr(bbox_pred_buf)
-    p_im_info = ib.buffer_ptr(im_info_buf)
-    p_out = ib.buffer_ptr(out_buf)
-
-    idxm = tvm.tir.indexmod
-    idxd = tvm.tir.indexdiv
-
-    with ib.for_range(0, batch * height * width) as tid:
-        w = idxm(tid, width)
-        h = idxm(idxd(tid, width), height)
-        b = idxd(idxd(tid, width), height)
-
-        for k in range(num_anchors):
-            out_index = tid * num_anchors + k
-            ratio = ratios[k // len(scales)]
-            scale = scales[k % len(scales)]
-            anchor = generate_anchor(ratio, scale, feature_stride)
-            im_height = p_im_info[b * 3]
-            im_width = p_im_info[b * 3 + 1]
-            x1 = anchor[0] + w * feature_stride
-            y1 = anchor[1] + h * feature_stride
-            x2 = anchor[2] + w * feature_stride
-            y2 = anchor[3] + h * feature_stride
-
-            delta = [
-                p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)]
-                for i in range(4)
-            ]
-            regression_func = reg_iou if iou_loss else reg_bbox
-            pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta)
-
-            pred_x1 = tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0)
-            pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0)
-            pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0)
-            pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0)
-
-            real_height = (im_height / feature_stride).astype("int32")
-            real_width = (im_width / feature_stride).astype("int32")
-
-            bbox_w = pred_x2 - pred_x1 + 1.0
-            bbox_h = pred_y2 - pred_y1 + 1.0
-            min_size = p_im_info[b * 3 + 2] * rpn_min_size
-
-            pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w]
-            pred_score = tvm.tir.Select(
-                tvm.tir.any(h >= real_height, w >= real_width), -1.0, pred_score
-            )
-            p_out[out_index * 5 + 0] = pred_x1
-            p_out[out_index * 5 + 1] = pred_y1
-            p_out[out_index * 5 + 2] = pred_x2
-            p_out[out_index * 5 + 3] = pred_y2
-            p_out[out_index * 5 + 4] = pred_score
-
-            with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)):
-                p_out[out_index * 5 + 0] -= min_size / 2.0
-                p_out[out_index * 5 + 1] -= min_size / 2.0
-                p_out[out_index * 5 + 2] += min_size / 2.0
-                p_out[out_index * 5 + 3] += min_size / 2.0
-                p_out[out_index * 5 + 4] = -1.0
-
-    return ib.get()
-
-
-def argsort_ir(data_buf, out_index_buf):
-    """Batched odd-even transposition sort.
-
-    Parameters
-    ----------
-    data_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]
-
-    out_index_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Indices of data in sorted order.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox = get_const_tuple(data_buf.shape)
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(data_buf)
-    index_out = ib.buffer_ptr(out_index_buf)
-    temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local")
-    temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local")
-    idxm = tvm.tir.indexmod
-    with ib.for_range(0, batch, kind="unroll") as b:
-        start = b * num_bbox
-        for i in range(2):
-            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
-                bbox_id = tid * 2 + i
-                with ib.if_scope(bbox_id < num_bbox):
-                    index_out[start + bbox_id] = bbox_id
-        with ib.for_range(0, num_bbox) as k:
-            with ib.for_range(0, (num_bbox + 1) // 2) as tid:
-                offset = start + 2 * tid + idxm(k, 2)
-                with ib.if_scope(
-                    tvm.tir.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])
-                ):
-                    temp_data[0] = p_data[offset]
-                    p_data[offset] = p_data[offset + 1]
-                    p_data[offset + 1] = temp_data[0]
-                    temp_index[0] = index_out[offset]
-                    index_out[offset] = index_out[offset + 1]
-                    index_out[offset + 1] = temp_index[0]
-    return ib.get()
-
-
-def nms_ir(sorted_bbox_buf, out_buf, nms_threshold):
-    """Non-maximum suppression.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-
-    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
-        """Calculate overlap of two boxes."""
-        w = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
-            - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx])
-            + 1.0,
-        )
-        h = tvm.te.max(
-            0.0,
-            tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
-            - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])
-            + 1.0,
-        )
-        i = w * h
-        u = (
-            (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0)
-            * (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0)
-            + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0)
-            * (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0)
-            - i
-        )
-        return i / u
-
-    batch, num_bbox = get_const_tuple(out_buf.shape)
-    ib = tvm.tir.ir_builder.create()
-    p_data = ib.buffer_ptr(sorted_bbox_buf)
-    p_out = ib.buffer_ptr(out_buf)
-    with ib.for_range(0, batch, kind="unroll", name="n") as b:
-        base_idx = b * num_bbox
-        for i in range(num_bbox):
-            p_out[base_idx + i] = False
-        with ib.for_range(0, num_bbox - 1) as l:
-            with ib.for_range(0, num_bbox) as i:
-                with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)):
-                    iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5)
-                    with ib.if_scope(iou > nms_threshold):
-                        p_out[base_idx + i] = True
-    return ib.get()
-
-
-def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
-    """Copy output after applying nms to continuous memory.
-
-    Parameters
-    ----------
-    sorted_bbox_buf : tvm.te.schedule.Buffer
-        3-D with shape [batch, num_bbox, 5]. The last dimension is in format of
-        [w_start, h_start, w_end, h_end, score].
-
-    remove_mask_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed.
-
-    out_buf : tvm.te.schedule.Buffer
-        2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-
-    Returns
-    -------
-    stmt : Stmt
-        The result IR statement.
-    """
-    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
-    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch
-    ib = tvm.tir.ir_builder.create()
-    i = ib.allocate("int32", (batch,), "i", scope="local")
-    p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf)
-    p_remove = ib.buffer_ptr(remove_mask_buf)
-    p_out = ib.buffer_ptr(out_buf)
-
-    nkeep = ib.allocate("int32", (batch,), "nkeep", scope="local")
-
-    with ib.for_range(0, batch) as b:
-        nkeep[b] = 0
-        i[b] = 0
-
-    with ib.for_range(0, num_bbox) as j:
-        with ib.for_range(0, batch) as b:
-            with ib.if_scope(p_remove[b * num_bbox + j] == False):
-                nkeep[b] += 1
-    with ib.for_range(0, batch) as b:
-        with ib.if_scope(nkeep[b] > 0):
-            with ib.for_range(
-                0, te.ceil(tvm.tir.const(rpn_post_nms_top_n, "float32") / nkeep[b]).astype("int32")
-            ):
-                with ib.for_range(0, num_bbox) as j:
-                    offset_j = (b * num_bbox + j) * 5
-                    offset_i = (b * rpn_post_nms_top_n + i[b]) * 5
-                    with ib.if_scope(
-                        tvm.tir.all(
-                            i[b] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False
-                        )
-                    ):
-                        p_out[offset_i] = tvm.tir.Cast("float32", b)
-                        with ib.for_range(0, 4, kind="unroll") as k:
-                            p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k]
-                        i[b] = i[b] + 1
-
-    body = ib.get()
-    return body
-
-
-def proposal(
-    cls_prob,
-    bbox_pred,
-    im_info,
-    scales,
-    ratios,
-    feature_stride,
-    threshold,
-    rpn_pre_nms_top_n,
-    rpn_post_nms_top_n,
-    rpn_min_size,
-    iou_loss,
-):
-    """Proposal operator.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        4-D with shape [batch, 2 * num_anchors, height, width]
-
-    bbox_pred : tvm.te.Tensor
-        4-D with shape [batch, 4 * num_anchors, height, width]
-
-    im_info : tvm.te.Tensor
-        2-D with shape [batch, 3]
-
-    scales : list/tuple of float
-        Scales of anchor windows.
-
-    ratios : list/tuple of float
-        Ratios of anchor windows.
-
-    feature_stride : int
-        The size of the receptive field each unit in the convolution layer of the rpn, for example
-        the product of all stride's prior to this layer.
-
-    threshold : float
-        Non-maximum suppression threshold.
-
-    rpn_pre_nms_top_n : int
-        Number of top scoring boxes to apply NMS. -1 to use all boxes.
-
-    rpn_post_nms_top_n : int
-        Number of top scoring boxes to keep after applying NMS to RPN proposals.
-
-    rpn_min_size : int
-        Minimum height or width in proposal.
-
-    iou_loss : bool
-        Usage of IoU loss.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
-        [batch_index, w_start, h_start, w_end, h_end].
-    """
-    # pylint: disable=unused-argument
-    batch, _, height, width = get_const_tuple(cls_prob.shape)
-    num_anchors = len(scales) * len(ratios)
-    num_bbox = height * width * num_anchors
-    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox
-
-    bbox = te.extern(
-        (batch, num_bbox, 5),
-        [cls_prob, bbox_pred, im_info],
-        lambda ins, outs: predict_bbox_ir(
-            ins[0], ins[1], ins[2], outs[0], scales, ratios, feature_stride, rpn_min_size, iou_loss
-        ),
-        dtype=bbox_pred.dtype,
-    )
-    score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag="bbox_score")
-    valid_count_shape = (1,)
-    valid_count = te.compute(valid_count_shape, lambda i: num_bbox)
-    sorted_index = argsort(score, valid_count=valid_count, axis=1, is_ascend=False)
-    sorted_bbox = te.compute(
-        (batch, rpn_pre_nms_top_n, 5),
-        lambda b, i, j: bbox[b, sorted_index[b, i], j],
-        tag="sorted_bbox",
-    )
-    nms_remove_mask = te.extern(
-        (batch, rpn_pre_nms_top_n),
-        [sorted_bbox],
-        lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
-        dtype="bool",
-    )
-    nms_out = te.extern(
-        (batch * rpn_post_nms_top_n, 5),
-        [sorted_bbox, nms_remove_mask],
-        lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
-        dtype=sorted_bbox.dtype,
-    )
-    return nms_out
diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py
deleted file mode 100644
index 238e02964356..000000000000
--- a/python/tvm/topi/vision/rcnn/roi_align.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Roi align operator"""
-import tvm
-from tvm import te
-from ...utils import get_const_tuple
-from ...cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc
-
-
-def _sample_common(
-    i,
-    c,
-    ph,
-    pw,
-    rois,
-    pooled_size_h,
-    pooled_size_w,
-    spatial_scale,
-    sample_ratio,
-    dtype,
-    avg_mode,
-    bilinear_func,
-):
-    roi = rois[i]
-    batch_index = roi[0].astype("int32")
-    roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
-    roi_start_h *= spatial_scale
-    roi_end_h *= spatial_scale
-    roi_start_w *= spatial_scale
-    roi_end_w *= spatial_scale
-
-    # force malformed ROIs to be 1x1
-    roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
-    roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))
-
-    bin_h = roi_h / pooled_size_h
-    bin_w = roi_w / pooled_size_w
-
-    if sample_ratio > 0:
-        roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
-    else:
-        roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
-        roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")
-
-    count = roi_bin_grid_h * roi_bin_grid_w
-    rh = te.reduce_axis((0, roi_bin_grid_h), name="rh")
-    rw = te.reduce_axis((0, roi_bin_grid_w), name="rw")
-    roi_start_h += ph * bin_h
-    roi_start_w += pw * bin_w
-
-    if avg_mode:
-        return te.sum(
-            bilinear_func(
-                batch_index,
-                c,
-                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-            )
-            / count,
-            axis=[rh, rw],
-        )
-    # max mode
-    return te.max(
-        bilinear_func(
-            batch_index,
-            c,
-            roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
-            roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
-        ),
-        axis=[rh, rw],
-    )
-
-
-def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
-    """ROI align operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    mode : int or str
-        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
-        for the max mode, you can pass b'max' or 1.
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    avg_mode = mode in (b"avg", 0)
-    max_mode = mode in (b"max", 1)
-    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
-    dtype = rois.dtype
-    _, channel, height, width = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _bilinear(i, c, y, x):
-        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
-        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
-        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
-        val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
-        return tvm.tir.if_then_else(outside, 0.0, val)
-
-    def _sample(i, c, ph, pw):
-        return _sample_common(
-            i,
-            c,
-            ph,
-            pw,
-            rois,
-            pooled_size_h,
-            pooled_size_w,
-            spatial_scale,
-            sample_ratio,
-            dtype,
-            avg_mode,
-            _bilinear,
-        )
-
-    return te.compute(
-        (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw"
-    )
-
-
-def roi_align_nhwc(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1):
-    """ROI align operator in NHWC layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, height, width, channel]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    mode : int or str
-        There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and
-        for the max mode, you can pass b'max' or 1.
-
-    sample_ratio : int
-        Optional sampling ratio of ROI align, using adaptive size by default.
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, pooled_size, pooled_size, channel]
-    """
-    avg_mode = mode in (b"avg", 0)
-    max_mode = mode in (b"max", 1)
-    assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode."
-    dtype = rois.dtype
-    _, height, width, channel = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _bilinear(i, c, y, x):
-        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
-        y = tvm.te.min(tvm.te.max(y, 0.0), height - 1)
-        x = tvm.te.min(tvm.te.max(x, 0.0), width - 1)
-        val = bilinear_sample_nhwc(data, (i, y, x, c), height - 1, width - 1)
-        return tvm.tir.if_then_else(outside, 0.0, val)
-
-    def _sample(i, ph, pw, c):
-        return _sample_common(
-            i,
-            c,
-            ph,
-            pw,
-            rois,
-            pooled_size_h,
-            pooled_size_w,
-            spatial_scale,
-            sample_ratio,
-            dtype,
-            avg_mode,
-            _bilinear,
-        )
-
-    return te.compute(
-        (num_roi, pooled_size_h, pooled_size_w, channel), _sample, tag="pool,roi_align_nchw"
-    )
diff --git a/python/tvm/topi/vision/rcnn/roi_pool.py b/python/tvm/topi/vision/rcnn/roi_pool.py
deleted file mode 100644
index dd1429bcb3c5..000000000000
--- a/python/tvm/topi/vision/rcnn/roi_pool.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""ROI pool operator"""
-import tvm
-from tvm import te
-from ...utils import get_const_tuple
-
-
-def roi_pool_nchw(data, rois, pooled_size, spatial_scale):
-    """ROI pool operator in NCHW layout.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, channel, height, width]
-
-    rois : tvm.te.Tensor
-        2-D with shape [num_roi, 5]. The last dimension should be in format of
-        [batch_index, w_start, h_start, w_end, h_end]
-
-    pooled_size : int or list/tuple of two ints
-        output size, or [out_height, out_width]
-
-    spatial_scale : float
-        Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal
-        of total stride in convolutional layers, which should be in range (0.0, 1.0]
-
-    Returns
-    -------
-    output : tvm.te.Tensor
-        4-D with shape [num_roi, channel, pooled_size, pooled_size]
-    """
-    dtype = rois.dtype
-    _, channel, height, width = get_const_tuple(data.shape)
-    num_roi, _ = get_const_tuple(rois.shape)
-
-    if isinstance(pooled_size, int):
-        pooled_size_h = pooled_size_w = pooled_size
-    else:
-        pooled_size_h, pooled_size_w = pooled_size
-
-    def _pool(i, c, ph, pw):
-        roi = rois[i]
-        batch_index = roi[0].astype("int32")
-        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
-
-        roi_start_h = te.round(roi_start_h * spatial_scale).astype("int32")
-        roi_start_w = te.round(roi_start_w * spatial_scale).astype("int32")
-        roi_end_h = te.round(roi_end_h * spatial_scale).astype("int32")
-        roi_end_w = te.round(roi_end_w * spatial_scale).astype("int32")
-
-        # force malformed ROIs to be 1x1
-        roi_h = tvm.te.max(roi_end_h - roi_start_h + 1, tvm.tir.const(1, "int32"))
-        roi_w = tvm.te.max(roi_end_w - roi_start_w + 1, tvm.tir.const(1, "int32"))
-
-        bin_h = roi_h.astype(dtype) / pooled_size_h
-        bin_w = roi_w.astype(dtype) / pooled_size_w
-
-        # use epsilon to prevent floating point precision loss in floor/ceil
-        epsilon = tvm.tir.const(0.00001, dtype)
-        hstart = te.floor(ph * bin_h + epsilon).astype("int32")
-        wstart = te.floor(pw * bin_w + epsilon).astype("int32")
-        hend = te.ceil((ph + 1) * bin_h - epsilon).astype("int32")
-        wend = te.ceil((pw + 1) * bin_w - epsilon).astype("int32")
-        hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height)
-        wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width)
-        hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height)
-        wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width)
-
-        non_empty = tvm.tir.all(hstart < hend, wstart < wend)
-        min_value = lambda dtype: tvm.tir.if_then_else(
-            non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype)
-        )
-        # pylint: disable=unnecessary-lambda
-        _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y), min_value, name="max")
-        rh = te.reduce_axis((0, hend - hstart), "rh")
-        rw = te.reduce_axis((0, wend - wstart), "rw")
-        return _max(data[batch_index, c, hstart + rh, wstart + rw], axis=[rh, rw])
-
-    return te.compute((num_roi, channel, pooled_size_h, pooled_size_w), _pool, tag="pool,roi_pool")
diff --git a/python/tvm/topi/vision/reorg.py b/python/tvm/topi/vision/reorg.py
deleted file mode 100644
index 9883085f9f40..000000000000
--- a/python/tvm/topi/vision/reorg.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-REORG Operator
-====================
-Reorg operator, used in darknet.
-"""
-from __future__ import absolute_import as _abs
-from .. import cpp
-
-
-def reorg(data, stride):
-    """Reorg forward operators.
-
-    Parameters
-    ----------
-    Input : tvm.te.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    stride : int
-        Stride value for reorganization
-
-    Returns
-    -------
-    Output : tvm.te.Tensor
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-    return cpp.vision.reorg(data, stride)
diff --git a/python/tvm/topi/vision/ssd/__init__.py b/python/tvm/topi/vision/ssd/__init__.py
deleted file mode 100644
index 1ac388da9a1e..000000000000
--- a/python/tvm/topi/vision/ssd/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""VISION network operators"""
-from __future__ import absolute_import as _abs
-
-from .multibox import *
diff --git a/python/tvm/topi/vision/ssd/multibox.py b/python/tvm/topi/vision/ssd/multibox.py
deleted file mode 100644
index 234bfd795328..000000000000
--- a/python/tvm/topi/vision/ssd/multibox.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable
-"""SSD multibox operators"""
-import tvm
-
-from tvm.te import hybrid
-from tvm.tir import exp, sqrt
-
-from tvm import topi
-
-from ..nms import non_max_suppression
-
-
-@hybrid.script
-def hybrid_multibox_prior(data, sizes, ratios, steps, offsets):
-    """Hybrid routing for multibox_prior operator.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor or numpy NDArray
-        4-D tensor with shape [batch, channel, height, width]]
-
-    sizes : tvm ConsExpr
-        Sizes for anchor boxes.
-
-    ratios : tvm ConsExpr
-        Ratios for anchor boxes.
-
-    steps : tvm ConsExpr
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tvm ConsExpr
-        Priorbox center offsets, y and x respectively.
-
-    Returns
-    -------
-    output : tvm.te.Tensor or numpy NDArray
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    in_height = data.shape[2]
-    in_width = data.shape[3]
-    num_sizes = len(sizes)
-    num_ratios = len(ratios)
-    num_boxes = in_height * in_width * (num_sizes + num_ratios - 1)
-    output = output_tensor((1, num_boxes, 4), "float32")
-    steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height
-    steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width
-    offset_h = offsets[0]
-    offset_w = offsets[1]
-
-    # Need to define var out of const_range + if
-    w = 0.0
-    h = 0.0
-
-    for i in parallel(in_height):
-        center_h = (i + offset_h) * steps_h
-        for j in range(in_width):
-            center_w = (j + offset_w) * steps_w
-            for k in const_range(num_sizes + num_ratios - 1):
-                if k < num_sizes:
-                    w = float32(sizes[k] * in_height) / in_width / 2.0
-                    h = sizes[k] / 2.0
-                else:
-                    w = (
-                        float32(sizes[0] * in_height)
-                        / in_width
-                        * sqrt(ratios[k - num_sizes + 1] * 1.0)
-                        / 2.0
-                    )
-                    h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0
-                count = (
-                    i * in_width * (num_sizes + num_ratios - 1)
-                    + j * (num_sizes + num_ratios - 1)
-                    + k
-                )
-                output[0, count, 0] = center_w - w
-                output[0, count, 1] = center_h - h
-                output[0, count, 2] = center_w + w
-                output[0, count, 3] = center_h + h
-
-    return output
-
-
-def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False):
-    """Generate prior(anchor) boxes from data, sizes and ratios.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]]
-
-    sizes : tuple of float
-        Tuple of sizes for anchor boxes.
-
-    ratios : tuple of float
-        Tuple of ratios for anchor boxes.
-
-    steps : Tuple of float
-        Priorbox step across y and x, -1 for auto calculation.
-
-    offsets : tuple of int
-        Priorbox center offsets, y and x respectively.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
-    """
-    out = hybrid_multibox_prior(
-        data,
-        tvm.runtime.convert(sizes),
-        tvm.runtime.convert(ratios),
-        tvm.runtime.convert(steps),
-        tvm.runtime.convert(offsets),
-    )
-    if clip:
-        out = topi.clip(out, 0, 1)
-    return out
-
-
-@hybrid.script
-def _hybrid_transform_loc(anchor, pred_loc, variance, clip, batch_idx, anchor_idx):
-    """Transform prior anchor box to output box through location predictions."""
-    al = anchor[0, anchor_idx, 0]
-    at = anchor[0, anchor_idx, 1]
-    ar = anchor[0, anchor_idx, 2]
-    ab = anchor[0, anchor_idx, 3]
-
-    px = pred_loc[batch_idx, 0]
-    py = pred_loc[batch_idx, 1]
-    pw = pred_loc[batch_idx, 2]
-    ph = pred_loc[batch_idx, 3]
-
-    vx = variance[0]
-    vy = variance[1]
-    vw = variance[2]
-    vh = variance[3]
-
-    output = output_tensor((4,), pred_loc.dtype)
-
-    aw = ar - al
-    ah = ab - at
-    ax = (al + ar) / 2.0
-    ay = (at + ab) / 2.0
-    ox = px * vx * aw + ax
-    oy = py * vy * ah + ay
-    ow = exp(pw * vw) * aw / 2.0
-    oh = exp(ph * vh) * ah / 2.0
-    output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow
-    output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh
-    output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow
-    output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh
-    return output
-
-
-@hybrid.script
-def hybrid_multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip,
-    threshold,
-    variances,
-    keep_background,
-):
-    """Hybrid routing for transform location in multibox_detection operator.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor or numpy NDArray
-        3-D tensor of class probabilities.
-
-    loc_pred : tvm.te.Tensor or numpy NDArray
-        2-D tensor of location regression predictions.
-
-    anchor : tvm.te.Tensor or numpy NDArray
-        3-D tensor of prior anchor boxes.
-
-    clip : tvm.tir.const
-        Whether to clip out-of-boundary boxes.
-
-    threshold : tvm.tir.const
-        Threshold to be a positive prediction.
-
-    variances : tvm.nd.NDArray
-        Variances to be decoded from box regression output.
-
-    keep_background : tvm.tir.const
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    out_loc : tvm.te.Tensor or numpy NDArray
-        3-D tensor of transformed location.
-
-    valid_count : tvm.te.Tensor or numpy NDArray
-        1_d tensor of valid counts for boxes.
-    """
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
-    pred_coord = allocate(
-        (
-            batch_size,
-            4,
-        ),
-        loc_pred.dtype,
-    )
-    out_loc = output_tensor((batch_size, num_anchors, 6), loc_pred.dtype)
-    valid_count = output_tensor((batch_size,), "int32")
-
-    start_cls_idx = 0 if keep_background else 1
-
-    for i in parallel(batch_size):
-        valid_count[i] = 0
-        for j in range(num_anchors):
-            # Find the predicted class id and probability
-            score = -1.0
-            cls_id = 0
-            for k in range(start_cls_idx, num_classes):
-                temp = cls_prob[i, k, j]
-                cls_id = k if temp > score else cls_id
-                score = max(temp, score)
-            if cls_id > 0 and score < threshold:
-                cls_id = 0
-            # [id, prob, xmin, ymin, xmax, ymax]
-            # Remove background if 'keep_background=False', restore original id
-            if keep_background or cls_id > 0:
-                out_loc[i, valid_count[i], 0] = cls_id - 0.0 if keep_background else cls_id - 1.0
-                out_loc[i, valid_count[i], 1] = score
-                for l in range(4):
-                    pred_coord[i, l] = loc_pred[i, j * 4 + l]
-                out_coord = _hybrid_transform_loc(anchor, pred_coord, variances, clip, i, j)
-                out_loc[i, valid_count[i], 2] = out_coord[0]
-                out_loc[i, valid_count[i], 3] = out_coord[1]
-                out_loc[i, valid_count[i], 4] = out_coord[2]
-                out_loc[i, valid_count[i], 5] = out_coord[3]
-                valid_count[i] += 1
-
-    return out_loc, valid_count
-
-
-def multibox_transform_loc(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    keep_background=False,
-):
-    """Location transformation for multibox detection
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    keep_background : boolean
-        Whether to keep boxes detected as background or not.
-
-    Returns
-    -------
-    ret : tuple of tvm.te.Tensor
-    """
-
-    return hybrid_multibox_transform_loc(
-        cls_prob,
-        loc_pred,
-        anchor,
-        tvm.tir.const(clip, "bool"),
-        tvm.tir.const(threshold, "float32"),
-        tvm.runtime.convert(variances),
-        tvm.tir.const(keep_background, "bool"),
-    )
-
-
-def multibox_detection(
-    cls_prob,
-    loc_pred,
-    anchor,
-    clip=True,
-    threshold=0.01,
-    nms_threshold=0.5,
-    force_suppress=False,
-    variances=(0.1, 0.1, 0.2, 0.2),
-    nms_topk=-1,
-):
-    """Convert multibox detection predictions.
-
-    Parameters
-    ----------
-    cls_prob : tvm.te.Tensor
-        Class probabilities.
-
-    loc_pred : tvm.te.Tensor
-        Location regression predictions.
-
-    anchor : tvm.te.Tensor
-        Prior anchor boxes.
-
-    clip : boolean
-        Whether to clip out-of-boundary boxes.
-
-    nms_threshold : float
-        Non-maximum suppression threshold.
-
-    force_suppress : boolean
-        Whether to suppress all detections regardless of class_id.
-
-    threshold : float
-        Threshold to be a positive prediction.
-
-    variances : tuple of float
-        Variances to be decoded from box regression output.
-
-    nms_topk : int
-        Keep maximum top k detections before nms, -1 for no limit.
-
-    Returns
-    -------
-    out : tvm.te.Tensor
-        3-D tensor with shape (batch_size, num_anchors, 6)
-    """
-    inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances)
-    out = non_max_suppression(
-        inter_out[0],
-        inter_out[1],
-        inter_out[1],
-        max_output_size=-1,
-        iou_threshold=nms_threshold,
-        force_suppress=force_suppress,
-        top_k=nms_topk,
-        return_indices=False,
-    )
-    return out
diff --git a/python/tvm/utils/__init__.py b/python/tvm/utils/__init__.py
deleted file mode 100644
index 33abc352b0f0..000000000000
--- a/python/tvm/utils/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities operating at a graph/model or other "high" level"""
-
-from .roofline import roofline_analysis
diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py
deleted file mode 100644
index 45cc880c5b85..000000000000
--- a/python/tvm/utils/roofline/__init__.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for computing an approximate roofline model"""
-from typing import Dict, Optional, Union
-
-import numpy as np
-
-from ... import IRModule, auto_scheduler, build, get_global_func, nd, relay, tir, topi, transform
-from ...contrib import utils
-from ...ir.expr import GlobalVar
-from ...ir.instrument import pass_instrument
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import Device, num_threads, profiler_vm, profiling
-from ...script import tir as T
-from ...target import Target
-from . import cuda, registry, x86
-
-
-def _create_args(mod: IRModule, dev: Device, func_name: str = "main", remote=None):
-    if dev.device_type >= RPC_SESS_MASK:
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-    args = []
-    for arg in mod[func_name].params:
-        ary = nd.empty(
-            [x.value for x in arg.type_annotation.shape],
-            arg.type_annotation.dtype,
-            device=dev,
-        )
-        random_fill(ary)
-        args.append(ary)
-    return args
-
-
-@pass_instrument
-class SaveLoweredTIR:
-    """Save TIR functions for analysis.
-
-    We need the TIR function in a form that can be handled by
-    `auto_scheduler.feature.named_features_from_primfunc`, but which
-    is the closest to the final lowered form as possible.  Right now this
-    means right before tir.SplitHostDevice.
-
-    """
-
-    def __init__(self, before_pass: str = "tir.SplitHostDevice"):
-        """
-        Parameters
-        ----------
-        before_pass: str
-            Pass before which the TIR is saved.
-        """
-        self.functions = {}
-        self.before_pass = before_pass
-
-    def run_before_pass(self, mod, info):
-        if info.name == self.before_pass:
-            for v, func in mod.functions.items():
-                if isinstance(func, tir.PrimFunc):
-                    self.functions[v] = func
-
-
-def roofline_from_existing(
-    report: profiling.Report,
-    tir_functions: Dict[GlobalVar, tir.PrimFunc],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> profiling.Report:
-    """Add roofline and other estimated statistics to an existing profiling report.
-
-    :py:func:`roofline_analysis` should always be used instead of this function
-    unless you need a custom compilation pipeline.
-
-    Calculating roofline statistics requires features extracted the TIR
-    functions in addition to per-operator runtime information (`report`) of the
-    same TIR features. The features and TIR functions are not included with the
-    compiled library used to generate the per-operator runtime. It is essential
-    that the per-operator information comes from the exact same compilation
-    pipeline as the TIR functions.
-
-
-    Example
-    -------
-
-    ..code: : python
-
-        import tvm
-        import tvm.relay
-
-        mod, params = tvm.relay.testing.mlp.get_workload()
-
-        # it is recommended to use SaveLoweredTIR to get out the tir primfuncs
-        save_tir = tvm.utils.roofline.SaveLoweredTIR()
-        with tvm.transform.PassContext(opt_level=3, pass_instrument=[save_tir]):
-            lib = relay.vm.compile(mod, params=params, target=target)
-
-        vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
-        report = vmexec.profile(*inputs)
-
-        roofline_report = roofline_from_existing(report, save_tir.functions, target, dev)
-
-
-    Parameters
-    ----------
-    report : Report
-        Existing profiling report from :py:method:`VirtualMachineProfiler.profile`.
-    tir_functions : Dict[GlobalVar, PrimFunc]
-        TIR primfuncs from the module run to generate `report`. It is nessesary
-        that these functions come before the `tir.MakePackedAPI` pass and are
-        compatible with auto_scheduler featurization.
-        :py:class:`SaveLoweredTIR` is the recommended way to collect these
-        functions.
-    target : Target
-        TVM target that `report` was generated with.
-    dev : Device
-        Device that `report` was generated with.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    profiling.Report
-        New profiling report that includes all information from `report`
-        along with additional roofline metrics. See
-        :py:func:`roofline_analysis` for more information on which metrics
-        are included.
-    """
-
-    all_features = {
-        prim.attrs["hash"]: (name, prim, auto_scheduler.feature.named_features_from_primfunc(prim))
-        for name, prim in tir_functions.items()
-        if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys()
-    }
-
-    new_configuration = dict(report.configuration.items())
-    new_calls = []
-    for call in report.calls:
-        if "Hash" in call.keys() and call["Hash"] in all_features:
-            _, prim, features = all_features[call["Hash"]]
-            if features is None:
-                continue
-
-            with target:
-                flops, peak_flops, flops_name = registry.estimate_peak_flops(
-                    prim, features, target, dev, remote
-                )
-                loaded_bytes, peak_bandwidth, bandwidth_name = registry.estimate_peak_bandwidth(
-                    prim, features, target, dev, remote
-                )
-            new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops)
-            new_configuration[
-                f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)"
-            ] = profiling.Ratio(peak_bandwidth)
-            ridge_point = peak_flops / peak_bandwidth
-
-            runtime = call["Duration (us)"].microseconds * 1e-6
-            arith_inten = flops / loaded_bytes
-            call = dict(call)
-            call["Loaded Bytes"] = profiling.Count(int(loaded_bytes))
-            call["Estimated FLOPs"] = profiling.Count(int(flops))
-            call["Arithmetic Intensity"] = profiling.Ratio(arith_inten)
-            call["FLOP/s"] = profiling.Ratio(flops / runtime)
-            call["Bandwidth"] = profiling.Ratio(loaded_bytes / runtime)
-            compute_bound = arith_inten > ridge_point
-            call["Bound"] = "compute" if compute_bound else "memory"
-            per_mem_bound = (loaded_bytes / runtime) / peak_bandwidth * 100
-            per_compute_bound = (flops / runtime) / peak_flops * 100.0
-            # We use ratio here because the percentages should be averaged instead of summed.
-            call["Percent of Theoretical Optimal"] = profiling.Ratio(
-                per_compute_bound if compute_bound else per_mem_bound
-            )
-            new_calls.append(call)
-        else:
-            new_calls.append(call)
-    return profiling.Report(new_calls, report.device_metrics, new_configuration)
-
-
-def roofline_analysis(
-    mod: IRModule,
-    params: Dict[str, nd.NDArray],
-    target: Union[str, Target],
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> profiling.Report:
-    """
-    Create a profiling report that contains roofline and other estimated
-    statistics from running a module on the VM.
-
-    The roofline model measures how close a operator gets to best possible
-    memory bandwidth or FLOP/s depending on whether it is memory or compute
-    bound. This computation uses the runtime of the operator along with two
-    numbers extracted from the TIR code: bytes of memory touched and number of
-    floating point operations.
-
-    These statistics are calculated by analyzing the lowered TIR of each
-    operator, so they are estimates of the true values. The statistics are:
-      - Bound: Is the operator memory or compute bound. This is computed by
-        assuming that the operator could perfectly cache all loads -- each byte
-        of memory is only loaded once.
-      - Percent of Theoretical Optimal: What percent of theoretical optimal for
-        the bound. i.e. percent of peak memory bandwidth if memory bound,
-        percent of peak FLOP/s if compute bound.
-      - Loaded Bytes: estimation of the number of bytes loaded from main memory.
-      - Estimated Flops: estimated number of floating point operations.
-      - Arithmetic Intensity: ratio of FLOPs per byte of data.
-      - FLOP/s: floating point operations per second.
-      - Bandwidth: Number of bytes loaded per second.
-
-    Parameters
-    ----------
-    mod : IRModule
-      Uncompiled input module
-
-    params : Dict[str, nd.NDArray]
-
-    target : Union[str, Target]
-      Target to run on.
-
-    dev : Device
-      Device to run on.
-
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-
-    report : profiling.Report
-      Profiling report which includes the estimated statistics.
-    """
-    if isinstance(target, str):
-        target = Target(target)
-
-    save_tir = SaveLoweredTIR()
-    # copy existing context but add our instrument
-    pass_ctx = transform.PassContext.current()
-    with transform.PassContext(
-        opt_level=pass_ctx.opt_level,
-        required_pass=pass_ctx.required_pass,
-        disabled_pass=pass_ctx.disabled_pass,
-        instruments=list(pass_ctx.instruments) + [save_tir],
-        config=pass_ctx.config,
-    ):
-        lib = relay.vm.compile(mod, params=params, target=target)
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("roofline_lib.tar")
-        lib.mod.export_library(path)
-        remote.upload(path)
-        lib = remote.load_module("roofline_lib.tar")
-    vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
-
-    args = _create_args(mod, dev, remote=remote)
-    report = vmexec.profile(*args)
-
-    return roofline_from_existing(report, save_tir.functions, target, dev, remote=remote)
diff --git a/python/tvm/utils/roofline/cuda.py b/python/tvm/utils/roofline/cuda.py
deleted file mode 100644
index b83a902b7fda..000000000000
--- a/python/tvm/utils/roofline/cuda.py
+++ /dev/null
@@ -1,407 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Estimation of peak flops and memory bandwidth for cuda devices"""
-import functools
-import re
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ... import build, nd, transform
-from ...contrib import nvcc, utils
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import Device
-from ...script import tir as T
-from ...target import Target
-from ...tir import PrimFunc
-from . import registry
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_flops_tensorcore(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    mat_dtype: str = "float16",
-    acc_dtype: str = "float32",
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device with tensorcores.
-
-    This estimate should only be used to compare with operators that can use
-    dense tensorcore mma instructions.
-
-    References
-    ----------
-    Wei Sun, Ang Li, Tong Geng, Sander Stuijk, Henk Corporaal: "Dissecting
-    Tensor Cores via Microbenchmarks: Latency, Throughput and Numerical
-    Behaviors", 2022; http://arxiv.org/abs/2206.02874
-    https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf
-
-    Parameters
-    ----------
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    mat_dtype : str
-        Dtype of matrices passed to mma instructions.
-    acc_dtype : str
-        Dtype of accumulator to use with mma instructions. Should be compatible
-        with `mat_dtype`.
-
-    Returns
-    -------
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        mma instructions. Addition and multiplications are each counted as
-        separate FLOPs.
-    """
-
-    @T.prim_func
-    def peak_flops_tensorcore_tir(
-        inp: T.Buffer((16, 16), mat_dtype),
-        out: T.Buffer((16, 16), acc_dtype),
-        n: T.int32,
-        sms: T.int32,
-    ):
-        # pylint: disable=invalid-name, missing-function-docstring
-        A = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_a")
-        B = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_b")
-        C = T.alloc_buffer((16, 16), dtype=acc_dtype, scope="wmma.accumulator")
-        for _ in T.thread_binding(sms, thread="blockIdx.x"):
-            for _ in T.thread_binding(
-                8, thread="threadIdx.y"
-            ):  # need 8 warps to get enough in-SM parallelism
-                for _ in T.thread_binding(32, thread="threadIdx.x"):
-                    T.evaluate(
-                        T.tvm_load_matrix_sync(
-                            A.data,
-                            16,
-                            16,
-                            16,
-                            0,
-                            T.tvm_access_ptr(
-                                T.type_annotation(dtype=mat_dtype),
-                                inp.data,
-                                0,
-                                16,
-                                1,
-                                dtype="handle",
-                            ),
-                            16,
-                            "row_major",
-                            dtype="handle",
-                        )
-                    )
-                    T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, 0, dtype="handle"))
-                    T.evaluate(T.tvm_fill_fragment(C.data, 16, 16, 16, 0, 0, dtype="handle"))
-                    for _ in range(n):
-                        T.evaluate(
-                            T.tvm_mma_sync(
-                                C.data, 0, A.data, 0, B.data, 0, C.data, 0, dtype="handle"
-                            )
-                        )
-                    T.evaluate(
-                        T.tvm_store_matrix_sync(
-                            C.data,
-                            16,
-                            16,
-                            16,
-                            0,
-                            T.tvm_access_ptr(
-                                T.type_annotation(dtype=acc_dtype),
-                                out.data,
-                                0,
-                                16,
-                                2,
-                                dtype="handle",
-                            ),
-                            16,
-                            "row_major",
-                            dtype="handle",
-                        )
-                    )
-
-    n = 100000
-    sms = dev.multi_processor_count
-    specialized = peak_flops_tensorcore_tir.specialize(
-        {peak_flops_tensorcore_tir.params[2]: n, peak_flops_tensorcore_tir.params[3]: sms}
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_mma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_mma_flops.tar")
-
-    x = nd.empty((16, 16), dtype=mat_dtype, device=dev)
-    y = nd.empty((16, 16), dtype=acc_dtype, device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y)
-    # each mma operation computes 16 x 16 x 16 FLOPs
-    return n * 16 * 16 * 16 * 2 * sms * 8 / times.min
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_flops_fma(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    dtype: str,
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device with fma operations (not using tensor cores).
-
-    References
-    ----------
-    https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf
-
-    Parameters
-    ----------
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    dtype : str
-        Dtype of fma operation
-
-    Returns
-    -------
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        fma instructions. Addition and multiplications are each counted as
-        separate FLOPs.
-    """
-
-    vec_width = 32
-    warps = 16  # need 16 warps to get enough in-SM parallelism
-    sms = dev.multi_processor_count
-    n = 100000
-
-    @T.prim_func
-    def peak_flops_fma_tir(
-        A: T.Buffer((sms, warps, vec_width), dtype),
-        B: T.Buffer((sms, warps, vec_width), dtype),
-    ):
-        # pylint: disable=invalid-name, missing-function-docstring
-        shared = T.alloc_buffer((sms, warps, vec_width), dtype=dtype, scope="shared")
-        for sm in T.thread_binding(sms, thread="blockIdx.x"):
-            for warp in T.thread_binding(warps, thread="threadIdx.y"):
-                for t in T.thread_binding(vec_width, thread="threadIdx.x"):
-                    shared[sm, warp, t] = A[sm, warp, t]
-                    for _ in range(n):
-                        shared[sm, warp, t] = (
-                            shared[sm, warp, t] * shared[sm, warp, t] + shared[sm, warp, t]
-                        )
-                    B[sm, warp, t] = shared[sm, warp, t]
-
-    with transform.PassContext(opt_level=3):
-        f = build(peak_flops_fma_tir, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_fma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_fma_flops.tar")
-
-    x = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
-    y = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y)
-    return n * warps * sms * vec_width * 2 / times.min
-
-
-@registry.estimate_peak_flops.register("cuda")
-def estimate_peak_flops(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-) -> Tuple[float, float, str]:
-    """Estimate the peak FLOP/s of a cuda device.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo. Addition and
-        multiplications are each counted as separate FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    has_tensorcore = nvcc.have_tensorcore(dev.compute_version)
-    # assume that the first argument dtype is the same as all the others
-    dtype = list(func.buffer_map.values())[0].dtype
-    if dtype == "float16" and has_tensorcore:
-        peak_flops = estimate_peak_flops_tensorcore(target, dev, remote)
-        name = "float16 tensorcore"
-    else:
-        peak_flops = estimate_peak_flops_fma(target, dev, remote, dtype)
-        name = f"{dtype} fma"
-    flops = np.sum(
-        features["float_addsub"]
-        + features["float_mul"]
-        + features["float_mad"] * 2
-        + features["float_divmod"]
-    )
-    return flops, peak_flops, name
-
-
-@T.prim_func
-def peak_bandwidth_tir(a: T.handle, b: T.handle, blocks: T.int32, warp_size: T.int32) -> None:
-    # pylint: disable=invalid-name, missing-function-docstring
-    N = T.int32()
-    A = T.match_buffer(a, [blocks, N, 4, warp_size], "float32")
-    B = T.match_buffer(b, [blocks, 4, warp_size], "float32")
-    for i in T.thread_binding(blocks, "blockIdx.x"):
-        for k in T.serial(N):
-            for l in T.unroll(4):
-                # vectorized load is necessary to hit peak bandwidth
-                for j in T.thread_binding(warp_size, "threadIdx.x"):
-                    # += is necessary to introduce a data dependency for all
-                    # elements of A, preventing the backend from removing the
-                    # `k` loop and setting `k` to the loop extent.
-                    B[i, l, j] += A[i, k, l, j]
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_bandwidth_global_mem(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak bandwidth of global memory. See estimate_peak_bandwidth"""
-    warp_size = dev.warp_size
-    # These sizes seem large enough to give the card time to hit a fixpoint on memory bandwidth
-    blocks = 1024
-    size = 1024
-
-    specialized = peak_bandwidth_tir.specialize(
-        {peak_bandwidth_tir.params[2]: blocks, peak_bandwidth_tir.params[3]: warp_size}
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_bandwidth.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_bandwidth.tar")
-
-    a = nd.empty((blocks, size, 4, warp_size), dtype="float32", device=dev)
-    b = nd.empty((blocks, 4, warp_size), dtype="float32", device=dev)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b)
-    return a.numpy().size * 4 / times.min  # 4 bytes per float32
-
-
-@registry.estimate_peak_bandwidth.register("cuda")
-def estimate_peak_bandwidth(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    # autoscheduler features do not take into account that 1.
-    # global and shared memory have very different performance
-    # characteristics -- both are included in the same bytes
-    # touched count 2. multiple threads accessing the same byte
-    # of memory does not use the same amount of bandwidth as
-    # multiple threads accessing different bytes of memory. We
-    # use unique bytes accessed here to avoid these two issues,
-    # but this does bias results towards being more compute
-    # bound.
-    loaded_bytes = sum(
-        [
-            np.sum(x)
-            for (k, x) in features.items()
-            if re.match(r"^B[0-9]+\.unique_bytes$", k) is not None
-        ]
-    )
-    peak_bandwidth = estimate_peak_bandwidth_global_mem(target, dev, remote)
-    return loaded_bytes, peak_bandwidth, "global"
diff --git a/python/tvm/utils/roofline/registry.py b/python/tvm/utils/roofline/registry.py
deleted file mode 100644
index 9358529b38ec..000000000000
--- a/python/tvm/utils/roofline/registry.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Definition of generic functions for estimating peak flops and bandwidth"""
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ...rpc.client import RPCSession
-from ...runtime import Device
-from ...target import Target, generic_func
-from ...tir import PrimFunc
-
-
-@generic_func
-def estimate_peak_bandwidth(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    raise NotImplementedError()
-
-
-@generic_func
-def estimate_peak_flops(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-) -> Tuple[float, float, str]:
-    """
-    Estimate the maximum number of FLOP/s this target/device combo is capable
-    of reaching by running a test program. This is a generic function that
-    should be overridden for each target.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible to make sure that LLVM generates the best vector code.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    raise NotImplementedError()
diff --git a/python/tvm/utils/roofline/x86.py b/python/tvm/utils/roofline/x86.py
deleted file mode 100644
index 5d2dd27e523b..000000000000
--- a/python/tvm/utils/roofline/x86.py
+++ /dev/null
@@ -1,331 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Estimate peak flops and bandwidth for x86 devices"""
-import functools
-import re
-from typing import Dict, Optional, Tuple
-
-import numpy as np
-
-from ... import build, get_global_func, nd, transform
-from ...contrib import utils
-from ...rpc.base import RPC_SESS_MASK
-from ...rpc.client import RPCSession
-from ...runtime import DataType, Device, num_threads
-from ...script import tir as T
-from ...target import Target, x86
-from ...tir import PrimFunc
-from . import registry
-
-
-def _detect_vec_width_registers(
-    target: Target, vec_width: Optional[int], num_vector_registers: Optional[int]
-):
-    """Get the vector width and number of vector registers for a target.
-
-    Parameters
-    ----------
-    target : Target
-        Target to detect vector width and registers for.
-    vec_width : Optional[int]
-        If None, try and detect vector width from target. Otherwise provided input is used.
-    num_vector_registers : Optional[int]
-        If None, try and number of vector registers from target. Otherwise provided input is used.
-
-    Returns
-    -------
-    vec_width: int
-        Width of a vector register on `target` in bytes.
-    num_vector_registers: int
-        Number of vector registers on `target`.
-    """
-    if vec_width is None:
-        # Only implemented for x86 so far...
-        if (
-            str(target.kind) == "llvm"
-            and target.device_name == ""
-            and len(target.keys) == 1
-            and target.keys[0] == "cpu"
-        ):
-            with target:
-                vec_width = x86.get_simd_32bit_lanes() * 4  # in number of bytes
-        else:
-            raise RuntimeError(f"Cannot determine vector width for target {target}")
-    if num_vector_registers is None:
-        if target.device_name == "":  # indicates x86
-            num_vector_registers = 16  # Assuming for all platforms, probably wrong on older ones
-        else:
-            raise RuntimeError(f"Cannot determine number of vector registers for target {target}")
-    return vec_width, num_vector_registers
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_fma_vector_flops(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    dtype: DataType,
-    vec_width: Optional[int] = None,
-    num_vector_registers: Optional[int] = None,
-):
-    """Estimate peak flops assuming vector fma instructions and no explicit
-    intrinsics. See estimate_peak_fma_flops.
-    """
-
-    @T.prim_func
-    def peakflops_fma_tir(
-        a: T.handle,
-        vec_width: T.int32,
-        iters: T.int32,
-        num_vector_registers: T.int32,
-        threads: T.int32,
-    ) -> None:
-        # pylint: disable=invalid-name, missing-function-docstring
-        A = T.match_buffer(a, [threads, num_vector_registers, vec_width], dtype)
-        for t in T.parallel(threads):
-            for _j in range(iters):
-                for l in T.unroll(num_vector_registers):
-                    # We want to use as few registers as possible, so we perform
-                    # all operations on the same element
-                    for k in T.vectorized(vec_width):
-                        A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k]
-
-    vec_width, num_vector_registers = _detect_vec_width_registers(
-        target, vec_width, num_vector_registers
-    )
-    vec_width //= DataType(dtype).bits // 8
-    iters = 1000000
-    nthreads = num_threads()
-    specialized = peakflops_fma_tir.specialize(
-        {
-            peakflops_fma_tir.params[1]: vec_width,
-            peakflops_fma_tir.params[2]: iters,
-            peakflops_fma_tir.params[3]: num_vector_registers,
-            peakflops_fma_tir.params[4]: nthreads,
-        }
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_fma_flops.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_fma_flops.tar")
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-
-    a = nd.empty((nthreads, num_vector_registers, vec_width), dtype=dtype, device=dev)
-    random_fill(a)
-    times = f.time_evaluator(f.entry_name, dev, repeat=100, number=1)(a)
-    flops = 2 * vec_width * num_vector_registers * nthreads * iters  # fma is two flops
-    return flops / times.min
-
-
-@registry.estimate_peak_flops.register("cpu")
-def estimate_peak_fma_flops(
-    func: PrimFunc,
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-    num_vector_registers: Optional[int] = None,
-) -> Tuple[float, float, str]:
-    """
-    Estimate the maximum number of FLOP/s this target/device combo is capable
-    of reaching by running a test program. This assumes vectorized FMA
-    (fused-multiply-add) instructions.
-
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak flops for. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind
-        intrinsic or dtype could be used with this function.
-    target : Target
-        Target to run on. This should be as specific to the actual hardware as
-        possible to make sure that LLVM generates the best vector code.
-    dev : Device
-        Device to run on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector width of SIMD units on the underlying hardware. Will try to
-        infer if no value is provided.
-    num_vector_registers : Optional[int]
-        Number of vector registers on the underlying hardware. Will try to
-        infer if no value is provided.
-
-    Returns
-    -------
-    flops : float
-        Estimated number of flops used by `func`.
-    peak_flops : float
-        Approximate sustained FLOP/s of this target/device combo assuming
-        vectorized FMA instructions. Each FMA operation counts as two FLOPs.
-    name : str
-        Dtype/intrinsic used by `func` to achieve peak flops.
-    """
-    # assume that the first argument's dtype is the one we want
-    dtype = list(func.buffer_map.values())[0].dtype
-    if "int" in dtype:
-        flops = np.sum(
-            features["int_addsub"]
-            + features["int_mul"]
-            + features["int_mad"] * 2
-            + features["int_divmod"]
-        )
-    else:
-        flops = np.sum(
-            features["float_addsub"]
-            + features["float_mul"]
-            + features["float_mad"] * 2
-            + features["float_divmod"]
-        )
-    peak_flops = estimate_peak_fma_vector_flops(
-        target, dev, remote, dtype, vec_width, num_vector_registers
-    )
-    return flops, peak_flops, f"{dtype} FMA"
-
-
-@T.prim_func
-def peak_bandwidth_tir(a: T.handle, b: T.handle, threads: T.int32, vec_width: T.int32) -> None:
-    # pylint: disable=invalid-name, missing-function-docstring
-    N = T.int32()
-    A = T.match_buffer(a, [threads, N, 4, vec_width], "float32")
-    B = T.match_buffer(b, [threads, 4, vec_width], "float32")
-    # Parallelism is necessary to hit all cores/nodes
-    for i in T.parallel(threads):
-        for k in T.serial(N):
-            for l in T.unroll(4):
-                # vectorized load is necessary to hit peak bandwidth
-                for j in T.vectorized(vec_width):
-                    # += is necessary to introduce a data dependency for all
-                    # elements of A, preventing the backend from removing the
-                    # `k` loop and setting `k` to the loop extent.
-                    B[i, l, j] += A[i, k, l, j]
-
-
-@functools.lru_cache(maxsize=None)
-def estimate_peak_bandwidth_dram(
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-) -> float:
-    """Estimate peak bandwidth for DRAM. See estimate_peak_bandwidth."""
-    vec_width, _ = _detect_vec_width_registers(target, vec_width, 1)
-    specialized = peak_bandwidth_tir.specialize(
-        {
-            peak_bandwidth_tir.params[3]: vec_width,
-        }
-    )
-    with transform.PassContext(opt_level=3):
-        f = build(specialized, target=target)
-
-    # upload to remote if running over rpc
-    if dev.device_type >= RPC_SESS_MASK:
-        if remote is None:
-            raise RuntimeError("A RPCSession must be provided when using a remote device.")
-        temp = utils.tempdir()
-        path = temp.relpath("peak_bandwidth.tar")
-        f.export_library(path)
-        remote.upload(path)
-        f = remote.load_module("peak_bandwidth.tar")
-        random_fill = remote.get_function("tvm.contrib.random.random_fill")
-    else:
-        random_fill = get_global_func("tvm.contrib.random.random_fill")
-    assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake"
-
-    threads = num_threads()
-    # Data size needs to be larger than last level of cache. We don't have a
-    # way of getting cache sizes, so this number should give us a large enough
-    # size.
-    size = 10**8 // (4 * threads * vec_width)
-    a = nd.empty((threads, size, 4, vec_width), dtype="float32", device=dev)
-    random_fill(a)
-    b = nd.empty((threads, 4, vec_width), dtype="float32", device=dev)
-    random_fill(b)
-    times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b, threads)
-    return a.numpy().size * 4 / times.min  # 4 bytes per float32
-
-
-@registry.estimate_peak_bandwidth.register("cpu")
-def estimate_peak_bandwidth(
-    func: PrimFunc,  # pylint: disable=unused-argument
-    features: Dict[str, np.ndarray],
-    target: Target,
-    dev: Device,
-    remote: Optional[RPCSession],
-    vec_width: Optional[int] = None,
-) -> Tuple[float, float, str]:
-    """Estimate peak memory bandwidth of a target/device combo.
-
-    Peak bandwidth is estimated by running a small experiment on the underlying
-    hardware. The peak bandwidth measurement assumes that vector instructions
-    are being used to load the data.
-
-    Parameters
-    ----------
-    func : PrimFunc
-        Function to estimate peak bandwidth for. Used to check if a specific
-        kind of memory could be used with this function.
-    features : Dict[str, np.ndarry]
-        Features extracted from `func`. Used to check if a specific kind of
-        memory could be used with this function.
-    target : Target
-        Target to use for measurement. This target should be as specific to the
-        underlying hardware as possible.
-    dev : Device
-        Device to measure peak bandwidth on.
-    remote : Optional[RPCSession]
-      Remote session used to upload artifacts for runtime evaluation. Must be
-      the same session used to create `dev`.
-    vec_width : Optional[int]
-        Vector unit width, determined from target if not supplied.
-
-    Returns
-    -------
-    loaded_bytes : float
-        Estimated bytes loaded by `func`.
-    peak_bandwidth : float
-        Peak memory bandwidth in bytes/seconds.
-    name : str
-        Name of the memory being used.
-    """
-    # Ideally we'd be able to use this code to measure peak bandwidth of the
-    # different cache levels. If we could just generate load commands, then we
-    # could use those in a tight loop. Instead we need some code that is
-    # limited on the cache bandwidth. With the L1 cache we need an operation
-    # that has a very low arithmetic intensity and we haven't come up with one
-    # yet.
-    peak_bandwidth = estimate_peak_bandwidth_dram(target, dev, remote, vec_width)
-    loaded_bytes = sum(
-        [np.sum(x) for (k, x) in features.items() if re.match(r"^B[0-9]+\.bytes$", k) is not None]
-    )
-    return loaded_bytes, peak_bandwidth, "DRAM"
diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
index bc3ae64b46c1..67d5d84a0c1d 100644
--- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
+++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py
@@ -30,7 +30,7 @@
 @tvm.testing.requires_llvm
 def test_llvm_add_pipeline():
     """all-platform-minimal-test: Check LLVM enablement."""
-    nn = 1024
+    nn = 128
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
@@ -38,23 +38,15 @@ def test_llvm_add_pipeline():
     BB = te.compute((n,), lambda *i: B(*i), name="B")
     T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
     C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    s[C].vectorize(xi)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def check_llvm():
-        # Specifically allow offset to test codepath when offset is available
-        Ab = tvm.tir.decl_buffer(
-            A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
-        )
-        binds = {A: Ab}
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, B, C], "llvm", binds=binds)
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
index 8f929b1c1a76..d01f9599ffe0 100644
--- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
+++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
@@ -69,28 +69,6 @@ def test_memory_usage(target, dev, dtype):
     assert dev.available_global_memory == available_memory_before
 
 
-@pytest.mark.skip(reason="Skip for passing windows test on CI")
-def test_fp16_conversion():
-    n = 100
-
-    for src, dst in [("float32", "float16"), ("float16", "float32")]:
-        A = te.placeholder((n,), dtype=src)
-        B = te.compute((n,), lambda i: A[i].astype(dst))
-
-        s = te.create_schedule([B.op])
-        func = tvm.build(s, [A, B], "llvm")
-
-        x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50)
-        y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50)
-
-        func(x_tvm, y_tvm)
-
-        expected = x_tvm.numpy().astype(dst)
-        real = y_tvm.numpy()
-
-        tvm.testing.assert_allclose(expected, real)
-
-
 def test_dtype():
     dtype = tvm.DataType("handle")
     assert dtype.type_code == tvm.DataTypeCode.HANDLE
diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py
index 366198c7de6a..8bd0cb17267d 100644
--- a/tests/python/codegen/test_target_codegen_aarch64.py
+++ b/tests/python/codegen/test_target_codegen_aarch64.py
@@ -43,9 +43,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] * B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and mul instructions using z registers
         assembly = f.get_source("asm")
@@ -75,9 +73,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] + B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and add instructions using z registers
         assembly = f.get_source("asm")
@@ -107,9 +103,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] - B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and sub instructions using z registers
         assembly = f.get_source("asm")
@@ -140,9 +134,7 @@ def check_correct_assembly(type):
         B = te.placeholder(m, dtype=type, name="B")
         C = te.placeholder(m, dtype=type, name="C")
         D = te.compute((m), lambda i: A[i] * B[i] + C[i], name="D")
-        s = te.create_schedule([D.op])
-
-        f = tvm.build(s, [A, B, C, D], target)
+        f = tvm.build(te.create_prim_func([A, B, C, D]), target=target)
 
         # Verify we see SVE load instructions and either mad or mla instructions using z registers
         assembly = f.get_source("asm")
@@ -172,9 +164,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.max(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmgt + sel instructions or a max instruction, all using z registers
         assembly = f.get_source("asm")
@@ -208,9 +198,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.min(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmgt + sel instructions or a min instruction, all using z registers
         assembly = f.get_source("asm")
@@ -244,9 +232,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.div(A[i], B[i]))
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and div instructions using z registers
         assembly = f.get_source("asm")
@@ -256,7 +242,7 @@ def check_correct_assembly(type):
         )
 
         assert len(loads) > 1
-        assert len(matches) > 1
+        assert len(matches) >= 1
 
     check_correct_assembly(type=dtype)
 
@@ -275,9 +261,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: tvm.te.floormod(A[i], B[i]), name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and mls instructions using z registers
         assembly = f.get_source("asm")
@@ -307,9 +291,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] == B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmpeq or cmeq instructions using z registers
         assembly = f.get_source("asm")
@@ -339,9 +321,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] != B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and cmpgt, cmgt, cmpne or cmne instructions, all using z registers
         assembly = f.get_source("asm")
@@ -370,9 +350,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] | B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and orr instructions using z registers
         assembly = f.get_source("asm")
@@ -401,9 +379,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype=type, name="B")
         C = te.compute((m), lambda i: A[i] & B[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see SVE load instructions and and instructions using z registers
         assembly = f.get_source("asm")
@@ -431,9 +407,7 @@ def check_correct_assembly(type):
         m = te.var("m")
         A = te.placeholder(m, dtype=type, name="A")
         C = te.compute((m), lambda i: ~A[i], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(te.create_prim_func([A, C]), target=target)
 
         # Verify we see SVE load instructions and eor instructions using z registers
         assembly = f.get_source("asm")
@@ -466,9 +440,7 @@ def check_correct_assembly(type):
         A = te.placeholder(m, dtype=type, name="A")
         B = te.placeholder(m, dtype="int32", name="B")
         C = te.compute((m), lambda i: A[B[i]], name="C")
-        s = te.create_schedule([C.op])
-
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
 
         # Verify we see gather instructions in the assembly
         assembly = f.get_source("asm")
@@ -557,10 +529,7 @@ def test_vscale_range_function_attribute(mattr, expect_attr):
     m = te.var("m")
     A = te.placeholder(m, dtype="float32", name="A")
     C = te.compute((m), lambda i: A[i] + 1, name="C")
-    s = te.create_schedule([C.op])
-
-    with tvm.target.Target(target) as target:
-        f = tvm.build(s, [A, C], target)
+    f = tvm.build(te.create_prim_func([A, C]), target=target)
 
     # Check if the vscale_range() attribute exists
     ll = f.get_source("ll")
diff --git a/tests/python/codegen/test_target_codegen_arm.py b/tests/python/codegen/test_target_codegen_arm.py
index b5c69d6df1a6..9357d38e667b 100644
--- a/tests/python/codegen/test_target_codegen_arm.py
+++ b/tests/python/codegen/test_target_codegen_arm.py
@@ -28,10 +28,9 @@ def check_correct_assembly(type, elements, counts):
         n = tvm.runtime.convert(elements)
         A = te.placeholder(n, dtype=type, name="A")
         B = te.compute(A.shape, lambda i: tvm.tir.popcount(A[i]), name="B")
-        s = te.create_schedule(B.op)
-        s[B].vectorize(s[B].op.axis[0])
-        f = tvm.build(s, [A, B], target)
-
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        sch.vectorize(sch.get_loops("B")[0])
+        f = tvm.build(sch.mod, target=target)
         # Verify we see the correct number of vpaddl and vcnt instructions in the assembly
         assembly = f.get_source("asm")
         matches = re.findall("vpaddl", assembly)
@@ -59,9 +58,9 @@ def check_correct_assembly(N):
             lambda n: te.sum(A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]),
             name="C",
         )
-        s = te.create_schedule(C.op)
-        s[C].vectorize(s[C].op.axis[0])
-        f = tvm.build(s, [A, B, C], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        sch.vectorize(sch.get_loops("C")[0])
+        f = tvm.build(sch.mod, target=target)
 
         # Verify we see the correct number of vmlal.s16 instructions
         assembly = f.get_source("asm")
@@ -83,9 +82,9 @@ def check_broadcast_correct_assembly(N):
             lambda n: te.sum(A[k, n].astype("int32") * B[k].astype("int32"), axis=[k]),
             name="C",
         )
-        s = te.create_schedule(C.op)
-        s[C].vectorize(s[C].op.axis[0])
-        f = tvm.build(s, [A, B, C], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        sch.vectorize(sch.get_loops("C")[0])
+        f = tvm.build(sch.mod, target=target)
 
         # Verify we see the correct number of vmlal.s16 instructions
         assembly = f.get_source("asm")
diff --git a/tests/python/codegen/test_target_codegen_bool.py b/tests/python/codegen/test_target_codegen_bool.py
index b9f4437110c8..a575c0cec9c9 100644
--- a/tests/python/codegen/test_target_codegen_bool.py
+++ b/tests/python/codegen/test_target_codegen_bool.py
@@ -35,29 +35,24 @@ def compute(arr_size):
 
 
 @tvm.testing.fixture
-def schedule(target, compute):
+def get_module(target, compute):
     target = tvm.target.Target(target)
     A, B, C, D = compute
     if target.kind.name == "llvm":
-        s = te.create_schedule(D.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        xo1, xo2 = s[C].split(xo, factor=13)
-        s[C].parallel(xo2)
+        return tvm.IRModule.from_expr(te.create_prim_func([A, B, D]))
 
-    else:
-        s = te.create_schedule(D.op)
-        for stage in [C, D]:
-            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
-            s[stage].bind(xo, te.thread_axis("blockIdx.x"))
-            s[stage].bind(xi, te.thread_axis("threadIdx.x"))
-
-    return s
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, D]))
+    for stage in ["C", "D"]:
+        xo, xi = sch.split(sch.get_loops(stage)[0], factors=[None, 4])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")  # inner loop runs on the threads of each block
+    return sch.mod
 
 
 @tvm.testing.uses_gpu
-def test_cmp_load_store(target, dev, arr_size, compute, schedule):
+def test_cmp_load_store(target, dev, arr_size, compute, get_module):
     A, B, _, D = compute
-    f = tvm.build(schedule, [A, B, D], target)
+    f = tvm.build(get_module, target=target)
 
     a_np = np.random.uniform(size=arr_size).astype(A.dtype)
     b_np = np.random.uniform(size=arr_size).astype(B.dtype)
diff --git a/tests/python/codegen/test_target_codegen_c_host.py b/tests/python/codegen/test_target_codegen_c_host.py
index 3aca0fc8c77e..d7a7cbc8a44b 100644
--- a/tests/python/codegen/test_target_codegen_c_host.py
+++ b/tests/python/codegen/test_target_codegen_c_host.py
@@ -31,61 +31,19 @@ def test_add():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B, C], "c", name="test_fadd")
-        temp = utils.tempdir()
-        path_dso = temp.relpath("temp.so")
-        mhost.export_library(path_dso)
-        m = tvm.runtime.load_module(path_dso)
-        fadd = m["test_fadd"]
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        n = nn
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_c()
-
-
-def test_add_pipeline():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    AA = te.compute((n,), lambda *i: A(*i), name="A")
-    BB = te.compute((n,), lambda *i: B(*i), name="B")
-    T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
-    C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    # FIXME(tvm-team): vector operators are not supported for codegen to C yet
-    # s[C].vectorize(xi)
-
-    def check_c():
-        # Specifically allow offset to test codepath when offset is available
-        Ab = tvm.tir.decl_buffer(
-            A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A"
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B, C]).with_attr("global_symbol", "test_fadd")
+            ),
+            target="c",
         )
-        binds = {A: Ab}
-        # BUILD and invoke the kernel.
-        f1 = tvm.lower(s, [A, B, C], name="test_fadd_pipeline")
-        mhost = tvm.build(f1, target="c")
-
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
         m = tvm.runtime.load_module(path_dso)
-        fadd = m["test_fadd_pipeline"]
+        fadd = m["test_fadd"]
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -105,10 +63,14 @@ def test_reinterpret():
     B = te.compute(
         A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.reinterpret", 2 + A(*i)), name="B"
     )
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_reinterpret")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_reinterpret")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -129,10 +91,14 @@ def test_ceil():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.ceil", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_ceil")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_ceil")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -153,10 +119,14 @@ def test_floor():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.floor", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_floor")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_floor")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -177,10 +147,14 @@ def test_round():
     n = tvm.runtime.convert(nn)
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.round", A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
     def check_c():
-        mhost = tvm.build(s, [A, B], "c", name="test_round")
+        mhost = tvm.build(
+            tvm.IRModule.from_expr(
+                te.create_prim_func([A, B]).with_attr("global_symbol", "test_round")
+            ),
+            target="c",
+        )
         temp = utils.tempdir()
         path_dso = temp.relpath("temp.so")
         mhost.export_library(path_dso)
@@ -196,42 +170,6 @@ def check_c():
     check_c()
 
 
-def test_call_packed():
-    def fake_func(fname="fake.func"):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.pointer("float32", name="A")
-        fake_func1 = tvm.tir.call_packed(fname, A[0])
-
-        ib.emit(fake_func1)
-        body = ib.get()
-        return A, body
-
-    def check_global_packed_func():
-        fname = "fake.func"
-        A, body = fake_func(fname)
-        func1 = tvm.tir.PrimFunc([A], body).with_attr("global_symbol", "func1")
-        B, body = fake_func()
-        func2 = tvm.tir.PrimFunc([B], body).with_attr("global_symbol", "func2")
-        mod = tvm.IRModule({"fake_func1": func1, "fake_func2": func2})
-        fcode = tvm.build(mod, None, "c")
-        src = fcode.get_source()
-
-        # there are two locations calling the packed func
-        assert src.count(fname) == 2
-
-        suffix = "_packed"
-        packed_func_name = fname + suffix
-        # func name will be standardized by GetUniqueName and not exists anymore
-        assert src.find(packed_func_name) == -1
-
-        packed_func_real_name = "_".join(fname.split(".")) + suffix
-        func_declaration = "static void* %s = NULL;" % packed_func_real_name
-        # src only has 1 valid declaration
-        assert src.count(func_declaration) == 1
-
-    check_global_packed_func()
-
-
 def test_subroutine_call():
     @I.ir_module
     class mod:
diff --git a/tests/python/codegen/test_target_codegen_cross_llvm.py b/tests/python/codegen/test_target_codegen_cross_llvm.py
index 8758ae2a04e8..9dc001e1949a 100644
--- a/tests/python/codegen/test_target_codegen_cross_llvm.py
+++ b/tests/python/codegen/test_target_codegen_cross_llvm.py
@@ -32,10 +32,11 @@ def test_llvm_add_pipeline():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def verify_elf(path, e_machine):
         with open(path, "rb") as fi:
@@ -48,7 +49,7 @@ def verify_elf(path, e_machine):
     def build_i386():
         temp = utils.tempdir()
         target = "llvm -mtriple=i386-pc-linux-gnu"
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(sch.mod, target=target)
         path = temp.relpath("myadd.o")
         f.save(path)
         verify_elf(path, 0x03)
@@ -59,7 +60,7 @@ def build_arm():
             print("Skip because %s is not enabled.." % target)
             return
         temp = utils.tempdir()
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(sch.mod, target=target)
         path = temp.relpath("myadd.o")
         f.save(path)
         verify_elf(path, 0x28)
diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py
index 7b370f3e3211..ae3173a14dee 100644
--- a/tests/python/codegen/test_target_codegen_cuda.py
+++ b/tests/python/codegen/test_target_codegen_cuda.py
@@ -28,9 +28,6 @@
 import tvm.testing
 import pytest
 
-tx = te.thread_axis("threadIdx.x")
-bx = te.thread_axis("blockIdx.x")
-
 
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
@@ -46,11 +43,13 @@ def check_cuda(dtype, n, lanes):
             return
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        fun = tvm.build(s, [A, B], "cuda")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         dev = tvm.cuda(0)
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, dev)
@@ -96,14 +95,15 @@ def np_bf162np_float(arr):
     def check_cuda(n, lanes):
         A = te.placeholder((n,), name="A", dtype="bfloat16x%d" % lanes)
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
         with tvm.transform.PassContext(
             disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"]
         ):
-            fun = tvm.build(s, [A, B], "cuda")
+            fun = tvm.build(sch.mod, target="cuda")
         dev = tvm.cuda(0)
         np_a = np.random.uniform(size=(n, lanes)).astype("float32")
         np_a = np_bf162np_float(np_float2np_bf16(np_a))
@@ -134,11 +134,12 @@ def check_cuda(dtype, n, lanes):
         D = te.compute(
             (n,), lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name="D"
         )
-        s = te.create_schedule(D.op)
-        xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
-        s[D].bind(xo, bx)
-        s[D].bind(xi, tx)
-        fun = tvm.build(s, [A, B, C, D], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C, D]))
+        xo, xi = sch.split(sch.get_loops("D")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
         np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
         np_c = np.random.randint(low=0, high=127, size=(n,))
@@ -163,11 +164,13 @@ def check_cuda(dtype, n, lanes):
         dev = tvm.cuda(0)
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i], name="B")
-        s = te.create_schedule(B.op)
-        block, thread = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(block, bx)
-        s[B].bind(thread, tx)
-        fun = tvm.build(s, [A, B], "cuda", name="vector_load")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a)
         b = tvm.nd.empty((n,), B.dtype, dev)
@@ -187,12 +190,14 @@ def test_cuda_make_int8():
     def check_cuda(n, value, lanes):
         dtype = "int8"
         dev = tvm.cuda(0)
-        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype))
-        s = te.create_schedule(A.op)
-        y, x = s[A].op.axis
-        s[A].vectorize(x)
-        s[A].bind(y, bx)
-        fun = tvm.build(s, [A], "cuda", name="make_int8x4")
+        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A]))
+        y, x = sch.get_loops("A")
+        sch.vectorize(x)
+        sch.bind(y, "blockIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.full((n, lanes), value, dtype=dtype)
         a = tvm.nd.empty(np_a.shape, dtype, dev)
         fun(a)
@@ -215,13 +220,13 @@ def test_cuda_make_int4():
     def check_cuda(n, value, lanes):
         dtype = "int4"
         dev = tvm.cuda(0)
-        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype))
-        s = te.create_schedule(A.op)
-        y, x = s[A].op.axis
-        s[A].vectorize(x)
-        s[A].bind(y, bx)
-        kernel_name = "make_int4x" + str(lanes)
-        fun = tvm.build(s, [A], "cuda", name=kernel_name)
+        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A")
+        sch = tvm.tir.Schedule(te.create_prim_func([A]))
+        y, x = sch.get_loops("A")
+        sch.vectorize(x)
+        sch.bind(y, "blockIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.full((n, lanes), value, dtype="int8")
         a = tvm.nd.empty((n, lanes), dtype, dev)
         fun(a)
@@ -246,9 +251,13 @@ def check_inf_nan(dev, n, value, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         inf_value = tvm.tir.const(value, dtype=dtype)
         C = te.compute((n,), lambda i: inf_value, name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], tx)
-        fun = tvm.build(s, [A, C], target)
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 8])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="cuda")
+
         a = tvm.nd.empty((n,), A.dtype, dev)
         c = tvm.nd.empty((n,), A.dtype, dev)
         # Only need to test compiling here
@@ -264,53 +273,6 @@ def check_inf_nan(dev, n, value, dtype):
     check_inf_nan(dev, 1, float("nan"), "float64")
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_shuffle():
-    idxm = tvm.tir.indexmod
-    a = te.placeholder((64,), "int32")
-    b = te.placeholder((64,), "int32")
-    c = te.compute((64,), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))])
-    sch = te.create_schedule(c.op)
-    x = c.op.axis[0]
-    xo, xi = sch[c].split(x, 4)
-    thrx = te.thread_axis("threadIdx.x")
-    sch[c].bind(xo, thrx)
-    sch[c].vectorize(xi)
-
-    def MyVectorize():
-        def vectorizer(op):
-            if op.kind == tvm.tir.ForKind.VECTORIZED:
-                idx = tvm.tir.Ramp(4 * thrx.var, 1, 4)
-                store = op.body
-                value = store.value
-                new_a = tvm.tir.BufferLoad(value.a.buffer, [idx])
-                bs, ids = [], []
-                for i in range(4):
-                    bs.append(tvm.tir.BufferLoad(value.b.buffer, [4 * thrx.var + i]))
-                    ids.append(3 - i)
-                new_b = tvm.tir.Shuffle(bs, ids)
-                return tvm.tir.BufferStore(store.buffer, new_a + new_b, [idx])
-            return None
-
-        def _transform(f, *_):
-            return f.with_body(
-                tvm.tir.stmt_functor.ir_transform(f.body, None, vectorizer, ["tir.For"])
-            )
-
-        return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="MyVectorize")
-
-    with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, MyVectorize())]}):
-        module = tvm.build(sch, [a, b, c], target="cuda")
-        a_ = np.array(list(range(64)), dtype="int32")
-        b_ = np.array((list(range(4))[::-1]) * 16, dtype="int32")
-        c_ = np.zeros((64,), dtype="int32")
-        ref = a_ + np.array((list(range(4))) * 16, dtype="int32")
-        nda, ndb, ndc = [tvm.nd.array(i, tvm.cuda(0)) for i in [a_, b_, c_]]
-        module(nda, ndb, ndc)
-        tvm.testing.assert_allclose(ndc.numpy(), ref)
-
-
 @tvm.testing.parametrize_targets("cuda", "rocm")
 def test_crossthread_reduction1(target, dev):
     n = te.var("n")
@@ -320,12 +282,13 @@ def test_crossthread_reduction1(target, dev):
     B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
 
     def sched(nthd):
-        s = te.create_schedule(B.op)
-        ko, _ = s[B].split(B.op.reduce_axis[0], nparts=nthd)
-        s[B].bind(ko, te.thread_axis("threadIdx.x"))
-        s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x"))
-        func = tvm.build(s, [A, B], target)
-        return func
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        x, k = sch.get_loops("B")
+        ko, _ = sch.split(k, factors=[nthd, None])
+        sch.bind(ko, "threadIdx.x")
+        sch.bind(x, "blockIdx.x")
+        fun = tvm.build(sch.mod, target=target)  # honor the parametrized target (cuda/rocm)
+        return fun
 
     def verify(nthd):
         func = sched(nthd)
@@ -355,13 +318,14 @@ def test_crossthread_reduction2(target, dev):
     B = te.compute((n,), lambda i: te.sum(A[i, k0, k1], axis=(k0, k1)), name="B")
 
     def sched(nthdx, nthdy):
-        s = te.create_schedule(B.op)
-        k0o, _ = s[B].split(B.op.reduce_axis[0], nparts=nthdx)
-        k1o, _ = s[B].split(B.op.reduce_axis[1], nparts=nthdy)
-        s[B].bind(k0o, te.thread_axis("threadIdx.x"))
-        s[B].bind(k1o, te.thread_axis("threadIdx.y"))
-        s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x"))
-        func = tvm.build(s, [A, B], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        x, k0, k1 = sch.get_loops("B")
+        k0o, _ = sch.split(k0, factors=[nthdx, None])
+        k1o, _ = sch.split(k1, factors=[nthdy, None])
+        sch.bind(k0o, "threadIdx.x")
+        sch.bind(k1o, "threadIdx.y")
+        sch.bind(x, "blockIdx.x")
+        func = tvm.build(sch.mod, target=target)  # honor the test's `target` parameter
         return func
 
     def verify(nthdx, nthdy):
@@ -389,42 +353,13 @@ def test_cuda_reduction_binding():
     k = te.reduce_axis((0, 32), "k")
     A = te.placeholder((96, 32), name="A")
     B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].reorder(B.op.reduce_axis[0], B.op.axis[0])
-
-    mo, _ = s[B].split(B.op.axis[0], 32)
-    s[B].bind(mo, te.thread_axis("blockIdx.x"))
 
-    fcuda = tvm.build(s, [A, B], "cuda")
-
-
-@tvm.testing.parametrize_targets("cuda", "rocm")
-def test_rfactor_predicates(target, dev):
-    n = te.reduce_axis((0, 129), "n")
-    A = te.placeholder((129,), name="A")
-    B = te.compute((1,), lambda b: te.sum(A[n], axis=n), name="B")
-
-    s = te.create_schedule(B.op)
-
-    _, ni = s[B].split(s[B].op.reduce_axis[0], factor=8)
-
-    BF = s.rfactor(B, ni, 0)
-    s[B].set_store_predicate(tx.var.equal(0))
-
-    s[B].bind(s[B].op.reduce_axis[0], tx)
-    s[B].bind(s[B].op.axis[0], bx)
-
-    s[BF].compute_at(s[B], s[B].op.axis[0])
-
-    _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2)
-
-    BF2 = s.rfactor(BF, noi, 0)
-
-    s[BF].bind(s[BF].op.axis[0], tx)
-    s[BF2].compute_at(s[BF], s[BF].op.axis[1])
-
-    fcuda = tvm.build(s, [A, B], target)
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+    x, k = sch.get_loops("B")
+    sch.reorder(k, x)
+    mo, _ = sch.split(x, factors=[None, 32])
+    sch.bind(mo, "blockIdx.x")
+    func = tvm.build(sch.mod, target="cuda")
 
 
 @tvm.testing.requires_gpu
@@ -436,15 +371,14 @@ def test_cuda_const_float_to_half():
     shape = (2, 3, 4)
     a = te.placeholder(shape, dtype="float16", name="a")
     b = tvm.tir.const(0.5, dtype="float16")
-    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="c")
-    s = te.create_schedule(c.op)
-    axes = [axis for axis in c.op.axis]
-    fused = s[c].fuse(*axes)
-    bx, tx = s[c].split(fused, factor=64)
-    s[c].bind(bx, te.thread_axis("blockIdx.x"))
-    s[c].bind(tx, te.thread_axis("threadIdx.x"))
-
-    func = tvm.build(s, [a, c], "cuda")
+    c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="C")
+
+    sch = tvm.tir.Schedule(te.create_prim_func([a, c]))
+    xo, xi = sch.split(sch.fuse(*sch.get_loops("C")), factors=[None, 64])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    func = tvm.build(sch.mod, target="cuda")
+
     dev = tvm.cuda(0)
     a_np = np.random.uniform(size=shape).astype(a.dtype)
     c_np = np.zeros(shape=shape, dtype=c.dtype)
@@ -463,13 +397,14 @@ def test_cuda_floordiv_with_vectorization():
         k = 37
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: A[tvm.tir.floordiv(i, k)], name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], nparts=1)
-        xio, xii = s[B].split(xi, factor=4)
-        s[B].vectorize(xii)
-        s[B].bind(xo, bx)
-        s[B].bind(xio, tx)
-        func = tvm.build(s, [A, B], "cuda")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None])
+        xio, xii = sch.split(xi, factors=[None, 4])
+        sch.vectorize(xii)
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xio, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         dev = tvm.cuda(0)
         a_np = np.random.uniform(size=(n,)).astype(A.dtype)
@@ -489,13 +424,13 @@ def test_cuda_floormod_with_vectorization():
         k = 37
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: A[tvm.tir.floormod(i, k)], name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], nparts=1)
-        xio, xii = s[B].split(xi, factor=4)
-        s[B].vectorize(xii)
-        s[B].bind(xo, bx)
-        s[B].bind(xio, tx)
-        func = tvm.build(s, [A, B], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None])
+        xio, xii = sch.split(xi, factors=[None, 4])
+        sch.vectorize(xii)
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xio, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         dev = tvm.cuda(0)
         a_np = np.random.uniform(size=(n,)).astype(A.dtype)
@@ -521,11 +456,11 @@ def check(t0, t1, factor):
         C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")
 
         # schedule
-        s = tvm.te.create_schedule(C.op)
-        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
-        s[C].vectorize(ib)
-        s[C].bind(ob, tx)
-        func = tvm.build(s, [A, B, C], "cuda")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
+        ob, ib = sch.split(sch.get_loops("C")[0], factors=[None, factor])
+        sch.vectorize(ib)
+        sch.bind(ob, "threadIdx.x")
+        func = tvm.build(sch.mod, target="cuda")
 
         # correctness
         dev = tvm.cuda(0)
@@ -570,15 +505,16 @@ def skip(t0, t1):
     check("uint8", "int8", 16)
 
 
-def sched(B):
-    s = te.create_schedule(B.op)
-    io, ii = s[B].split(s[B].op.axis[0], nparts=1)
-    iio, iii = s[B].split(ii, nparts=32)
-    _, iiii = s[B].split(iii, factor=4)
-    s[B].vectorize(iiii)
-    s[B].bind(io, bx)
-    s[B].bind(iio, tx)
-    return s
+def sched(A, B):
+    """Split, vectorize and thread-bind the loop of block `B`, then build it for CUDA."""
+    schedule = tvm.tir.Schedule(te.create_prim_func([A, B]))
+    block_loop, rest = schedule.split(schedule.get_loops("B")[0], factors=[1, None])
+    thread_loop, per_thread = schedule.split(rest, factors=[32, None])
+    _, vec_loop = schedule.split(per_thread, factors=[None, 4])
+    schedule.vectorize(vec_loop)
+    schedule.bind(block_loop, "blockIdx.x")
+    schedule.bind(thread_loop, "threadIdx.x")
+    return tvm.build(schedule.mod, target="cuda")
 
 
 @tvm.testing.requires_gpu
@@ -627,8 +563,7 @@ def run_test(tvm_intrin, np_func, dtype):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
@@ -653,8 +588,7 @@ def run_test(tvm_intrin, np_func):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
@@ -679,8 +613,7 @@ def run_test(dtype):
         n = 128
         A = te.placeholder((n,), dtype=dtype, name="A")
         B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B")
-        s = sched(B)
-        f = tvm.build(s, [A, B], "cuda")
+        f = sched(A, B)
         dev = tvm.cuda(0)
         a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev)
@@ -711,12 +644,14 @@ def check_cuda(dtype, n, l, padding, lanes):
             ),
             name="B",
         )
-        s = te.create_schedule(B.op)
-        block, thread, vectorize = s[B].op.axis
-        s[B].bind(block, bx)
-        s[B].bind(thread, tx)
-        s[B].vectorize(vectorize)
-        fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
+
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        block, thread, vectorize = sch.get_loops("B")
+        sch.bind(block, "blockIdx.x")
+        sch.bind(thread, "threadIdx.x")
+        sch.vectorize(vectorize)
+        fun = tvm.build(sch.mod, target="cuda")
+
         np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype)
         a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a)
         b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev)
@@ -736,205 +671,6 @@ def check_cuda(dtype, n, l, padding, lanes):
     check_cuda("float32", 64, 16, 3, 4)
 
 
-def vcf_check_common(s, args):
-    N = 512
-
-    # To check if every vectorize loop transforms to ramp expr successfully
-    stmt = tvm.lower(s, args)
-    # Use this as a stack flag to show whether this stmt is inside a BroadcastNode
-    inside_broadcast = [False]
-
-    # Possible patterns:
-    # Reduce init:          BufferStore[Ramp] = Broadcast(0)
-    # Shared memory copy:   BufferStore[Ramp] = BufferLoad[Ramp]
-    # Compute:              BufferStore[Ramp] = BufferLoad[Ramp] ... Broadcast[Load]
-
-    def pre_visit(stmt):
-        if isinstance(stmt, tvm.tir.Broadcast):
-            inside_broadcast[0] = True
-            # Check Broadcast[Imm numbers] or Broadcast[Load] patterns
-            assert isinstance(stmt.value, (tvm.tir.IntImm, tvm.tir.FloatImm, tvm.tir.BufferLoad))
-
-        if isinstance(stmt, (tvm.tir.BufferStore, tvm.tir.BufferLoad)):
-            is_ramp_index = isinstance(stmt.indices[-1], tvm.tir.Ramp)
-            is_vectorized_buffer = re.match(r"^.*x\d+$", stmt.buffer.dtype)
-            if isinstance(stmt, tvm.tir.BufferLoad):
-                # Check Broadcast[BufferLoad] or BufferLoad[Ramp] patterns
-                assert inside_broadcast[0] or is_ramp_index or is_vectorized_buffer
-                # Skip the rest of the BufferLoad
-                return stmt
-            else:
-                assert is_ramp_index or is_vectorized_buffer
-
-        return None
-
-    def post_visit(stmt):
-        if isinstance(stmt, tvm.tir.Broadcast):
-            inside_broadcast[0] = False
-        return None
-
-    tvm.tir.stmt_functor.ir_transform(stmt["main"].body, pre_visit, post_visit)
-
-    tgt = tvm.target.cuda()
-    mod = tvm.build(s, args, tgt)
-    # To check if every vectorize loop transforms to correct instruction
-    # print(mod.imported_modules[0].get_source())
-
-    dev = tvm.device("cuda", 0)
-    a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev)
-    b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev)
-    c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), dev)
-    mod(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_vectorized_cooperative_fetching_x():
-    N = 512
-    A = te.placeholder((N, N), name="A", dtype="float32")
-    B = te.placeholder((N, N), name="B", dtype="float32")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k))
-    s = te.create_schedule(C.op)
-    i, j = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    BB = s.cache_read(B, "shared", [C])
-
-    i3, i4 = s[C].split(i, factor=4)
-    i2, i3 = s[C].split(i3, factor=2)
-    i1, i2 = s[C].split(i2, factor=8)
-    i0, i1 = s[C].split(i1, factor=1)
-    j3, j4 = s[C].split(j, factor=4)
-    j2, j3 = s[C].split(j3, factor=2)
-    j1, j2 = s[C].split(j2, factor=8)
-    j0, j1 = s[C].split(j1, factor=2)
-    k1, k2 = s[C].split(k, factor=8)
-    k0, k1 = s[C].split(k1, factor=8)
-    s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4)
-    block_it = s[C].fuse(i0, j0)
-    s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x"))
-    vthread_it = s[C].fuse(i1, j1)
-    s[C].bind(vthread_it, tvm.te.thread_axis("vthread"))
-    thread_it = s[C].fuse(i2, j2)
-    s[C].bind(thread_it, tvm.te.thread_axis("threadIdx.x"))
-    s[C].vectorize(j4)
-
-    s[AA].compute_at(s[C], k0)
-    iaa, jaa = s[AA].op.axis
-    s[BB].compute_at(s[C], k0)
-    ibb, jbb = s[BB].op.axis
-    aa_fused = s[AA].fuse(iaa, jaa)
-    bb_fused = s[BB].fuse(ibb, jbb)
-    aa1, aa2 = s[AA].split(aa_fused, factor=4)
-    aa0, aa1 = s[AA].split(aa1, factor=64)
-    bb1, bb2 = s[BB].split(bb_fused, factor=4)
-    bb0, bb1 = s[BB].split(bb1, factor=64)
-    s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(aa2)
-    s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(bb2)
-
-    vcf_check_common(s, [A, B, C])
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_vectorized_cooperative_fetching_xy():
-    N = 512
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k))
-    s = te.create_schedule(C.op)
-    i, j = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    BB = s.cache_read(B, "shared", [C])
-
-    i3, i4 = s[C].split(i, factor=4)
-    i2, i3 = s[C].split(i3, factor=2)
-    i1, i2 = s[C].split(i2, factor=8)
-    i0, i1 = s[C].split(i1, factor=1)
-    j3, j4 = s[C].split(j, factor=4)
-    j2, j3 = s[C].split(j3, factor=2)
-    j1, j2 = s[C].split(j2, factor=8)
-    j0, j1 = s[C].split(j1, factor=2)
-    k1, k2 = s[C].split(k, factor=8)
-    k0, k1 = s[C].split(k1, factor=8)
-    s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4)
-    block_it = s[C].fuse(i0, j0)
-    s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x"))
-    vthread_it = s[C].fuse(i1, j1)
-    s[C].bind(vthread_it, tvm.te.thread_axis("vthread"))
-    s[C].bind(i2, tvm.te.thread_axis("threadIdx.y"))
-    s[C].bind(j2, tvm.te.thread_axis("threadIdx.x"))
-    s[C].vectorize(j4)
-
-    s[AA].compute_at(s[C], k0)
-    iaa, jaa = s[AA].op.axis
-    s[BB].compute_at(s[C], k0)
-    ibb, jbb = s[BB].op.axis
-    aa_fused = s[AA].fuse(iaa, jaa)
-    bb_fused = s[BB].fuse(ibb, jbb)
-    aa2, aa3 = s[AA].split(aa_fused, factor=4)
-    aa1, aa2 = s[AA].split(aa2, factor=8)
-    aa0, aa1 = s[AA].split(aa1, factor=8)
-    bb2, bb3 = s[BB].split(bb_fused, factor=4)
-    bb1, bb2 = s[BB].split(bb2, factor=8)
-    bb0, bb1 = s[BB].split(bb1, factor=8)
-    s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.y"))
-    s[AA].bind(aa2, tvm.te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(aa3)
-    s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.y"))
-    s[BB].bind(bb2, tvm.te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(bb3)
-
-    vcf_check_common(s, [A, B, C])
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_unrolled_vectorization():
-    dtype = "float32"
-    target = "cuda"
-
-    # Compute declaration
-    N = 128
-    A = te.placeholder((N, N), name="A")
-    B = te.placeholder((N, N), name="B")
-    k = te.reduce_axis((0, N), name="k")
-    C = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C")
-
-    # Schedule
-    s = te.create_schedule([C.op])
-    CC = s.cache_write(C, "local")
-    i, j = s[C].op.axis
-    bx, tx, ii, ji = s[C].tile(i, j, 1, 2)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].vectorize(ji)
-    s[CC].compute_at(s[C], tx)
-    i, j = s[CC].op.axis
-    k = s[CC].op.reduce_axis[0]
-    ko, ki = s[CC].split(k, 2)
-    s[CC].unroll(ki)
-    s[CC].vectorize(j)
-
-    # Check correctness
-    dev = tvm.device(target)
-    a_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev)
-    b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev)
-    c_tvm = tvm.nd.empty((N, N), device=dev)
-    func_tvm = tvm.build(s, [A, B, C], target=target)
-    func_tvm(a_tvm, b_tvm, c_tvm)
-    c_np = c_tvm.numpy()
-    tvm.testing.assert_allclose(c_np, N * np.ones((N, N)))
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_try_unaligned_vector_load():
@@ -950,16 +686,15 @@ def get_compute_aligned():
         return get_compute(4, 2, 2)
 
     def build(A, C, N, C_N):
-        s = te.create_schedule(C.op)
-        oi, ii = s[C].split(C.op.axis[0], factor=2)
-        s[C].bind(oi, te.thread_axis("threadIdx.x"))
-        s[C].vectorize(ii)  # BUG: misalignment
-
-        tgt = tvm.target.Target(target="cuda", host="llvm")
-        dev = tvm.device(tgt.kind.name, 0)
-        f = tvm.build(s, [A, C], tgt, name="foo")
-        kernel_source = f.imported_modules[0].get_source()
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        oi, ii = sch.split(sch.get_loops("C")[0], factors=[None, 2])
+        sch.bind(oi, "threadIdx.x")
+        sch.vectorize(ii)  # BUG: misalignment
 
+        f = tvm.build(sch.mod, target="cuda")
+
+        kernel_source = f.imported_modules[0].get_source()
+        dev = tvm.cuda()
         a_data = np.arange(0, N).astype(A.dtype)
         a = tvm.nd.array(a_data, dev)
         c = tvm.nd.array(np.zeros(C_N, dtype=C.dtype), dev)
@@ -984,28 +719,6 @@ def build(A, C, N, C_N):
     assert np.allclose(c, expected), f"expected={expected}\nactual={c}"
 
 
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_cuda_save_kernels_for_profiling():
-    num_thread = 8
-
-    def check_cuda(n, lanes):
-        dtype = "float32"
-        A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
-        B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        tempdir = utils.tempdir()
-        tmp_path = str(tempdir.path)
-        with tvm.transform.PassContext(opt_level=3, config={"cuda.kernels_output_dir": tmp_path}):
-            _ = tvm.build(s, [A, B], "cuda")
-        assert "tvm_kernels.cu" in os.listdir(tmp_path)
-
-    check_cuda(64, 2)
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_cuda_thread_sync_inside_condition():
diff --git a/tests/python/codegen/test_target_codegen_device.py b/tests/python/codegen/test_target_codegen_device.py
index b4181fb7b014..ad27356961aa 100644
--- a/tests/python/codegen/test_target_codegen_device.py
+++ b/tests/python/codegen/test_target_codegen_device.py
@@ -19,6 +19,7 @@
 from tvm.contrib import utils
 import numpy as np
 import tvm.testing
+from tvm import tir
 
 
 @tvm.testing.requires_gpu
@@ -29,16 +30,25 @@ def test_large_uint_imm():
     num_thread = 2
 
     A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name="A")
-    s = te.create_schedule(A.op)
-    xo, xi = s[A].split(A.op.axis[0], factor=num_thread)
-    s[A].bind(xi, te.thread_axis("threadIdx.x"))
-    s[A].bind(xo, te.thread_axis("blockIdx.x"))
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("A")
+    loop = sch.get_loops(block)[0]
+
+    # Split and bind
+    xo, xi = sch.split(loop, factors=[None, num_thread])
+    sch.bind(xi, "threadIdx.x")
+    sch.bind(xo, "blockIdx.x")
 
     def check_target(device):
         if not tvm.testing.device_enabled(device):
             return
         dev = tvm.device(device, 0)
-        f = tvm.build(s, [A], device)
+        f = tvm.build(sch.mod, target=device)
         # launch the kernel.
         a = tvm.nd.empty((n,), dtype=A.dtype, device=dev)
         f(a)
@@ -55,23 +65,36 @@ def test_add_pipeline():
     B = te.placeholder((), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(), name="C")
     D = te.compute(A.shape, lambda *i: C(*i) + 1, name="D")
-    s = te.create_schedule(D.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, D])
+    sch = tir.Schedule(mod)
+
+    # Get blocks and loops
+    c_block = sch.get_block("C")
+    d_block = sch.get_block("D")
+    c_loop = sch.get_loops(c_block)[0]
+    d_loop = sch.get_loops(d_block)[0]
 
     # GPU schedule have to split by gridIdx and threadIdx
     num_thread = 256
-    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
-    s[C].bind(xi, te.thread_axis("threadIdx.x"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
 
-    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
-    s[D].bind(xi, te.thread_axis("threadIdx.x"))
-    s[D].bind(xo, te.thread_axis("blockIdx.x"))
+    # Schedule C
+    c_xo, c_xi = sch.split(c_loop, factors=[None, num_thread])
+    sch.bind(c_xi, "threadIdx.x")
+    sch.bind(c_xo, "blockIdx.x")
+
+    # Schedule D
+    d_xo, d_xi = sch.split(d_loop, factors=[None, num_thread])
+    sch.bind(d_xi, "threadIdx.x")
+    sch.bind(d_xo, "blockIdx.x")
 
     def check_target(device, host="stackvm"):
         if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host):
             return
         dev = tvm.device(device, 0)
-        mhost = tvm.driver.build(s, [A, B, D], target=tvm.target.Target(device, host))
+        target = tvm.target.Target(device, host)
+        mhost = tvm.build(sch.mod, target=target)
         f = mhost.entry_func
         # launch the kernel.
         n = 1027
diff --git a/tests/python/codegen/test_target_codegen_extern.py b/tests/python/codegen/test_target_codegen_extern.py
index 38fac332e9de..378eb427fd54 100644
--- a/tests/python/codegen/test_target_codegen_extern.py
+++ b/tests/python/codegen/test_target_codegen_extern.py
@@ -18,6 +18,8 @@
 from tvm import te
 import numpy as np
 import tvm.testing
+import pytest
+from tvm import tir
 
 
 @tvm.testing.uses_gpu
@@ -56,18 +58,18 @@ def extern_generator_gpu(ins, outs):
 
     C_cpu = te.extern(A.shape, [A], extern_generator, name="C")
     C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name="C")
-    s_cpu = te.create_schedule(C_cpu.op)
-    s_gpu = te.create_schedule(C_gpu.op)
-    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
-    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))
+
+    # Create IRModules directly
+    mod_cpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_cpu]))
+    mod_gpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_gpu]))
 
     def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
-        s = s_gpu if target in ["opencl", "cuda"] else s_cpu
+        mod = mod_gpu if target in ["opencl", "cuda"] else mod_cpu
         C = C_gpu if target in ["opencl", "cuda"] else C_cpu
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.device(target, 0)
         # launch the kernel.
         n = nn
@@ -91,7 +93,9 @@ def extern_generator(ins, outs):
         return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0])
 
     C = te.extern(A.shape, [A], extern_generator, name="C")
-    s = te.create_schedule(C.op)
+
+    # Create IRModule directly
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, C]))
 
     @tvm.register_func
     def my_extern_array_func1(aa, bb):
@@ -101,7 +105,7 @@ def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -115,6 +119,7 @@ def check_target(target):
     check_target("llvm")
 
 
+@pytest.mark.skip("LEGACY-TO-FIX: limitation of create_prim_func with intermediate buffer")
 def test_pack_buffer_intermediate():
     nn = 1024
     n = tvm.runtime.convert(nn)
@@ -126,13 +131,13 @@ def extern_generator(ins, outs):
         return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0])
 
     C = te.extern(B.shape, [B], extern_generator, name="C")
-    s = te.create_schedule(C.op)
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, C]))
 
     def check_target(target):
         if not tvm.testing.device_enabled(target):
             return
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], target)
+        f = tvm.build(mod, target=target)
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
diff --git a/tests/python/codegen/test_target_codegen_hexagon.py b/tests/python/codegen/test_target_codegen_hexagon.py
index c97637f927b7..37e62e5b34ef 100644
--- a/tests/python/codegen/test_target_codegen_hexagon.py
+++ b/tests/python/codegen/test_target_codegen_hexagon.py
@@ -15,14 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import numpy as np
 import os
-import pytest
 import re
 import sys
+import numpy as np
+import pytest
 import tvm
 import tvm.testing
 import tvm.contrib.hexagon as hexagon
+from tvm import te
 
 
 @pytest.fixture(autouse=True)
@@ -39,28 +40,17 @@ def register_linker():
 def test_basic():
     target = tvm.target.hexagon("v66", hvx=128)
 
-    def check_add(offload):
+    def check_add():
         A = tvm.te.placeholder((128,), dtype="uint8", name="A")
         B = tvm.te.placeholder((128,), dtype="uint8", name="A")
         C = tvm.te.compute((128,), lambda i: A[i] + B[i], name="C")
-        s = tvm.te.create_schedule(C.op)
-
-        if offload:
-            xo, xi = s[C].split(s[C].op.axis[0], nparts=1)
-            s[C].bind(xo, tvm.te.thread_axis("pipeline"))
-            m = tvm.build(s, [C, A, B], target=target, name="offload_add")
-            hexm = m.imported_modules[0]
-        else:
-            hexm = tvm.build(
-                s, [C, A, B], target=tvm.target.Target(target, target), name="native_add"
-            )
-
+        mod = tvm.IRModule.from_expr(te.create_prim_func([C, A, B]))
+        hexm = tvm.build(mod, target=tvm.target.Target(target, target))
         asm = hexm.get_source("s")
         vadds = re.findall(r"v[0-9]+.b = vadd\(v[0-9]+.b,v[0-9]+.b\)", asm)
         assert vadds  # Check that it's non-empty
 
-    check_add(True)
-    check_add(False)
+    check_add()
 
 
 @tvm.testing.requires_hexagon
@@ -69,48 +59,22 @@ def test_llvm_target_features():
     # Define some trivial compute
     A = tvm.te.placeholder((128,), dtype="uint8", name="A")
     C = tvm.te.compute((128,), lambda i: A[i] + 1, name="C")
-    s = tvm.te.create_schedule(C.op)
-    m = tvm.build(s, [C, A], target=tvm.target.Target(target, target), name="add_one")
+    mod = tvm.IRModule.from_expr(te.create_prim_func([C, A]).with_attr("global_symbol", "add_one"))
+    m = tvm.build(mod, target=tvm.target.Target(target, target))
     llvm_ir = m.get_source("ll")
     # Make sure we find +hvx-length128b in "attributes".
     fs = re.findall(r"attributes.*\+hvx-length128b", llvm_ir)
     assert fs  # Check that it's non-empty
 
 
-@tvm.testing.requires_hexagon
-def test_alloc_vtcm():
-    target = tvm.target.hexagon("v66")
-
-    buf_len = 2048
-    A = tvm.te.placeholder((buf_len,), name="A", dtype="int8")
-    B = tvm.te.placeholder((buf_len,), name="B", dtype="int8")
-
-    A_buf = tvm.te.compute((buf_len,), lambda *i: A(*i), "A_buf")
-    B_buf = tvm.te.compute((buf_len,), lambda *i: B(*i), "B_buf")
-    C = tvm.te.compute((buf_len,), lambda *i: A_buf(*i) + B_buf(*i), name="C")
-    s = tvm.te.create_schedule(C.op)
-
-    # Use VTCM for each buffer.
-    s[A_buf].set_scope("local.vtcm")
-    s[B_buf].set_scope("local.vtcm")
-
-    config = {"tir.add_lower_pass": hexagon.ir_lower_vtcm_pass()}
-    with tvm.transform.PassContext(config=config):
-        irmod = tvm.lower(s, [A, B, C], name="alloc_vtcm")
-
-    calls = re.findall("HexagonBackend[A-Za-z]*VTCM", str(irmod["alloc_vtcm"]))
-    assert "HexagonBackendAllocateVTCM" in calls
-    assert "HexagonBackendFreeVTCM" in calls
-
-
 @tvm.testing.requires_hexagon
 def test_llvm_options():
     target = tvm.target.hexagon("v66", llvm_options="-hexagon-noopt")
     Zero = tvm.te.compute((10,), lambda _: tvm.tir.const(0, "int32"))
-    s = tvm.te.create_schedule(Zero.op)
-    tvm.build(s, [Zero], target=target, name="zero")
+    mod = tvm.IRModule.from_expr(te.create_prim_func([Zero]))
     # Check that BuildHexagon hasn't crashed because of target attribute
     # type mismatch.
+    tvm.build(mod, target=tvm.target.Target(target, target))
     assert re.search("-hexagon-noopt", str(target))
 
 
diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py
index d629d93d365e..e3ccff49ba1b 100644
--- a/tests/python/codegen/test_target_codegen_llvm.py
+++ b/tests/python/codegen/test_target_codegen_llvm.py
@@ -26,6 +26,7 @@
 import tvm
 import tvm.testing
 from tvm import te
+from tvm import tir
 from tvm.contrib import clang, utils
 from tvm.script import tir as T, ir as I
 from tvm.target.codegen import llvm_get_intrinsic_name, llvm_lookup_intrinsic_id
@@ -85,8 +86,13 @@ def use_llvm_intrinsic(A, C):
     C = tvm.te.extern(
         (1, 1), [A], lambda ins, outs: use_llvm_intrinsic(ins[0], outs[0]), name="C", dtype="int32"
     )
-    s = tvm.te.create_schedule(C.op)
-    f = tvm.build(s, [A, C], target="llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    f = tvm.build(sch.mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
@@ -108,10 +114,13 @@ def test_llvm_large_uintimm():
     value = (1 << 63) + 123
     other = tvm.tir.const(3, "uint64")
     A = te.compute((), lambda: tvm.tir.const(value, "uint64") + other, name="A")
-    s = te.create_schedule(A.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A])
+    sch = tir.Schedule(mod)
 
     def check_llvm():
-        f = tvm.build(s, [A], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.empty((), dtype=A.dtype, device=dev)
@@ -122,24 +131,38 @@ def check_llvm():
 
 
 @tvm.testing.requires_llvm
-def test_llvm_persist_parallel():
+def test_llvm_multi_parallel():
     n = 128
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1, name="B")
     C = te.compute(A.shape, lambda *i: te.sqrt(B(*i)) * 2 + 2, name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=8)
-    xo1, xo2 = s[C].split(xo, nparts=1)
-    s[B].compute_at(s[C], xo1)
-    s[B].parallel(s[B].op.axis[0])
-    s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish")
-    s[C].parallel(xi)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xi, "parallel_stride_pattern")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
+
+    # Get blocks and loops
+    c_block = sch.get_block("C")
+    b_block = sch.get_block("B")
+    c_loop = sch.get_loops(c_block)[0]
+
+    # Split C's loop twice (parallelization is applied further below)
+    xo, xi = sch.split(c_loop, factors=[None, 8])
+    xo1, xo2 = sch.split(xo, factors=[1, None])
+
+    # Move computation of B
+    sch.compute_at(b_block, xo1)
+
+    # Outermost loop around B after compute_at; NOTE(review): likely xo1 itself, confirm
+    b_loop = sch.get_loops(b_block)[0]
+
+    # Apply parallel scheduling
+    sch.parallel(b_loop)
+    sch.parallel(xi)
 
     def check_llvm():
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
@@ -156,12 +179,22 @@ def check_llvm(nn, base):
         n = tvm.runtime.convert(nn)
         A = te.placeholder((n + base), name="A")
         C = te.compute((n,), lambda i: A(nn + base - i - 1), name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
+        # Get block and loop
+        block = sch.get_block("C")
+        loop = sch.get_loops(block)[0]
+
+        # Split and parallelize
+        xo, xi = sch.split(loop, factors=[None, 4])
+        sch.parallel(xo)
+        sch.vectorize(xi)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -178,29 +211,31 @@ def check_llvm(nn, base):
 
 @tvm.testing.requires_llvm
 def test_llvm_vadd_pipeline():
-    def check_llvm(n, lanes):
-        A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes)
-        B = te.compute((n,), lambda i: A[i], name="B")
-        C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], nparts=2)
-        _, xi = s[C].split(xi, factor=2)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
-        s[B].compute_at(s[C], xo)
-        xo, xi = s[B].split(B.op.axis[0], factor=2)
-        s[B].vectorize(xi)
-        # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes)))
-        c = tvm.nd.empty((n,), C.dtype, dev)
-        f(a, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
-
-    check_llvm(64, 2)
-    check_llvm(512, 2)
+    n = te.size_var("n")
+    A = te.placeholder((n,), name="A")
+    B = te.placeholder((n,), name="B")
+    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("C")
+    loop = sch.get_loops(block)[0]
+
+    # Split the loop and vectorize the inner part
+    _, inner = sch.split(loop, factors=[None, 4])
+    sch.vectorize(inner)
+    # Build and verify
+    f = tvm.build(sch.mod, target="llvm")
+    dev = tvm.cpu(0)
+    n = 128
+    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
+    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
+    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
+    f(a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
 
 
 @tvm.testing.requires_llvm
@@ -209,12 +244,22 @@ def check_llvm(nn, base, stride):
         n = tvm.runtime.convert(nn)
         A = te.placeholder((n + base, stride), name="A")
         C = te.compute((n, stride), lambda i, j: A(base + i, j) + 1, name="C")
-        s = te.create_schedule(C.op)
-        xo, xi = s[C].split(C.op.axis[0], factor=4)
-        s[C].parallel(xo)
-        s[C].vectorize(xi)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
+        # Get block and loops
+        block = sch.get_block("C")
+        i_loop, j_loop = sch.get_loops(block)
+
+        # Split and parallelize
+        xo, xi = sch.split(i_loop, factors=[None, 4])
+        sch.parallel(xo)
+        sch.vectorize(xi)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -237,11 +282,14 @@ def test_llvm_temp_space():
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda i: A(i) + 1, name="B")
     C = te.compute(A.shape, lambda i: B(i) + 1, name="C")
-    s = te.create_schedule(C.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, C])
+    sch = tir.Schedule(mod)
 
     def check_llvm():
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         n = nn
@@ -255,36 +303,37 @@ def check_llvm():
 
 @tvm.testing.requires_llvm
 def test_multiple_func():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
+    # Define the computation
+    n = te.size_var("n")
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
-
-    def check_llvm():
-        # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], "llvm")
-        fadd2 = m["fadd2"]
-        fadd1 = m["fadd1"]
+    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Create two functions with different names
+    mod = tvm.IRModule(
+        {
+            "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+            "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+        }
+    )
 
-        dev = tvm.cpu(0)
-        # launch the kernel.
-        n = nn
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd1(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-        fadd2(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
+    # Build and verify
+    f = tvm.build(mod, target="llvm")
+    dev = tvm.cpu(0)
+    n = 10
+    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
+    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
+    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
 
-    check_llvm()
+    # Test both functions
+    f["fadd1"](a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
+    f["fadd2"](a, b, c)
+    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
 
 
 @tvm.testing.requires_llvm
@@ -292,9 +341,13 @@ def test_llvm_condition():
     def check_llvm(n, offset):
         A = te.placeholder((n,), name="A")
         C = te.compute((n,), lambda i: tvm.tir.if_then_else(i >= offset, A[i], 0.0), name="C")
-        s = te.create_schedule(C.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
@@ -312,9 +365,13 @@ def test_llvm_bool():
     def check_llvm(n):
         A = te.placeholder((n,), name="A", dtype="int32")
         C = te.compute((n,), lambda i: A[i].equal(1).astype("float"), name="C")
-        s = te.create_schedule(C.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, C])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, C], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -334,9 +391,13 @@ def check_llvm(n):
         k = te.reduce_axis((0, n), name="k")
         C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C")
         D = te.compute((), lambda: C() + 1)
-        s = te.create_schedule(D.op)
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, scale, D])
+        sch = tir.Schedule(mod)
+
         # build and invoke the kernel.
-        f = tvm.build(s, [A, scale, D], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -358,9 +419,13 @@ def check_llvm(n):
             k = te.reduce_axis((0, n), name="k")
             C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C")
             D = te.compute((), lambda: C() + 1)
-            s = te.create_schedule(D.op)
+
+            # Convert to TIR and create schedule
+            mod = te.create_prim_func([A, scale, D])
+            sch = tir.Schedule(mod)
+
             # build and invoke the kernel.
-            f = tvm.build(s, [A, scale, D], "llvm")
+            f = tvm.build(sch.mod, target="llvm")
             dev = tvm.cpu(0)
             # launch the kernel.
             a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
@@ -378,10 +443,21 @@ def test_alignment():
     n = tvm.runtime.convert(1024)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda i: A[i] * 3, name="B")
-    s = te.create_schedule(B.op)
-    bx, tx = s[B].split(B.op.axis[0], factor=8)
-    s[B].vectorize(tx)
-    f = tvm.build(s, [A, B], "llvm", name="test_alignment")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B]).with_attr("global_symbol", "test_alignment")
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("B")
+    loop = sch.get_loops(block)[0]
+
+    # Split and vectorize
+    _, tx = sch.split(loop, factors=[None, 8])
+    sch.vectorize(tx)
+
+    # Build; the function name comes from the "global_symbol" attribute set above
+    f = tvm.build(sch.mod, target="llvm")
 
     lines = f.get_source().split("\n")
 
@@ -452,8 +528,12 @@ def clipb(x):
             lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j]))),
         )
 
-        s = te.create_schedule([D.op, M.op])
-        f = tvm.build(s, [A, B, D, M], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, D, M])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         # Fill input arrays with values
         A_arr = tvm.nd.empty((end - start + 1,), dtype)
@@ -477,7 +557,7 @@ def _show_info():
             print("dtype: {}".format(dtype))
             print("dividend range: [{}, {}]".format(start, end))
             print("divisor range: [{}, {}]".format(dstart, dend))
-            lowered = tvm.lower(s, [A, B, D, M], simple_mode=True)
+            lowered = tvm.lower(sch.mod, simple_mode=True)
             print("Lowered code:")
             print(lowered)
 
@@ -557,8 +637,12 @@ def check_llvm_reciprocal(n):
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: te.div(1.0, (1e37 * A[i])), name="B")
 
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         a = tvm.nd.array(np.full((n,), 100, "float32"))
         b = tvm.nd.empty((n,), "float32")
@@ -573,8 +657,12 @@ def check_llvm_sigmoid(n):
         A = te.placeholder((n,), name="A")
         B = te.compute((n,), lambda i: te.sigmoid(A[i]), name="B")
 
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        f = tvm.build(sch.mod, target="llvm")
 
         a = tvm.nd.array(np.full((n,), -1000, "float32"))
         b = tvm.nd.empty((n,), "float32")
@@ -593,10 +681,19 @@ def test_dwarf_debug_information():
     A = te.placeholder((n,), name="A")
     B = te.placeholder((n,), name="B")
     C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Get block and loop
+    block = sch.get_block("C")
+    loop = sch.get_loops(block)[0]
+
+    # Split and parallelize
+    xo, xi = sch.split(loop, factors=[None, 4])
+    sch.parallel(xo)
+    sch.vectorize(xi)
 
     def check_llvm_object():
         if tvm.target.codegen.llvm_version_major() < 5:
@@ -604,9 +701,13 @@ def check_llvm_object():
         if tvm.target.codegen.llvm_version_major() > 6:
             return
         # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], "llvm")
+        mod = tvm.IRModule(
+            {
+                "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+                "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+            }
+        )
+        m = tvm.build(mod, target="llvm")
         temp = utils.tempdir()
         o_path = temp.relpath("temp.o")
         m.save(o_path)
@@ -638,9 +739,13 @@ def check_llvm_ir():
         if tvm.target.codegen.llvm_version_major() > 6:
             return
         # build two functions
-        f2 = tvm.lower(s, [A, B, C], name="fadd1")
-        f1 = tvm.lower(s, [A, B, C], name="fadd2")
-        m = tvm.build([f1, f2], target="llvm -mtriple=aarch64-linux-gnu")
+        mod = tvm.IRModule(
+            {
+                "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"),
+                "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"),
+            }
+        )
+        m = tvm.build(mod, target="llvm -mtriple=aarch64-linux-gnu")
         ll = m.get_source("ll")
 
         # On non-Darwin OS, don't explicitly specify DWARF version.
@@ -650,7 +755,7 @@ def check_llvm_ir():
         assert re.search(r"""llvm.dbg.value""", ll)
 
         # Try Darwin, require DWARF-2
-        m = tvm.build([f1, f2], target="llvm -mtriple=x86_64-apple-darwin-macho")
+        m = tvm.build(mod, target="llvm -mtriple=x86_64-apple-darwin-macho")
         ll = m.get_source("ll")
         assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll)
         assert re.search(r"""llvm.dbg.value""", ll)
@@ -664,7 +769,10 @@ def test_llvm_shuffle():
     a = te.placeholder((8,), "int32")
     b = te.placeholder((8,), "int32")
     c = te.compute((8,), lambda x: a[x] + b[7 - x])
-    sch = te.create_schedule(c.op)
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([a, b, c])
+    sch = tir.Schedule(mod)
 
     def my_vectorize():
         def vectorizer(op):
@@ -685,8 +793,8 @@ def _transform(f, *_):
         return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="my_vectorize")
 
     with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, my_vectorize())]}):
-        ir = tvm.lower(sch, [a, b, c], simple_mode=True)
-        module = tvm.build(sch, [a, b, c])
+        ir = tvm.lower(sch.mod, simple_mode=True)
+        module = tvm.build(sch.mod)
         a_ = tvm.nd.array(np.arange(1, 9, dtype="int32"))
         b_ = tvm.nd.array(np.arange(8, 0, -1, dtype="int32"))
         c_ = tvm.nd.array(np.zeros((8,), dtype="int32"))
@@ -727,12 +835,21 @@ def dotest(do_vectorize):
         np.random.seed(122)
         A = te.placeholder((32,), dtype="bfloat16")
         B = te.placeholder((32,), dtype="bfloat16")
-        d = te.compute((32,), lambda x: A[x] + B[x])
-        sch = te.create_schedule(d.op)
+        D = te.compute((32,), lambda x: A[x] + B[x], name="D")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, D])
+        sch = tir.Schedule(mod)
+
+        # Get block and loop
+        block = sch.get_block("D")
+        loop = sch.get_loops(block)[0]
+
+        # Apply vectorization if requested
         if do_vectorize:
-            sch[d].vectorize(d.op.axis[0])
+            sch.vectorize(loop)
 
-        module = tvm.build(sch, [A, B, d])
+        module = tvm.build(sch.mod, target="llvm")
         npa = np.random.rand(32).astype("float32")
         npb = np.random.rand(32).astype("float32")
         va = np_bf16_cast_and_cast_back(npa)
@@ -762,72 +879,6 @@ def test_llvm_crt_static_lib():
     module.save("test.o")
 
 
-def atomic_add(x, y):
-    return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y)
-
-
-@tvm.testing.requires_llvm
-def test_llvm_lower_atomic():
-    def do_atomic_add(A):
-        ib = tvm.tir.ir_builder.create()
-        n = A.shape[0]
-        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
-        one = tvm.tir.const(1, A.dtype)
-        A_ptr = ib.buffer_ptr(A)
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
-            )
-        return ib.get()
-
-    A = tvm.te.placeholder((100,), dtype="int32", name="A")
-    C = tvm.te.extern((100,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32")
-    s = tvm.te.create_schedule(C.op)
-    # This does not work because of pointer type mismatch
-    # TVMError: LLVM module verification failed with the following errors:
-    # Argument value type does not match pointer operand type!
-    # %21 = atomicrmw add i8* %7, i32 1 monotonic
-    # i8
-    # f = tvm.build(s, [A], target="llvm")
-
-
-@tvm.testing.requires_llvm
-@tvm.testing.requires_gpu
-def test_llvm_gpu_lower_atomic():
-    def do_atomic_add(A):
-        ib = tvm.tir.ir_builder.create()
-        n = A.shape[0]
-        atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local")
-        one = tvm.tir.const(1, A.dtype)
-        A_ptr = ib.buffer_ptr(A)
-        nthread_tx = 64
-        with ib.new_scope():
-            nthread_bx = (n + nthread_tx - 1) // nthread_tx
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-            ib.scope_attr(tx, "thread_extent", nthread_tx)
-            ib.scope_attr(bx, "thread_extent", nthread_bx)
-            atomic_add_return[0] = atomic_add(
-                tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one
-            )
-        return ib.get()
-
-    size = 1024
-    # CI uses LLVM 8, which does not support float atomic
-    for dtype in ["int32"]:
-        A = tvm.te.placeholder((size,), dtype=dtype, name="A")
-        C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype)
-        s = tvm.te.create_schedule(C.op)
-        f = tvm.build(s, [A], target="nvptx")
-
-        dev = tvm.cuda()
-        a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), dev)
-        f(a)
-        ref = np.zeros((size,)).astype(A.dtype)
-        ref[0] = size
-        tvm.testing.assert_allclose(a.numpy(), ref, rtol=1e-5)
-
-
 @tvm.testing.requires_llvm
 def test_llvm_order_functions():
     """Check that functions in the LLVM module are ordered alphabetically."""
@@ -850,7 +901,7 @@ def make_call_extern(caller, callee):
         "Kirby": make_call_extern("Kirby", "Fred"),
     }
     mod = tvm.IRModule(functions=functions)
-    ir_text = tvm.build(mod, None, target="llvm").get_source("ll")
+    ir_text = tvm.build(mod, target="llvm").get_source("ll")
     # Skip functions whose names start with _.
     matches = re.findall(r"^define[^@]*@([a-zA-Z][a-zA-Z0-9_]*)", ir_text, re.MULTILINE)
     assert matches == sorted(matches)
@@ -879,13 +930,14 @@ def check_llvm(use_file):
         temp = utils.tempdir()
         ll_path = temp.relpath("temp.ll")
         ll_code = clang.create_llvm(cc_code, output=ll_path)
-        s = te.create_schedule(B.op)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+
         if use_file:
-            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
+            sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_path)
         else:
-            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
+            sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_code)
         # BUILD and invoke the kernel.
-        f = tvm.build(s, [A, B], "llvm")
+        f = tvm.build(sch.mod, target="llvm")
         dev = tvm.cpu(0)
         # launch the kernel.
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
@@ -910,7 +962,7 @@ def test_llvm_scalar_concat():
     # This will crash in LLVM codegen if CodeGenLLVM::CreateVecConcat doesn't convert
     # scalars to single-lane LLVM vectors.
     with tvm.transform.PassContext(config={"tir.disable_assert": True}):
-        m = tvm.build(mod, [x, y, z], target="llvm")
+        m = tvm.build(mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
@@ -925,7 +977,7 @@ def threadpool_nested_parallel_loop(
                 B[i, j] = A[i, j] * 2.0
 
     with pytest.raises(tvm.TVMError) as e:
-        tvm.build({"llvm": tvm.IRModule.from_expr(threadpool_nested_parallel_loop)})
+        tvm.build(tvm.IRModule.from_expr(threadpool_nested_parallel_loop), target="llvm")
     msg = str(e)
     assert msg.find("Nested parallel loop is not supported") != -1
 
@@ -939,13 +991,16 @@ def test_llvm_target_attributes():
     A = te.placeholder((n,), name="A", dtype="float32")
     B = te.compute((n,), lambda i: A[i], name="B")
     C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], nparts=2)
-    s[C].parallel(xo)
+
+    sch = tvm.tir.Schedule(
+        te.create_prim_func([A, B, C, n]).with_attr("global_symbol", "test_func")
+    )
+    xo, xi = sch.split(sch.get_loops("C")[0], factors=[2, None])
+    sch.parallel(xo)
 
     target_llvm = "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake -mattr=+avx512f"
     target = tvm.target.Target(target_llvm, host=target_llvm)
-    module = tvm.build(s, [A, B, C, n], target=target, name="test_func")
+    module = tvm.build(sch.mod, target=target)
 
     llvm_ir = module.get_source()
     llvm_ir_lines = llvm_ir.split("\n")
@@ -996,7 +1051,7 @@ def tir_assume_func(A: T.Buffer((4, 4), "int32"), B: T.Buffer((14,), "int32")):
     mod = tvm.IRModule.from_expr(tir_assume_func)
     inp = te.placeholder((4, 4), name="A", dtype="int32")
     out = te.placeholder((14,), name="B", dtype="int32")
-    m = tvm.build(mod, [inp, out], target="llvm")
+    m = tvm.build(mod, target="llvm")
 
 
 @tvm.testing.requires_llvm
diff --git a/tests/python/codegen/test_target_codegen_opencl.py b/tests/python/codegen/test_target_codegen_opencl.py
index 079553665ffb..90af959472c5 100644
--- a/tests/python/codegen/test_target_codegen_opencl.py
+++ b/tests/python/codegen/test_target_codegen_opencl.py
@@ -135,9 +135,12 @@ def test_opencl_erf():
     def check_erf(dev, n, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         C = te.compute(A.shape, lambda *i: te.erf(A(*i)), name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
-        fun = tvm.build(s, [A, C], target)
+        func = te.create_prim_func([A, C])
+        sch = tvm.tir.Schedule(func)
+        (x,) = sch.get_loops(sch.get_block("C"))
+        sch.bind(x, "threadIdx.x")
+        fun = tvm.build(sch.mod, target=target)
+
         source_str = fun.imported_modules[0].get_source()
         matches = re.findall("erf", source_str)
         error_matches = re.findall("erff", source_str)
diff --git a/tests/python/codegen/test_target_codegen_rocm.py b/tests/python/codegen/test_target_codegen_rocm.py
index a0990c330f03..4c7592034ef0 100644
--- a/tests/python/codegen/test_target_codegen_rocm.py
+++ b/tests/python/codegen/test_target_codegen_rocm.py
@@ -18,41 +18,8 @@
 import tvm.testing
 from tvm import te
 import numpy as np
-import unittest
 from tvm.script import tir as T
 
-tx = te.thread_axis("threadIdx.x")
-ty = te.thread_axis("threadIdx.y")
-bx = te.thread_axis("blockIdx.x")
-by = te.thread_axis("blockIdx.y")
-
-
-@tvm.testing.requires_rocm
-def test_rocm_cross_thread_reduction():
-    # based on the reduction tutorial
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B].bind(xo, bx)
-    s[B].bind(xi, ty)
-    s[B].bind(s[B].op.reduce_axis[0], tx)
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    s[B].set_store_predicate(tx.var.equal(0))
-    frocm = tvm.build(s, [A, B], "rocm")
-
-    nn = 128
-    dev = tvm.rocm(0)
-    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev)
-    frocm(a, b)
-    tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4)
-
 
 @tvm.testing.requires_rocm
 def test_rocm_inf_nan():
@@ -60,9 +27,11 @@ def check_inf_nan(dev, n, value, dtype):
         A = te.placeholder((n,), name="A", dtype=dtype)
         inf_value = tvm.tir.const(value, dtype=dtype)
         C = te.compute((n,), lambda i: inf_value, name="C")
-        s = te.create_schedule(C.op)
-        s[C].bind(s[C].op.axis[0], tx)
-        fun = tvm.build(s, [A, C], "rocm")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, C]))
+        xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 128])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="rocm")
         a = tvm.nd.empty((n,), A.dtype, dev)
         c = tvm.nd.empty((n,), A.dtype, dev)
         # Only need to test compiling here
@@ -78,19 +47,6 @@ def check_inf_nan(dev, n, value, dtype):
     check_inf_nan(dev, 1, float("nan"), "float64")
 
 
-@tvm.testing.requires_rocm
-def test_rocm_reduction_binding():
-    k = te.reduce_axis((0, 32), "k")
-    A = te.placeholder((96, 32), name="A")
-    B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].reorder(B.op.reduce_axis[0], B.op.axis[0])
-
-    mo, _ = s[B].split(B.op.axis[0], 32)
-    s[B].bind(mo, bx)
-
-
 @tvm.testing.requires_rocm
 def test_rocm_copy():
     def check_rocm(dtype, n):
@@ -116,11 +72,12 @@ def test_rocm_vectorize_add():
     def check_rocm(dtype, n, lanes):
         A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
         B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-        s = te.create_schedule(B.op)
-        xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-        s[B].bind(xo, bx)
-        s[B].bind(xi, tx)
-        fun = tvm.build(s, [A, B], "rocm")
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4])
+        sch.bind(xo, "blockIdx.x")
+        sch.bind(xi, "threadIdx.x")
+        fun = tvm.build(sch.mod, target="rocm")
+
         dev = tvm.rocm(0)
         a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, dev)
@@ -179,13 +136,3 @@ def func(
     b = tvm.nd.array(np.zeros((4,)).astype("float32"), dev)
     mod(a, b)
     tvm.testing.assert_allclose(b.numpy(), np.exp2(a.numpy()))
-
-
-if __name__ == "__main__":
-    test_rocm_cross_thread_reduction()
-    test_rocm_inf_nan()
-    test_rocm_reduction_binding()
-    test_rocm_copy()
-    test_rocm_vectorize_add()
-    test_rocm_warp_shuffle()
-    test_rocm_vectorized_exp()
diff --git a/tests/python/codegen/test_target_codegen_vulkan.py b/tests/python/codegen/test_target_codegen_vulkan.py
index 9d00f047cb69..0e1aa1a0403b 100644
--- a/tests/python/codegen/test_target_codegen_vulkan.py
+++ b/tests/python/codegen/test_target_codegen_vulkan.py
@@ -26,7 +26,7 @@
 
 import tvm
 import tvm.testing
-from tvm import te
+from tvm import te, tir
 from tvm.topi.math import cast
 from tvm.script import tir as T, ir as I
 from tvm.tir import TensorIntrin, IntImm, Cast, Schedule
@@ -60,9 +60,10 @@
         ]
     )
 )
-def test_vector_comparison(target, dtype):
-    n = (1024,)
-    A = te.placeholder(n, dtype=dtype, name="A")
+def test_vector_comparison(target, dev, dtype):
+    target = tvm.target.Target(target)
+    n = 1024
+    A = te.placeholder((n,), dtype=dtype, name="A")
     B = te.compute(
         A.shape,
         lambda i: tvm.tir.Select(
@@ -70,14 +71,18 @@ def test_vector_comparison(target, dtype):
         ),
         name="B",
     )
-    s = te.create_schedule(B.op)
 
-    (bx, tx) = s[B].split(s[B].op.axis[0], factor=128)
-    (tx, vx) = s[B].split(tx, factor=4)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(vx)
-    f = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+    (bx, tx) = sch.split(sch.get_loops("B")[0], factors=[None, 128])
+    (tx, vx) = sch.split(tx, factors=[None, 4])
+    sch.bind(bx, "blockIdx.x")
+    sch.bind(tx, "threadIdx.x")
+    sch.vectorize(vx)
+
+    # Build
+    f = tvm.build(sch.mod, target=target)
 
     # Verify we generate the boolx4 type declaration and the OpSelect
     # v4{float,half,int} instruction
@@ -102,133 +107,48 @@ def test_array_copy(dev, dtype, fuzz_seed):
 
 @tvm.testing.exclude_targets("llvm")
 def test_array_vectorize_add(target, dev, dtype):
+    target = tvm.target.Target(target)
     arr_size = 64
     lanes = 2
-    if "opencl" in target and dtype == "float16":
-        pytest.xfail("Opencl target does not support float16")
 
-    num_thread = 8
+    if "opencl" in str(target) and dtype == "float16":
+        pytest.xfail("Opencl target does not support float16")
 
     A = te.placeholder((arr_size,), name="A", dtype="%sx%d" % (dtype, lanes))
-    B = te.compute((arr_size,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    fun = tvm.build(s, [A, B], target)
+    B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
+
+    sch = tir.Schedule(te.create_prim_func([A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    f = tvm.build(sch.mod, target=target)
+
     a = tvm.nd.empty((arr_size,), A.dtype, dev).copyfrom(np.random.uniform(size=(arr_size, lanes)))
     c = tvm.nd.empty((arr_size,), B.dtype, dev)
-    fun(a, c)
+    f(a, c)
     tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
 
 
-@tvm.testing.parametrize_targets("vulkan")
-@pytest.mark.skip("Flaky, https://github.com/apache/tvm/issues/10779")
-def test_vulkan_stress(target, dev):
-    """
-    Launch a randomized test with multiple kernels per stream, multiple uses of
-    kernels per stream, over multiple threads.
-    """
-
-    n = 1024
-    num_thread = 64
-
-    def run_stress():
-        def worker():
-            A = te.placeholder((n,), name="A", dtype="float32")
-            B = te.placeholder((n,), name="B", dtype="float32")
-            functions = [
-                (
-                    lambda: te.compute((n,), lambda i: 2 * A[i] + 3 * B[i]),
-                    lambda a, b: 2 * a + 3 * b,
-                ),
-                (lambda: te.compute((n,), lambda i: A[i] + B[i]), lambda a, b: a + b),
-                (lambda: te.compute((n,), lambda i: A[i] + 2 * B[i]), lambda a, b: a + 2 * b),
-            ]
-
-            def build_f(f_ref):
-                (C_f, ref) = f_ref
-                C = C_f()
-                s = te.create_schedule(C.op)
-                xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
-                s[C].bind(xo, te.thread_axis("blockIdx.x"))
-                s[C].bind(xi, te.thread_axis("threadIdx.x"))
-                fun = tvm.build(s, [A, B, C], target)
-                return (fun, ref)
-
-            fs = [
-                build_f(random.choice(functions)) for _ in range(np.random.randint(low=1, high=10))
-            ]
-            a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
-            b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,)))
-            cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs]
-            for (f, _), c in zip(fs, cs):
-                f(a, b, c)
-
-            for (_, ref), c in zip(fs, cs):
-                tvm.testing.assert_allclose(c.numpy(), ref(a.numpy(), b.numpy()))
-
-        ts = [threading.Thread(target=worker) for _ in range(np.random.randint(1, 10))]
-        for t in ts:
-            t.start()
-        for t in ts:
-            t.join()
-
-    run_stress()
-
-
 @tvm.testing.exclude_targets("llvm")
 def test_vulkan_bool_load(target, dev):
-    arr_size = 1024
-
     target = tvm.target.Target(target)
-    if target.kind.name == "vulkan":
-        supports_int8_buffer = target.attrs.get("supports_int8", False) and target.attrs.get(
-            "supports_8bit_buffer", False
-        )
-        if not supports_int8_buffer:
-            pytest.xfail(
-                "Vulkan target does not support int8 buffer access, used to transfer booleans"
-            )
-
-    def do_copy(A, B, n):
-        ib = tvm.tir.ir_builder.create()
-        A = ib.buffer_ptr(A)
-        B = ib.buffer_ptr(B)
-
-        tx = te.thread_axis("threadIdx.x")
-        bx = te.thread_axis("blockIdx.x")
-
-        max_threads = 32
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < n):
-            B[tid] = cast(A[tid], "int32")
-
-        return ib.get()
-
+    arr_size = 1024
     A = te.placeholder((arr_size,), name="A", dtype="bool")
-    B = te.placeholder((arr_size,), name="B", dtype="int32")
+    B = te.compute(A.shape, lambda i: A[i].astype("int32"), name="B")
 
-    B = te.extern(
-        A.shape,
-        [A],
-        lambda ins, outs: do_copy(ins[0], outs[0], arr_size),
-        name="bool_copy_ir",
-        dtype="int32",
-    )
-    s = te.create_schedule(B.op)
+    sch = tir.Schedule(te.create_prim_func([A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 128])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
 
-    with tvm.transform.PassContext(opt_level=3):
-        func = tvm.build(s, [A, B], target)
+    # Build
+    f = tvm.build(sch.mod, target=target)
 
     a_np = np.random.uniform(size=arr_size) > 0.5
     b_np = np.zeros((arr_size,), dtype="int32")
     a = tvm.nd.array(a_np, dev)
     b = tvm.nd.array(b_np, dev)
-    func(a, b)
+    f(a, b)
     ref = a_np.astype(np.int32)
     tvm.testing.assert_allclose(b.numpy(), ref)
 
@@ -270,11 +190,11 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para
     A = te.placeholder((n,), name="A", dtype=dtype)
     B = te.compute(A.shape, lambda i: scalar_sum + A[i], name="B")
 
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B].bind(xi, te.thread_axis("threadIdx.x"))
-    f_add = tvm.build(s, scalars + [A, B], target)
+    sch = tvm.tir.Schedule(te.create_prim_func(scalars + [A, B]))
+    xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 64])
+    sch.bind(xo, "blockIdx.x")
+    sch.bind(xi, "threadIdx.x")
+    f_add = tvm.build(sch.mod, target=target)
 
     n = 1024
     scalars = np.array([1 for _ in scalars]).astype(dtype)
@@ -287,6 +207,9 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para
 
 def test_vulkan_while_if(target, dev):
     target = tvm.target.Target(target)
+    n = 1
+    dtype = "int32"
+    A = te.placeholder((n,), name="A", dtype=dtype)
 
     def do_compute(A, B, n):
         ib = tvm.tir.ir_builder.create()
@@ -300,9 +223,6 @@ def do_compute(A, B, n):
         iterations[0] = 0
         B[0] = 0
 
-        # WhileNode's condition is re-evaluated every loop.  The
-        # if_then_else block introduces additional labels/blocks that
-        # must be kept separate from the WhileNode's block.
         loop_condition = iterations[0] < tvm.tir.if_then_else(A[0] > 0, 10, 20)
         with ib.while_loop(loop_condition):
             iterations[0] += 1
@@ -310,21 +230,19 @@ def do_compute(A, B, n):
 
         return ib.get()
 
-    n = 1
-    dtype = "int32"
-    A = te.placeholder((n,), name="A", dtype=dtype)
-
     B = te.extern(
         A.shape,
         [A],
         lambda ins, outs: do_compute(ins[0], outs[0], n),
         dtype=dtype,
     )
-    s = te.create_schedule(B.op)
 
-    # Point of failure would be here, at tvm.build.
-    with tvm.transform.PassContext(opt_level=3):
-        func = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+
+    # Build
+    func = tvm.build(sch.mod, target=target)
 
     a = tvm.nd.array(np.array([5], dtype=A.dtype), dev)
     b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev)
@@ -339,52 +257,40 @@ def do_compute(A, B, n):
 
 @tvm.testing.exclude_targets("llvm")
 def test_vulkan_local_threadidx(target, dev):
-    # To access the thread index, the vulkan runtime accesses a global
-    # array of thread indices, storing the result in a local variable.
-    # In CUDA, these are the built-in threadIdx.x variables, which are
-    # globally accessible.  In vulkan, these local variables must be
-    # defined inside a function, but are hoisted up to the function
-    # header to mimic the global CUDA semantics.  Before this
-    # hoisting, this test could trigger spvValidate errors for
-    # potentially undeclared variables.
+    target = tvm.target.Target(target)
+    n = 32
+    A = te.placeholder((n,), name="A", dtype="int32")
 
     def do_compute(A, B, n):
         ib = tvm.tir.ir_builder.create()
         A = ib.buffer_ptr(A)
         B = ib.buffer_ptr(B)
 
-        # One single declaration of te.thread_axis.
         tx = te.thread_axis("threadIdx.x")
 
         with ib.for_range(0, 1):
-            # Used inside a for-loop scope, defines local thread_id
-            # variable.
             ib.scope_attr(tx, "thread_extent", 16)
             B[tx + 0] = A[tx + 0]
 
         with ib.for_range(0, 1):
-            # Used in next scope.  If local variable defined at point
-            # of use instead of function header, will fail spvValidate
-            # for access of out-of-scope local variable.
             ib.scope_attr(tx, "thread_extent", 16)
             B[tx + 16] = A[tx + 16]
 
         return ib.get()
 
-    n = te.var("n")
-    A = te.placeholder((n,), name="A", dtype="int32")
-    B = te.placeholder((n,), name="B", dtype="int32")
-
     B = te.extern(
         A.shape,
         [A],
         lambda ins, outs: do_compute(ins[0], outs[0], n),
         dtype="int32",
     )
-    s = te.create_schedule(B.op)
 
-    # Expected failure occurs at build step.
-    func = tvm.build(s, [A, B], target)
+    # Create IRModule
+    mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]))
+    sch = tir.Schedule(mod)
+
+    # Build
+    func = tvm.build(sch.mod, target=target)
 
     n = 32
     a_np = np.arange(n).astype(dtype=A.dtype)
@@ -473,9 +379,8 @@ def do_compute(ins, outs):
             return ib.get()
 
         B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
-        s = te.create_schedule(B.op)
 
-        return tvm.lower(s, [A, R, B])
+        return tvm.IRModule.from_expr(te.create_prim_func([A, R, B]))
 
     def test_ramp_broadcast_index(self, target, dev, mod, ref_data):
         f = tvm.build(mod, target=target)
@@ -488,36 +393,6 @@ def test_ramp_broadcast_index(self, target, dev, mod, ref_data):
         tvm.testing.assert_allclose(b.numpy(), b_np)
 
 
-@tvm.testing.parametrize_targets("vulkan -max_shared_memory_per_block=16384")
-def test_shared_mem_alloc(target, dev):
-    alloc_nbytes = 16384 * 2
-
-    def do_compute(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-        out = ib.buffer_ptr(outs[0])
-
-        ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)
-
-        array = ib.allocate("int32", (alloc_nbytes,), name="array", scope="shared")
-        array[0] = 0
-        out[0] = array[0]
-
-        return ib.get()
-
-    Out = te.extern(
-        shape=(1,),
-        inputs=[],
-        fcompute=do_compute,
-        dtype="int32",
-    )
-    s = te.create_schedule(Out.op)
-
-    # Codegen should raise error when allocating more memory than the
-    # target supports.
-    with pytest.raises(tvm.TVMError):
-        tvm.build(s, [Out], target)
-
-
 def test_negative_operand_divmod(target, dev):
     """Test handling of negative offsets to floormod/floordiv
 
diff --git a/tests/python/codegen/test_target_codegen_x86.py b/tests/python/codegen/test_target_codegen_x86.py
index a276940050b1..f433964f7f5d 100644
--- a/tests/python/codegen/test_target_codegen_x86.py
+++ b/tests/python/codegen/test_target_codegen_x86.py
@@ -38,9 +38,9 @@ def fp16_to_fp32(target, width, match=None, not_match=None):
         n = tvm.runtime.convert(elements)
         A = te.placeholder((n, width), dtype="float16", name="A")
         B = te.compute(A.shape, lambda *i: A(*i).astype("float32"), name="B")
-        s = te.create_schedule(B.op)
-        s[B].vectorize(s[B].op.axis[1])
-        f = tvm.build(s, [A, B], target)
+        sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
+        sch.vectorize(sch.get_loops("B")[1])
+        f = tvm.build(sch.mod, target=target)
 
         assembly = f.get_source("asm").splitlines()
         if match:
diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py
index 3c90aefeb67a..b8851e685b13 100644
--- a/tests/python/contrib/test_cblas.py
+++ b/tests/python/contrib/test_cblas.py
@@ -39,7 +39,6 @@ def verify_matmul_add(
     final_result = te.compute(
         matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, matrix_bias, transa, transb):
         if transa:
@@ -64,7 +63,12 @@ def verify(target="llvm"):
             return
         dev = tvm.cpu(0)
         name = "test_matmul_add"
-        f = tvm.build(s, [input1_data, input2_data, final_result, bias], target, name=name)
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result, bias]).with_attr(
+                "global_symbol", name
+            ),
+            target=target,
+        )
         if target == "c":
             f = compiling(f, name)
         matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev)
@@ -126,7 +130,6 @@ def verify_quantized_matmul_add(matrix_m, matrix_l, matrix_n, transa=False, tran
     final_result = te.compute(
         matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, matrix_bias, transa, transb):
         if transa:
@@ -143,7 +146,9 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [input1_data, input2_data, final_result, bias], target)
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result, bias]), target=target
+        )
         matrix_input1 = tvm.nd.array(
             np.random.randint(low=0, high=50, size=ashape).astype(input1_data.dtype), dev
         )
@@ -201,7 +206,6 @@ def verify_batch_matmul(
     final_result = te.compute(
         matmul_result.shape, lambda k, i, j: matmul_result[k, i, j], name="final_result"
     )
-    s = te.create_schedule(final_result.op)
 
     def get_numpy(a, b, transa, transb):
         if transa:
@@ -226,7 +230,12 @@ def verify(target="llvm"):
             return
         dev = tvm.cpu(0)
         name = "test_batch_matmul"
-        f = tvm.build(s, [input1_data, input2_data, final_result], target, name=name)
+        f = tvm.build(
+            te.create_prim_func([input1_data, input2_data, final_result]).with_attr(
+                "global_symbol", name
+            ),
+            target=target,
+        )
         if target == "c":
             f = compiling(f, name)
         matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev)
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
index 4e65f79c518e..70277cb0ca0a 100644
--- a/tests/python/contrib/test_dlpack.py
+++ b/tests/python/contrib/test_dlpack.py
@@ -49,10 +49,9 @@ def verify_torch_dlpack():
 
         k = te.reduce_axis((0, n), name="k")
         ZZ = te.compute((n, n), lambda i, j: te.sum(XX[i, k] * YY[k, j], axis=k))
-        s = te.create_schedule(ZZ.op)
         # No need to speficy target_host if it's llvm
         # Otherwise you will need to specify the target and target_host
-        f = tvm.build(s, [XX, YY, ZZ], name="f")
+        f = tvm.build(te.create_prim_func([XX, YY, ZZ]))
 
         f_pytorch = to_pytorch_func(f)
         zz2 = torch.empty(137, 137)
diff --git a/tests/python/contrib/test_gemm_acc16.py b/tests/python/contrib/test_gemm_acc16.py
deleted file mode 100644
index 18e15098a07e..000000000000
--- a/tests/python/contrib/test_gemm_acc16.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
-import tvm
-from tvm import te
-import numpy as np
-from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16
-
-
-def benchmark_fc_int8_acc16():
-    m = 128
-    n = 128
-    k = 128
-
-    X = te.placeholder((m, k), name="X", dtype="uint8")
-    W = te.placeholder((n, k), name="W", dtype="int8")
-
-    peak = 512 / 16 * 2 * 2 * 2
-    gops_per_mm = 2 * n * m * k
-    print("Peak {} Gops/s \n".format(peak))
-
-    def verify(target="llvm -mcpu=skylake-avx512"):
-        if not tvm.runtime.enabled(target):
-            print("skip because %s is not enabled..." % target)
-            return
-
-        dev = tvm.device(target, 0)
-        X = te.placeholder((m, k), name="X", dtype="uint8")
-        W = te.placeholder((n, k), name="W", dtype="int8")
-        pc = dot_16x1x16_uint8_int8_int16()
-        ak = te.reduce_axis((0, k), name="k")
-
-        packedW = te.placeholder((n // 128, 128 * (k // 2), 2), name="packedW", dtype="int8")
-        t_fc = te.compute(
-            (m, n),
-            lambda i, j: te.sum(
-                X[i, ak].astype("int16")
-                * packedW[j // 128, (ak // 2) * 128 + j % 128, ak % 2].astype("int16"),
-                axis=ak,
-            ),
-            name="F",
-        )
-
-        t_sch = te.create_schedule(t_fc.op)
-        a_x, a_y = t_fc.op.axis
-        (a_k,) = t_fc.op.reduce_axis
-
-        a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128)
-        a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2)
-
-        a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128)
-        a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32)
-        t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki)
-
-        t_sch[t_fc].tensorize(a_yi, pc)
-        # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True))
-        t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
-        t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10)
-
-        # generate the plain data
-        a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
-        b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
-
-        packW = np.random.uniform(1, 10, size=(n // 128, 128 * (k // 2), 2)).astype("int8")
-        # This occurs in pre_compute stage
-        for r_idx in range(n // 128):
-            for s_idx in range(128 * (k // 2)):
-                for t_idx in range(2):
-                    packW[r_idx][s_idx][t_idx] = b_[r_idx * 128 + s_idx % 128][
-                        s_idx // 128 * 2 + t_idx
-                    ]
-
-        x = tvm.nd.array(a_, dev)
-        w = tvm.nd.array(packW, dev)
-        y = tvm.nd.array(np.zeros((m, n), dtype="int16"), dev)
-
-        result = t_evaluator(x, w, y)
-        gops_per_sec = gops_per_mm / result.mean / 1e9
-        tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=1e-5)
-        print(
-            "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}.".format(
-                result.mean * 1000, gops_per_sec, gops_per_sec / peak
-            )
-        )
-        # t_func.export_library("gemm_tensorize.o")
-
-    verify()
-
-
-if __name__ == "__main__":
-    benchmark_fc_int8_acc16()
diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py
deleted file mode 100644
index 2e15d38612ce..000000000000
--- a/tests/python/contrib/test_gemm_acc32_vnni.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy as np
-from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32
-
-
-def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake"):
-    X = te.placeholder((m, k), name="X", dtype="uint8")
-    # W = te.placeholder((n, k), name="W", dtype="int8")
-
-    if not tvm.testing.device_enabled(target):
-        print("skip because %s is not enabled..." % target)
-        return
-
-    dev = tvm.device(target, 0)
-    # workaround for Target.current()
-    with tvm.target.Target(target) as target:
-        pc = dot_16x1x16_uint8_int8_int32()
-
-    ak = te.reduce_axis((0, k), name="k")
-    packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8")
-
-    t_fc = te.compute(
-        (m, n),
-        lambda i, j: te.sum(
-            X[i, ak].astype("int32")
-            * packedW[
-                tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4
-            ].astype("int32"),
-            axis=ak,
-        ),
-        name="F",
-    )
-    t_sch = te.create_schedule(t_fc.op)
-    a_x, a_y = t_fc.op.axis
-    (a_k,) = t_fc.op.reduce_axis
-
-    a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16)
-    a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32)
-    a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4)
-    a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4)
-    t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki)
-
-    t_sch[t_fc].unroll(a_koi)
-    t_sch[t_fc].tensorize(a_yi, pc)
-
-    t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
-    t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10)
-
-    # generate the plain data
-    a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
-    b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
-
-    packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8")
-    # This occurs in pre_compute stage
-    for r_idx in range(n // 16):
-        for s_idx in range(16 * (k // 4)):
-            for t_idx in range(4):
-                packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx]
-
-    x = tvm.nd.array(a_, dev)
-    w = tvm.nd.array(packW, dev)
-    y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev)
-    result = t_evaluator(x, w, y)
-
-    peak = 280
-    print("Peak {} Gops/s".format(peak))
-    # memory_ops = m * k + n * k + 2 * m * n
-    gops_per_mm = 2 * m * n * k
-
-    gops_per_sec = gops_per_mm / result.mean / 1e9
-    # verify the correctness
-    tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0)
-    print(
-        "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format(
-            result.mean * 1000, gops_per_sec, gops_per_sec / peak
-        )
-    )
-    # t_func.export_library("tensorize_acc32.o")
-
-
-@tvm.testing.requires_x86_vnni
-def test_fc_int8_acc32_vnni():
-    # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target
-    # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the
-    # test, we should use cascadelake setting.
-    verify_fc_int8_acc32()
-
-
-@tvm.testing.requires_x86_avx512
-def test_fc_int8_acc32_avx512():
-    verify_fc_int8_acc32(target="llvm -mcpu=skylake-avx512")
-
-
-if __name__ == "__main__":
-    test_fc_int8_acc32_vnni()
-    test_fc_int8_acc32_avx512()
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py
deleted file mode 100644
index 07f6c2613dbc..000000000000
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Hexagon contrib tests for blocked conv2d """
-
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from ..infrastructure import (
-    build_and_run,
-    conv2d_compute,
-    conv2d_verify,
-    get_block_shape,
-    get_packed_filter_shape,
-    get_packed_shape,
-)
-
-
-def conv2d_nhwc8h8w32c(
-    shape_input,
-    pad,
-    stride,
-    dilation,
-    shape_filter,
-    k_split_factor,
-    h_split_factor,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Conv2d wherein the input activation is defined by its
-    logical NHWC layout.  The filter is provided in its physical
-    packed layout (oihw8i32o4i).  The input is padded and then packed
-    into its physical packed layout (nhwc8h8w32c).  The resulting
-    computation is in the same physical packed layout (nhwc8h8w32c).
-    """
-
-    # nhwc layout
-    logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input")
-
-    # oihw8i32o4i layout
-    filt_packed = te.placeholder(shape_filter, dtype=dtype, name="packed_filter")
-
-    block_h, block_w, block_c = get_block_shape()
-
-    # Calculate padded input
-    _, height, width, _ = shape_input
-    pad_h = (block_h - ((height + pad[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + pad[3]) % block_w)) % block_w
-    padded_input = topi.nn.pad(
-        logical_input,
-        [0, pad[0], pad[2], 0],
-        [0, pad_h, pad_w, 0],
-        pad_value=0,
-        name="padded_input",
-    )
-
-    # Calculate packed input
-    packed_shape = get_packed_shape(padded_input.shape)
-    packed_input = te.compute(
-        packed_shape,
-        lambda n, ho, wo, co, hi, wi, ci: padded_input[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-        name="packed_input",
-    )
-
-    output_shape, compute = conv2d_compute(packed_input, filt_packed, pad, stride, dilation)
-    packed_output = te.compute(output_shape, compute, name="packed_output")
-    s = te.create_schedule(packed_output.op)
-
-    # Ensure the padding and array packing is performed inline
-    s[padded_input].compute_inline()
-    s[packed_input].compute_inline()
-
-    # cache reads and writes
-    cached_input = s.cache_read(packed_input, storage_scope, [packed_output])
-    cached_filt = s.cache_read(filt_packed, storage_scope, [packed_output])
-    cached_output = s.cache_write(packed_output, storage_scope)
-
-    # cache write schedule
-    batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis
-    koo, koi = s[packed_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor)
-    s[packed_output].reorder(batch, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner)
-    s[cached_output].compute_at(s[packed_output], hoo)
-
-    # compute schedule
-    batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[cached_output].op.axis
-    _, _, reduce_c = s[cached_output].op.reduce_axis
-    rco, rci = s[cached_output].split(reduce_c, factor=block_c)
-    koo, koi = s[cached_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[cached_output].split(h_outer, factor=h_split_factor)
-    s[cached_output].reorder(
-        batch, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci
-    )
-    s[cached_input].compute_at(s[cached_output], hoo)
-    s[cached_filt].compute_at(s[cached_output], hoo)
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            input_buffer = tvm.tir.decl_buffer(
-                packed_shape, name="Xb", dtype=dtype, scope=storage_scope
-            )
-            output_buffer = tvm.tir.decl_buffer(
-                output_shape, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {logical_input: input_buffer, packed_output: output_buffer}
-
-    return (s, [logical_input, filt_packed, packed_output], binds)
-
-
-class BaseConv2d:
-    """Base class for conv2d tests"""
-
-    # input
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(64)
-    in_channel = tvm.testing.parameter(64)
-    # conv2d
-    pad = tvm.testing.parameter(0)
-    stride = tvm.testing.parameter(1)
-    kernel_size = tvm.testing.parameter(1, 3)
-    out_channel = tvm.testing.parameter(128)
-    # schedule params
-    k_split_factor = tvm.testing.parameter(1, 2)
-    h_split_factor = tvm.testing.parameter(1, 2)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestConv2dPackedFilter(BaseConv2d):
-    """Conv2d packed filter test class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines")
-    def test_conv2d(
-        self,
-        batch,
-        in_size,
-        in_channel,
-        pad,
-        stride,
-        kernel_size,
-        out_channel,
-        k_split_factor,
-        h_split_factor,
-        dtype,
-        target,
-    ):
-        """conv2d test"""
-        # TODO: no support for dilation
-        dilation = 1
-
-        shape_input = [batch, in_size, in_size, in_channel]
-        shape_filter_oihw = [out_channel, in_channel, kernel_size, kernel_size]
-        shape_filter_oihw8i32o4i = get_packed_filter_shape(shape_filter_oihw)
-
-        inputs = [
-            np.random.uniform(0, 255, size=shape_input).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter_oihw8i32o4i).astype(dtype),
-        ]
-        np_filter = (
-            inputs[1]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
-        output = build_and_run(
-            inputs,
-            conv2d_nhwc8h8w32c,
-            target,
-            target,
-            shape_input=shape_input,
-            pad=(pad, pad, pad, pad),
-            stride=(stride, stride),
-            dilation=(dilation, dilation),
-            shape_filter=shape_filter_oihw8i32o4i,
-            k_split_factor=k_split_factor,
-            h_split_factor=h_split_factor,
-            dtype=dtype,
-        )
-
-        conv2d_verify(output, ref_output, dtype)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py
deleted file mode 100644
index fa770c9be313..000000000000
--- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" back-to-back conv2d Hexagon test for stripe scheduling """
-
-
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from ..infrastructure import (
-    build_and_run,
-    conv2d_compute,
-    conv2d_verify,
-    get_block_shape,
-    get_packed_filter_shape,
-    get_packed_shape,
-)
-
-
-def conv2dconv2d_nhwc8h8w32c(
-    shape_input,
-    pad1,
-    stride1,
-    dilation1,
-    shape_filter1,
-    pad2,
-    stride2,
-    dilation2,
-    shape_filter2,
-    k_split_factor,
-    h_split_factor,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Conv2d -> Conv2d wherein the input activation is defined by its
-    logical NHWC layout.  The filter is provided in its physical
-    packed layout (oihw8i32o4i).  The input is padded and then packed
-    into its physical packed layout (nhwc8h8w32c).  The resulting
-    computation is in the same physical packed layout (nhwc8h8w32c).
-    """
-
-    # nhwc layout
-    logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input")
-
-    # oihw8i32o4i layout
-    filt_packed1 = te.placeholder(shape_filter1, dtype=dtype, name="packed_filter1")
-    filt_packed2 = te.placeholder(shape_filter2, dtype=dtype, name="packed_filter2")
-
-    block_h, block_w, block_c = get_block_shape()
-
-    # Calculate padded input
-    _, height, width, _ = shape_input
-    pad_h = (block_h - ((height + pad1[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + pad1[3]) % block_w)) % block_w
-    padded_input = topi.nn.pad(
-        logical_input,
-        [0, pad1[0], pad1[2], 0],
-        [0, pad_h, pad_w, 0],
-        pad_value=0,
-        name="padded_input",
-    )
-
-    # Calculate packed input
-    packed_shape = get_packed_shape(padded_input.shape)
-    packed_input = te.compute(
-        packed_shape,
-        lambda n, ho, wo, co, hi, wi, ci: padded_input[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-        name="packed_input",
-    )
-
-    output_shape1, compute1 = conv2d_compute(packed_input, filt_packed1, pad1, stride1, dilation1)
-    temp_output = te.compute(output_shape1, compute1, name="temp_output")
-
-    output_shape2, compute2 = conv2d_compute(temp_output, filt_packed2, pad2, stride2, dilation2)
-    packed_output = te.compute(output_shape2, compute2, name="packed_output")
-    s = te.create_schedule(packed_output.op)
-
-    # Ensure the padding and array packing is performed inline
-    s[padded_input].compute_inline()
-    s[packed_input].compute_inline()
-
-    # cache reads and writes
-    packed_input_cached = s.cache_read(packed_input, storage_scope, [temp_output])
-    filt_packed1_cached = s.cache_read(filt_packed1, storage_scope, [temp_output])
-    filt_packed2_cached = s.cache_read(filt_packed2, storage_scope, [packed_output])
-    packed_output_cached = s.cache_write(packed_output, storage_scope)
-
-    # conv2d #1 schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[temp_output].op.axis
-    _, _, reduce_channel = s[temp_output].op.reduce_axis
-    rco, rci = s[temp_output].split(reduce_channel, factor=block_c)
-    koo, koi = s[temp_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[temp_output].split(h_outer, factor=h_split_factor)
-    s[temp_output].reorder(n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci)
-    s[packed_input_cached].compute_at(s[temp_output], hoo)
-    s[filt_packed1_cached].compute_at(s[temp_output], hoo)
-
-    # cache write schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis
-    koo, koi = s[packed_output].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor)
-    s[packed_output].reorder(n, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner)
-    s[packed_output_cached].compute_at(s[packed_output], hoo)
-
-    # conv2d #2 schedule
-    n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output_cached].op.axis
-    _, _, reduce_channel = s[packed_output_cached].op.reduce_axis
-    rco, rci = s[packed_output_cached].split(reduce_channel, factor=block_c)
-    koo, koi = s[packed_output_cached].split(k_outer, factor=k_split_factor)
-    hoo, hoi = s[packed_output_cached].split(h_outer, factor=h_split_factor)
-    s[packed_output_cached].reorder(
-        n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci
-    )
-    s[temp_output].compute_at(s[packed_output_cached], hoo)
-    s[filt_packed2_cached].compute_at(s[packed_output_cached], hoo)
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            input_buffer = tvm.tir.decl_buffer(
-                packed_shape, name="Xb", dtype=dtype, scope=storage_scope
-            )
-            output_buffer = tvm.tir.decl_buffer(
-                output_shape2, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {logical_input: input_buffer, packed_output: output_buffer}
-
-    return (s, [logical_input, filt_packed1, filt_packed2, packed_output], binds)
-
-
-class BaseConv2dConv2d:
-    """Base class for conv2d-conv2d tests"""
-
-    # input
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(64)
-    in_channel = tvm.testing.parameter(128)
-    # conv2d #1
-    pad1 = tvm.testing.parameter(0)
-    stride1 = tvm.testing.parameter(1)
-    kernel_size1 = tvm.testing.parameter(1, 3)
-    out_channel1 = tvm.testing.parameter(128)
-    # conv2d #2
-    stride2 = tvm.testing.parameter(1)
-    kernel_size2 = tvm.testing.parameter(1, 3)
-    out_channel2 = tvm.testing.parameter(128)
-    # schedule params
-    k_split_factor = tvm.testing.parameter(1, 2)
-    h_split_factor = tvm.testing.parameter(1, 2)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestConv2dConv2dPackedFilter(BaseConv2dConv2d):
-    """Conv2d-Conv2d packed filter test class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines")
-    def test_conv2d(
-        self,
-        batch,
-        in_size,
-        in_channel,
-        pad1,
-        stride1,
-        kernel_size1,
-        out_channel1,
-        stride2,
-        kernel_size2,
-        out_channel2,
-        k_split_factor,
-        h_split_factor,
-        dtype,
-        target,
-    ):
-        """conv2d-conv2d test"""
-        # TODO: no support for padding in conv2d #2
-        pad2 = 0
-
-        # TODO: no support for dilation
-        dilation1 = 1
-        dilation2 = 1
-
-        shape_input = [batch, in_size, in_size, in_channel]
-        shape_filter1_oihw = [out_channel1, in_channel, kernel_size1, kernel_size1]
-        shape_filter1_oihw8i32o4i = get_packed_filter_shape(shape_filter1_oihw)
-
-        shape_filter2_oihw = [out_channel2, out_channel1, kernel_size2, kernel_size2]
-        shape_filter2_oihw8i32o4i = get_packed_filter_shape(shape_filter2_oihw)
-
-        inputs = [
-            np.random.uniform(0, 255, size=shape_input).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter1_oihw8i32o4i).astype(dtype),
-            np.random.uniform(0, 255, size=shape_filter2_oihw8i32o4i).astype(dtype),
-        ]
-        np_filter1 = (
-            inputs[1]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter1_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        np_filter2 = (
-            inputs[2]
-            .transpose(0, 5, 1, 4, 6, 2, 3)
-            .reshape(shape_filter2_oihw)
-            .transpose(2, 3, 1, 0)
-        )
-        temp_output = testing.conv2d_nhwc_python(inputs[0], np_filter1, stride1, pad1)
-        ref_output = testing.conv2d_nhwc_python(temp_output, np_filter2, stride2, pad2)
-        output = build_and_run(
-            inputs,
-            conv2dconv2d_nhwc8h8w32c,
-            target,
-            target,
-            shape_input=shape_input,
-            pad1=(pad1, pad1, pad1, pad1),
-            stride1=(stride1, stride1),
-            dilation1=(dilation1, dilation1),
-            shape_filter1=shape_filter1_oihw8i32o4i,
-            pad2=(pad2, pad2, pad1, pad1),
-            stride2=(stride2, stride2),
-            dilation2=(dilation2, dilation2),
-            shape_filter2=shape_filter2_oihw8i32o4i,
-            k_split_factor=k_split_factor,
-            h_split_factor=h_split_factor,
-            dtype=dtype,
-        )
-
-        conv2d_verify(output, ref_output, dtype)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
index d22b2db9c399..99fc6ac074c2 100644
--- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
+++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py
@@ -200,12 +200,7 @@ def schedule_args(
         working_scope,
     ):
         """Create and return the schedule and input args after applying layout transform"""
-        if schedule_type == "TE":
-
-            return self._te_schedule_args(
-                input_shape, dtype, input_layout, output_layout, working_layout, working_scope
-            )
-        elif schedule_type == "TIR":
+        if schedule_type == "TIR":
             return self._tir_schedule_args(
                 input_shape, dtype, input_layout, output_layout, working_layout, working_scope
             )
@@ -222,40 +217,6 @@ def _te_tensors(self, input_shape, dtype):
         )
         return input_tensor, output_tensor
 
-    def _te_schedule_args(
-        self,
-        input_shape,
-        dtype,
-        input_layout,
-        output_layout,
-        working_layout,
-        working_scope,
-    ):
-        input_tensor, output_tensor = self._te_tensors(input_shape, dtype)
-
-        schedule = te.create_schedule(output_tensor.op)
-
-        write_cache = schedule.cache_write(output_tensor, working_scope)
-        read_cache = schedule.cache_read(input_tensor, working_scope, [write_cache])
-
-        def apply_transform(tensor, layout):
-            if layout == "nhwc":
-                return None
-            if layout == "nchw-8h8w32c-1d":
-                return schedule[tensor].transform_layout(layout_transform_1d)
-            if layout == "nchw-8h8w32c-2d":
-                return schedule[tensor].transform_layout(layout_transform_2d)
-            raise RuntimeError(f"Unexpected layout '{layout}'")
-
-        apply_transform(input_tensor, input_layout)
-        compute_loopnest = apply_transform(output_tensor, output_layout) or output_tensor.op.axis
-        schedule[write_cache].compute_at(schedule[output_tensor], compute_loopnest[0])
-
-        apply_transform(read_cache, working_layout)
-        apply_transform(write_cache, working_layout)
-
-        return [schedule, [input_tensor, output_tensor]]
-
     def _tir_schedule_args(
         self, input_shape, dtype, input_layout, output_layout, working_layout, working_scope
     ):
diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py
index 95c6c1e19805..c84e7a9d4a4c 100644
--- a/tests/python/contrib/test_hexagon/test_launcher.py
+++ b/tests/python/contrib/test_hexagon/test_launcher.py
@@ -39,11 +39,9 @@ def test_add(hexagon_session: Session):
     compute_c = tvm.te.compute(
         placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C"
     )
-    sched = tvm.te.create_schedule(compute_c.op)
 
     func = tvm.build(
-        sched,
-        [placeholder_a, placeholder_b, compute_c],
+        te.create_prim_func([placeholder_a, placeholder_b, compute_c]),
         get_hexagon_target("v68"),
         name="add",
     )
@@ -69,11 +67,9 @@ def test_add_vtcm(hexagon_session: Session):
     compute_c = tvm.te.compute(
         placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C"
     )
-    sched = tvm.te.create_schedule(compute_c.op)
 
     func = tvm.build(
-        sched,
-        [placeholder_a, placeholder_b, compute_c],
+        te.create_prim_func([placeholder_a, placeholder_b, compute_c]),
         get_hexagon_target("v68"),
         name="add",
     )
@@ -117,11 +113,9 @@ def test_matmul(self, hexagon_session, size_m, size_n, size_k):
                 placeholder_x[i, reduce_k1] * placeholder_y[reduce_k1, j], axis=[reduce_k1]
             ),
         )
-        schedule = te.create_schedule(compute_z.op)
 
         func = tvm.build(
-            schedule,
-            [placeholder_x, placeholder_y, compute_z],
+            te.create_prim_func([placeholder_x, placeholder_y, compute_z]),
             get_hexagon_target("v68"),
         )
 
diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
deleted file mode 100644
index 0cc6dbd8163f..000000000000
--- a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Contrib tests for blocked conv2d and maxpool2d"""
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te, topi
-from tvm.topi import testing
-
-from .infrastructure import build_and_run, get_block_shape, get_packed_shape
-
-
-# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c]
-def maxpool2d_logical(
-    shape_nhwc,
-    window_shape,
-    stride,
-    padding,
-    dtype,
-    storage_scope="global",
-):
-    """
-    Maxpool2d TE wherein the input activation is defined by its
-    logical NHWC shape. The packed physical layout for the
-    activation is nhwc8h8w32c.
-    """
-
-    block_h, block_w, block_c = get_block_shape()
-    shape = get_packed_shape(shape_nhwc)
-    logical_output_shape = (
-        shape_nhwc[0],
-        (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1,
-        (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1,
-        shape_nhwc[3],
-    )
-    output_shape = get_packed_shape(logical_output_shape)
-
-    _, height, width, _ = shape_nhwc
-    placeholder_x = te.placeholder(shape_nhwc, dtype=dtype)
-
-    # Combination of padding required by maxpool operator and padding to evenly divisible
-    # number of blocks. Note that this padding should be inlined in the schedule so
-    # as to avoid input copying.
-    pad_h = (block_h - ((height + padding[1]) % block_h)) % block_h
-    pad_w = (block_w - ((width + padding[3]) % block_w)) % block_w
-    x_pad = topi.nn.pad(
-        placeholder_x, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0
-    )
-
-    # Calculate packed layout
-    x_packed = te.compute(
-        shape,
-        lambda n, ho, wo, co, hi, wi, ci: x_pad[
-            n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci
-        ],
-    )
-
-    reduce_h = te.reduce_axis((0, window_shape[0]), name="rh")
-    reduce_w = te.reduce_axis((0, window_shape[1]), name="rw")
-
-    def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner):
-        # Construct blockized strided maxpool height indices
-        h = h_outer * block_h + h_inner
-        h_contig = h * stride[0] + reduce_h
-        h_block_id = h_contig // block_h
-        h_block_offset = h_contig % block_h
-
-        # Construct blockized strided maxpool width indices
-        w_idx = w_outer * block_w + w_inner
-        w_contig = w_idx * stride[1] + reduce_w
-        w_block_id = w_contig // block_w
-        w_block_offset = w_contig % block_w
-
-        return te.max(
-            x_packed[
-                batch, h_block_id, w_block_id, c_outer, h_block_offset, w_block_offset, c_inner
-            ],
-            axis=[reduce_h, reduce_w],
-        )
-
-    compute_y = te.compute(output_shape, compute)
-    schedule = te.create_schedule(compute_y.op)
-
-    # Ensure the padding and array packing is performed inline
-    schedule[x_pad].compute_inline()
-    schedule[x_packed].compute_inline()
-
-    binds = {}
-    if storage_scope and storage_scope != "global":
-        with tvm.transform.PassContext():
-            x_buffer = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope)
-            y_buffer = tvm.tir.decl_buffer(
-                output_shape, name="Yb", dtype=dtype, scope=storage_scope
-            )
-            binds = {placeholder_x: x_buffer, compute_y: y_buffer}
-
-    return (schedule, [placeholder_x, compute_y], binds)
-
-
-class BaseMaxPooling:
-    batch = tvm.testing.parameter(1)
-    in_size = tvm.testing.parameter(8, 112)
-    in_channel = tvm.testing.parameter(64)
-    window_size = tvm.testing.parameter(3)
-    stride = tvm.testing.parameter(2)
-    pad = tvm.testing.parameter(1)
-    dtype = tvm.testing.parameter("float32")
-
-
-class TestMaxPooling(BaseMaxPooling):
-    """Test MaxPool class"""
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target):
-        """Test blocked maxpool"""
-        inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)]
-        ref_output = testing.poolnd_python(
-            inputs[0],
-            (window_size, window_size),
-            strides=(stride, stride),
-            dilation=(1, 1),
-            padding_before=(pad, pad),
-            padding_after=(pad, pad),
-            pool_type="max",
-        )
-        output = build_and_run(
-            inputs,
-            maxpool2d_logical,
-            target,
-            target,
-            shape_nhwc,
-            window_shape=(window_size, window_size),
-            stride=(stride, stride),
-            padding=(pad, pad, pad, pad),
-            dtype=dtype,
-        )
-        assert all([output is not None, ref_output is not None])
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/contrib/test_hipblas.py b/tests/python/contrib/test_hipblas.py
index 63a7553704bf..e5df51e62942 100644
--- a/tests/python/contrib/test_hipblas.py
+++ b/tests/python/contrib/test_hipblas.py
@@ -29,14 +29,13 @@ def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
     A = te.placeholder((n, l), name="A", dtype=in_dtype)
     B = te.placeholder((l, m), name="B", dtype=in_dtype)
     C = hipblas.matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
 
     def verify(target="rocm"):
         if not tvm.get_global_func("tvm.contrib.hipblas.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
@@ -56,10 +55,9 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
     A = te.placeholder(Ashape, name="A", dtype=in_dtype)
     B = te.placeholder(Bshape, name="B", dtype=in_dtype)
     C = hipblas.batch_matmul(A, B, dtype=out_dtype)
-    s = te.create_schedule(C.op)
 
     dev = tvm.rocm(0)
-    f = tvm.build(s, [A, B, C], "rocm")
+    f = tvm.build(te.create_prim_func([A, B, C]), target="rocm")
 
     if "int" in in_dtype:
         a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py
deleted file mode 100644
index 81115b6c0238..000000000000
--- a/tests/python/contrib/test_miopen.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.contrib import miopen
-import numpy as np
-import pytest
-
-
-requires_miopen = pytest.mark.skipif(
-    tvm.get_global_func("tvm.contrib.miopen.conv2d.setup", True) is None,
-    reason="MIOpen is not enabled",
-)
-
-
-@tvm.testing.requires_rocm
-@requires_miopen
-def test_conv2d():
-    in_channel = 3
-    out_channel = 64
-    filter_h = 3
-    filter_w = 3
-    pad_h = 1
-    pad_w = 1
-    stride_h = 1
-    stride_w = 1
-    dilation_h = 1
-    dilation_w = 1
-
-    xshape = [1, in_channel, 128, 128]
-    wshape = (out_channel, in_channel, filter_h, filter_w)
-
-    X = te.placeholder(xshape, name="X")
-    W = te.placeholder(wshape, name="W")
-    Y = miopen.conv2d_forward(
-        X, W, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, conv_mode=0, data_type=1
-    )
-
-    yshape = [x.value for x in Y.shape]
-    from tvm import topi
-
-    s = te.create_schedule(Y.op)
-
-    def verify():
-        dev = tvm.rocm(0)
-        f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d")
-        x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev)
-        w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev)
-        y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
-        f(x, w, y)
-
-        Y_ref = topi.nn.conv2d_nchw(
-            X, W, (stride_h, stride_w), (pad_h, pad_w), (dilation_h, dilation_w)
-        )
-        s_ref = te.create_schedule(Y_ref.op)
-        f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm")
-        y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
-        f_ref(x, w, y_ref)
-        print("Max abs diff:", np.max(np.abs(y.numpy() - y_ref.numpy())))
-        tvm.testing.assert_allclose(y.numpy(), y_ref.numpy(), atol=1e-3)
-
-    verify()
-
-
-def verify_softmax(shape, axis, dtype="float32", log_softmax=False):
-    miopen_op = miopen.log_softmax if log_softmax else miopen.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = miopen_op(A, axis)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.rocm(0)
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-def verify_softmax_4d(shape, dtype="float32", log_softmax=False):
-    miopen_op = miopen.log_softmax if log_softmax else miopen.softmax
-    testing_op = (
-        tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python
-    )
-
-    A = te.placeholder(shape, dtype=dtype, name="A")
-    B = miopen_op(A, axis=1)
-    s = te.create_schedule([B.op])
-
-    dev = tvm.rocm(0)
-    n, c, h, w = shape
-    a_np = np.random.uniform(size=shape).astype(dtype)
-    b_np = testing_op(a_np.transpose(0, 2, 3, 1).reshape(h * w, c))
-    b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax")
-    f(a, b)
-    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)
-
-
-@tvm.testing.requires_rocm
-@requires_miopen
-def test_softmax():
-    verify_softmax((32, 10), -1)
-    verify_softmax((3, 4), -1)
-    verify_softmax_4d((1, 16, 256, 256))
-    verify_softmax_4d((1, 16, 256, 256))
-
-    verify_softmax((32, 10), -1, log_softmax=True)
-    verify_softmax((3, 4), -1, log_softmax=True)
-    verify_softmax_4d((1, 16, 256, 256), log_softmax=True)
-
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py
index 92462e4c4f9e..e876672feaed 100644
--- a/tests/python/contrib/test_mps.py
+++ b/tests/python/contrib/test_mps.py
@@ -29,33 +29,20 @@ def test_matmul():
     A = te.placeholder((n, l), name="A")
     B = te.placeholder((l, m), name="B")
     C = mps.matmul(A, B)
-    D = te.compute(C.shape, lambda *i: C(*i) + 1.0)
-    s = te.create_schedule(D.op)
-    yo, xo = D.op.axis
-    block_y = te.thread_axis("blockIdx.y")
-    block_x = te.thread_axis("blockIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_x = te.thread_axis("threadIdx.x")
-    by, ty = s[D].split(yo, factor=16)
-    bx, tx = s[D].split(xo, factor=16)
-    s[D].bind(by, block_y)
-    s[D].bind(bx, block_x)
-    s[D].bind(ty, thread_y)
-    s[D].bind(tx, thread_x)
 
-    def verify(A, B, D, s, target="metal"):
+    def verify(A, B, C):
         if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.metal(0)
-        f = tvm.build(s, [A, B, D], "metal")
+        f = tvm.build(te.create_prim_func([A, B, C]), target="metal")
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
         f(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()) + 1, rtol=1e-5)
+        tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5)
 
-    verify(A, B, D, s)
+    verify(A, B, C)
 
 
 @tvm.testing.requires_metal
@@ -71,20 +58,17 @@ def test_conv2d():
     A = te.placeholder((n, h, w, ci), name="x")
     B = te.placeholder((co, kh, kw, ci), name="w")
     C = mps.conv2d(A, B, "SAME", 2)
-    s1 = te.create_schedule(C.op)
 
     def verify(A, B, C, target="llvm"):
         if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
             print("skip because extern function is not available")
             return
         dev = tvm.metal(0)
-        f = tvm.build(s1, [A, B, C], "metal")
+        f = tvm.build(te.create_prim_func([A, B, C]), target="metal")
         a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev)
         f(a, b, c)
-        # print(c.numpy())
-        # print(c.shape)
 
     verify(A, B, C, s1)
 
diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py
index 6ffd417a0a48..be9fed2c6ee8 100644
--- a/tests/python/contrib/test_random.py
+++ b/tests/python/contrib/test_random.py
@@ -30,7 +30,6 @@ def test_randint():
     m = 10240
     n = 10240
     A = random.randint(-127, 128, size=(m, n), dtype="int32")
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -40,7 +39,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
@@ -56,7 +55,6 @@ def test_uniform():
     m = 10240
     n = 10240
     A = random.uniform(0, 1, size=(m, n))
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -66,7 +64,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
@@ -82,7 +80,6 @@ def test_normal():
     m = 10240
     n = 10240
     A = random.normal(3, 4, size=(m, n))
-    s = te.create_schedule(A.op)
 
     def verify(target="llvm"):
         if not tvm.testing.device_enabled(target):
@@ -92,7 +89,7 @@ def verify(target="llvm"):
             print("skip because extern function is not available")
             return
         dev = tvm.cpu(0)
-        f = tvm.build(s, [A], target)
+        f = tvm.build(te.create_prim_func([A]), target=target)
         a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev)
         f(a)
         na = a.numpy()
diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py
index c5321cd4eaaf..2c1889a0c43b 100644
--- a/tests/python/contrib/test_rocblas.py
+++ b/tests/python/contrib/test_rocblas.py
@@ -33,14 +33,13 @@ def test_matmul():
     A = te.placeholder((n, l), name="A")
     B = te.placeholder((l, m), name="B")
     C = rocblas.matmul(A, B)
-    s = te.create_schedule(C.op)
 
     def verify(target="rocm"):
         if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
@@ -57,7 +56,6 @@ def verify_batch_matmul(batch, m, k, n, lib, transa=False, transb=False, dtype="
     A = te.placeholder(ashape, name="A", dtype=dtype)
     B = te.placeholder(bshape, name="B", dtype=dtype)
     C = lib.batch_matmul(A, B, transa, transb)
-    s = te.create_schedule(C.op)
 
     def get_numpy(a, b, transa, transb):
         if transa:
@@ -74,7 +72,7 @@ def verify(target="rocm"):
             print("skip because extern function is not available")
             return
         dev = tvm.rocm(0)
-        f = tvm.build(s, [A, B, C], target)
+        f = tvm.build(te.create_prim_func([A, B, C]), target=target)
         a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev)
         c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev)
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index c135450c09e1..0e0aa71caf10 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -20,7 +20,6 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.topi.cuda import sort_by_key
 
 
 def test_sort():
@@ -53,8 +52,7 @@ def test_sort():
 
     dev = tvm.cpu(0)
     target = "llvm"
-    s = te.create_schedule(out.op)
-    f = tvm.build(s, [data, sort_num, out], target)
+    f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target)
     a = tvm.nd.array(np.array(input_data).astype(data.dtype), dev)
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev)
@@ -82,8 +80,7 @@ def test_sort_np():
 
     dev = tvm.cpu(0)
     target = "llvm"
-    s = te.create_schedule(out.op)
-    f = tvm.build(s, [data, sort_num, out], target)
+    f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target)
 
     np_data = np.random.uniform(size=dshape)
     np_out = np.argsort(np_data, axis=axis)
@@ -95,40 +92,6 @@ def test_sort_np():
     tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5)
 
 
-def test_sort_by_key_gpu():
-    """Tests sort function using gpu"""
-    size = 6
-    keys = te.placeholder((size,), name="keys", dtype="int32")
-    values = te.placeholder((size,), name="values", dtype="int32")
-
-    for target in ["cuda", "nvptx", "opencl", "rocm"]:
-        if not tvm.testing.device_enabled(target):
-            print("Skip because %s is not enabled" % target)
-            continue
-
-        with tvm.target.Target(target):
-            keys_out, values_out = sort_by_key(keys, values)
-            dev = tvm.device(target)
-            s = te.create_schedule([keys_out.op, values_out.op])
-            f = tvm.build(s, [keys, values, keys_out, values_out], target)
-
-            keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32)
-            values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32)
-            keys_np_out = np.zeros(keys_np.shape, np.int32)
-            values_np_out = np.zeros(values_np.shape, np.int32)
-            keys_in = tvm.nd.array(keys_np, dev)
-            values_in = tvm.nd.array(values_np, dev)
-            keys_out = tvm.nd.array(keys_np_out, dev)
-            values_out = tvm.nd.array(values_np_out, dev)
-            f(keys_in, values_in, keys_out, values_out)
-
-            ref_keys_out = np.sort(keys_np)
-            ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)])
-            tvm.testing.assert_allclose(keys_out.numpy(), ref_keys_out, rtol=1e-5)
-            tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5)
-
-
 if __name__ == "__main__":
     test_sort()
     test_sort_np()
-    test_sort_by_key_gpu()
diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py
deleted file mode 100644
index 8ebd02cc170c..000000000000
--- a/tests/python/contrib/test_sparse.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Configure pytest"""
-# pylint: disable=invalid-name
-from collections import namedtuple
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import te
-import tvm.contrib.sparse as tvmsp
-import tvm.runtime.ndarray as _nd
-
-
-def test_static_tensor():
-    """Tests static tensor"""
-    dtype = "float32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype)
-    assert A.stype == "csr"
-    n = 3
-    a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0)
-    a = tvmsp.array(a, dev)
-    A.data = te.placeholder(a.data.shape, dtype, name="A_data")
-    Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data")
-    binds = {A.data: Ab}
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((n, n), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-def test_dynamic_tensor():
-    """Tests dynamic tensor"""
-    dtype = "float32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype)
-    assert A.stype == "csr"
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    _nr, _nc = 3, 5
-    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0)
-    a = tvmsp.array(a, dev)
-    assert a.data.dtype == a.dtype
-    Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"])
-    Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data")
-    Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices")
-    binds = {A.data: Ab.data, A.indices: Ab.indices}
-    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data.shape[0], a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-def test_sparse_array_tuple():
-    """Tests array when it is sparse"""
-    dtype, itype = "float32", "int32"
-    target = "llvm"
-    dev = tvm.device(target, 0)
-    nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n")
-    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype)
-    assert A.stype == "csr"
-    C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter")
-    s = te.create_schedule(C.op)
-    _nr, _nc = 3, 5
-    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0)
-    # convert to sparse array tuple
-    source_array = a
-    ridx, cidx = np.nonzero(source_array)
-    data = source_array[ridx, cidx]
-    a_data = _nd.array(data, dev)
-    indices = np.nonzero(source_array)[1].astype(itype)
-    a_indices = _nd.array(indices, dev)
-    indptr = [0] + np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist()
-    indptr = np.cumsum(np.array(indptr, itype)).astype(itype)
-    a_indptr = _nd.array(indptr, dev)
-    a_init = (a_data, a_indices, a_indptr)
-    # construct tvm sparse array with tuple
-    a = tvmsp.array(a_init, shape=source_array.shape, device=dev)
-    assert a.data.dtype == a.dtype
-    Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"])
-    Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data")
-    Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices")
-    binds = {A.data: Ab.data, A.indices: Ab.indices}
-    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
-    c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev)
-    c.data = tvm.nd.empty(a.data.shape, dtype)
-    c.indices = a.indices
-    c.indptr = a.indptr
-    f(a.data.shape[0], a.data, c.data)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5)
-
-
-if __name__ == "__main__":
-    test_static_tensor()
-    test_dynamic_tensor()
-    test_sparse_array_tuple()
diff --git a/tests/python/relax/test_frontend_from_fx.py b/tests/python/relax/test_frontend_from_fx.py
index 3c932f86c582..446c4149fdde 100644
--- a/tests/python/relax/test_frontend_from_fx.py
+++ b/tests/python/relax/test_frontend_from_fx.py
@@ -3637,7 +3637,6 @@ def main(
 
 
 def test_stack():
-
     input_info = [
         ([1, 3, 10, 10], "float32"),
         ([1, 3, 10, 10], "float32"),
diff --git a/tests/python/runtime/test_runtime_dlpack.py b/tests/python/runtime/test_runtime_dlpack.py
index cf12c89cdd51..60a86f662c6c 100644
--- a/tests/python/runtime/test_runtime_dlpack.py
+++ b/tests/python/runtime/test_runtime_dlpack.py
@@ -35,9 +35,7 @@ def test_from_dlpack_shape_one():
     B = te.placeholder((rows, 16), name="B")
     C = te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C")
 
-    s = te.create_schedule(C.op)
-
-    fadd = tvm.build(s, [A, B, C], tgt)
+    fadd = tvm.build(te.create_prim_func([A, B, C]), target=tgt)
 
     dev = tvm.device(tgt.kind.name, 0)
 
diff --git a/tests/python/runtime/test_runtime_measure.py b/tests/python/runtime/test_runtime_measure.py
index 8955b03241a2..4b39cef18bc5 100644
--- a/tests/python/runtime/test_runtime_measure.py
+++ b/tests/python/runtime/test_runtime_measure.py
@@ -35,8 +35,7 @@ def my_debug(filename):
             fout.write("c")
 
     X = te.compute((), lambda: tvm.tir.call_packed("my_debug", filename))
-    s = te.create_schedule(X.op)
-    func = tvm.build(s, [X])
+    func = tvm.build(te.create_prim_func([X]))
 
     x = tvm.nd.empty((), dtype="int32")
     ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1)
diff --git a/tests/python/runtime/test_runtime_module_export.py b/tests/python/runtime/test_runtime_module_export.py
index a6554f3a4f75..1dff6c42502e 100644
--- a/tests/python/runtime/test_runtime_module_export.py
+++ b/tests/python/runtime/test_runtime_module_export.py
@@ -17,211 +17,10 @@
 
 import tvm
 import tvm.testing
-import pytest
 
 from tvm.contrib import utils
-import os
 
-header_file_dir_path = utils.tempdir()
 
-
-def gen_engine_header():
-    code = r"""
-        #ifndef _ENGINE_H_
-        #define _ENGINE_H_
-        #include <cstdint>
-        #include <string>
-        #include <sstream>
-        #include <vector>
-        class Engine {
-        };
-
-        #endif
-        """
-    header_file = header_file_dir_path.relpath("gcc_engine.h")
-    with open(header_file, "w") as f:
-        f.write(code)
-
-
-def generate_engine_module():
-    code = r"""
-        #include <tvm/runtime/c_runtime_api.h>
-        #include <dlpack/dlpack.h>
-        #include "gcc_engine.h"
-
-        extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5,
-                float* gcc_input6, float* gcc_input7, float* out) {
-            Engine engine;
-        }
-        """
-    import tvm.runtime._ffi_api
-
-    gen_engine_header()
-    csource_module = tvm.runtime._ffi_api.CSourceModuleCreate(code, "cc", [], None)
-    return csource_module
-
-
-@pytest.mark.skip("LEGACY-TEST: test to be replaced by relax")
-@tvm.testing.uses_gpu
-def test_mod_export():
-    def verify_gpu_mod_export(obj_format):
-        for device in ["llvm", "cuda"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload()
-        synthetic_llvm_mod, synthetic_llvm_params = relay.testing.synthetic.get_workload()
-        with tvm.transform.PassContext(opt_level=3):
-            _, synthetic_gpu_lib, _ = relay.build_module.build(
-                synthetic_mod, "cuda", params=synthetic_params, mod_name="cudalib"
-            )
-            _, synthetic_llvm_cpu_lib, _ = relay.build_module.build(
-                synthetic_llvm_mod, "llvm", params=synthetic_llvm_params, mod_name="llvmlib"
-            )
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        synthetic_gpu_lib.import_module(synthetic_llvm_cpu_lib)
-        synthetic_gpu_lib.export_library(path_lib)
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        assert loaded_lib.imported_modules[0].type_key == "cuda"
-        #  dso modules are merged together
-        assert len(loaded_lib.imported_modules) == 1
-
-    def verify_multi_dso_mod_export(obj_format):
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        mod0 = tvm.build(s, [A, B], "llvm", name="myadd0")
-        mod1 = tvm.build(s, [A, B], "llvm", name="myadd1")
-
-        temp = utils.tempdir()
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-
-        mod0.import_module(mod1)
-        mod0.export_library(path_lib)
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        # dso modules are merged
-        assert len(loaded_lib.imported_modules) == 0
-
-    def verify_json_import_dso(obj_format):
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        # Get subgraph Json.
-        subgraph_json = (
-            "json_rt_0\n"
-            + "input 0 10 10\n"
-            + "input 1 10 10\n"
-            + "input 2 10 10\n"
-            + "input 3 10 10\n"
-            + "add 4 inputs: 0 1 shape: 10 10\n"
-            + "sub 5 inputs: 4 2 shape: 10 10\n"
-            + "mul 6 inputs: 5 3 shape: 10 10\n"
-            + "json_rt_1\n"
-            + "input 0 10 10\n"
-            + "input 1 10 10\n"
-            + "input 2 10 10\n"
-            + "input 3 10 10\n"
-            + "add 4 inputs: 0 1 shape: 10 10\n"
-            + "sub 5 inputs: 4 2 shape: 10 10\n"
-            + "mul 6 inputs: 5 3 shape: 10 10"
-        )
-
-        temp = utils.tempdir()
-        subgraph_path = temp.relpath("subgraph.examplejson")
-        with open(subgraph_path, "w") as f:
-            f.write(subgraph_json)
-
-        # Get Json and module.
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm", name="myadd")
-        try:
-            ext_lib = tvm.runtime.load_module(subgraph_path, "examplejson")
-        except:
-            print("skip because Loader of examplejson is not presented")
-            return
-        ext_lib.import_module(f)
-        if obj_format == ".so":
-            file_name = "deploy_lib.so"
-        else:
-            assert obj_format == ".tar"
-            file_name = "deploy_lib.tar"
-        path_lib = temp.relpath(file_name)
-        ext_lib.export_library(path_lib)
-        lib = tvm.runtime.load_module(path_lib)
-        assert lib.type_key == "examplejson"
-        assert lib.imported_modules[0].type_key == "library"
-
-    def verify_multi_c_mod_export():
-        from shutil import which
-
-        if which("gcc") is None:
-            print("Skip test because gcc is not available.")
-
-        for device in ["llvm"]:
-            if not tvm.testing.device_enabled(device):
-                print("skip because %s is not enabled..." % device)
-                return
-
-        synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload()
-        with tvm.transform.PassContext(opt_level=3):
-            _, synthetic_cpu_lib, _ = relay.build_module.build(
-                synthetic_mod, "llvm", params=synthetic_params
-            )
-
-        A = te.placeholder((1024,), name="A")
-        B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "c", name="myadd")
-        engine_module = generate_engine_module()
-
-        temp = utils.tempdir()
-        file_name = "deploy_lib.so"
-        path_lib = temp.relpath(file_name)
-        synthetic_cpu_lib.import_module(f)
-        synthetic_cpu_lib.import_module(engine_module)
-        kwargs = {"options": ["-O2", "-std=c++17", "-I" + header_file_dir_path.relpath("")]}
-        work_dir = temp.relpath("work_dir")
-        os.mkdir(work_dir)
-        synthetic_cpu_lib.export_library(path_lib, fcompile=False, workspace_dir=work_dir, **kwargs)
-        assert os.path.exists(os.path.join(work_dir, "devc.o"))
-        loaded_lib = tvm.runtime.load_module(path_lib)
-        assert loaded_lib.type_key == "library"
-        # dso modules are merged
-        assert len(loaded_lib.imported_modules) == 0
-
-    for obj_format in [".so", ".tar"]:
-        verify_gpu_mod_export(obj_format)
-        verify_multi_dso_mod_export(obj_format)
-        verify_json_import_dso(obj_format)
-
-    verify_multi_c_mod_export()
-
-
-@pytest.mark.skip("LEGACY-TEST: test to be replaced by TensorIR")
 @tvm.testing.requires_llvm
 def test_import_static_library():
     from tvm import te
@@ -229,9 +28,15 @@ def test_import_static_library():
     # Generate two LLVM modules.
     A = te.placeholder((1024,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    mod0 = tvm.build(s, [A, B], "llvm", name="myadd0")
-    mod1 = tvm.build(s, [A, B], "llvm", name="myadd1")
+    irmod0 = tvm.IRModule.from_expr(
+        te.create_prim_func([A, B]).with_attr("global_symbol", "myadd0")
+    )
+    irmod1 = tvm.IRModule.from_expr(
+        te.create_prim_func([A, B]).with_attr("global_symbol", "myadd1")
+    )
+
+    mod0 = tvm.build(irmod0, target="llvm")
+    mod1 = tvm.build(irmod1, target="llvm")
 
     assert mod0.implements_function("myadd0")
     assert mod1.implements_function("myadd1")
diff --git a/tests/python/runtime/test_runtime_module_load.py b/tests/python/runtime/test_runtime_module_load.py
index 33bd281b045f..130a274c354b 100644
--- a/tests/python/runtime/test_runtime_module_load.py
+++ b/tests/python/runtime/test_runtime_module_load.py
@@ -101,12 +101,13 @@ def test_device_module_dump():
     n = tvm.runtime.convert(1024)
     A = te.placeholder((n,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
+
+    sch = tvm.tir.Schedule(te.create_prim_func([A, B]))
     # create iter var and assign them tags.
     num_thread = 8
-    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
+    bx, tx = sch.split(sch.get_loops("B")[0], factors=[None, num_thread])
+    sch.bind(bx, "blockIdx.x")
+    sch.bind(tx, "threadIdx.x")
 
     def check_device(device):
         dev = tvm.device(device, 0)
@@ -114,9 +115,7 @@ def check_device(device):
             print("Skip because %s is not enabled" % device)
             return
         temp = utils.tempdir()
-        name = "myadd_%s" % device
-
-        f = tvm.build(s, [A, B], device, "llvm", name=name)
+        f = tvm.build(sch.mod, target=device)
 
         path_dso = temp.relpath("dev_lib.so")
         # test cross compiler function
@@ -143,8 +142,7 @@ def check_stackvm(device):
             print("Skip because %s is not enabled" % device)
             return
         temp = utils.tempdir()
-        name = "myadd_%s" % device
-        f = tvm.build(s, [A, B], device, "stackvm", name=name)
+        f = tvm.build(sch.mod, target=tvm.target.Target(device, host="stackvm"))
         path_dso = temp.relpath("dev_lib.stackvm")
         f.export_library(path_dso)
         f1 = tvm.runtime.load_module(path_dso)
diff --git a/tests/python/runtime/test_runtime_module_property.py b/tests/python/runtime/test_runtime_module_property.py
index bd71e856d917..97c51ff93996 100644
--- a/tests/python/runtime/test_runtime_module_property.py
+++ b/tests/python/runtime/test_runtime_module_property.py
@@ -33,12 +33,7 @@ def create_csource_module():
 def create_llvm_module():
     A = te.placeholder((1024,), name="A")
     B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-    return tvm.build(s, [A, B], "llvm", name="myadd0")
-
-
-def create_aot_module():
-    return tvm.get_global_func("relay.build_module._AOTExecutorCodegen")()
+    return tvm.build(te.create_prim_func([A, B]), target="llvm")
 
 
 def test_property():
@@ -52,11 +47,6 @@ def test_property():
         expected={"is_binary_serializable": False, "is_runnable": True, "is_dso_exportable": True},
     )
 
-    checker(
-        create_aot_module(),
-        expected={"is_binary_serializable": False, "is_runnable": True, "is_dso_exportable": False},
-    )
-
 
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/runtime/test_runtime_rpc.py b/tests/python/runtime/test_runtime_rpc.py
index 31cab2819df1..717cc8fffa05 100644
--- a/tests/python/runtime/test_runtime_rpc.py
+++ b/tests/python/runtime/test_runtime_rpc.py
@@ -73,8 +73,7 @@ def test_bigendian_rpc():
     def verify_rpc(remote, target, shape, dtype):
         A = te.placeholder(shape, dtype=dtype)
         B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype))
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], target, name="myadd")
+        f = tvm.build(te.create_prim_func([A, B]), target=target)
 
         dev = remote.cpu(0)
         a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev)
diff --git a/tests/python/runtime/test_runtime_trace.py b/tests/python/runtime/test_runtime_trace.py
index 08f56b56c8c7..58d1a079e46b 100644
--- a/tests/python/runtime/test_runtime_trace.py
+++ b/tests/python/runtime/test_runtime_trace.py
@@ -23,8 +23,7 @@ def test_trace_default_action():
     n = 2
     x = te.placeholder((n, n, n), name="X", dtype="float32")
     y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([i, j, k, x[i][j][k]]))
-    s = te.create_schedule(y.op)
-    f = tvm.build(s, [x, y], target="llvm")
+    f = tvm.build(te.create_prim_func([x, y]), target="llvm")
     xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype))
     ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype))
     f(xnd, ynd)
@@ -44,8 +43,7 @@ def check_assign(dtype):
         z = te.compute(
             x.shape, lambda i, j, k: tvm.tir.trace([y[i][j][k]], "tvm.tir.trace_callback2")
         )
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]), "llvm")
 
         xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype))
@@ -74,8 +72,7 @@ def check_expr_sum(dtype):
             lambda i, j, k: tvm.tir.trace([a[i][j][k]], "tvm.tir.trace_callback3")
             + tvm.tir.trace([b[i][j][k]], "tvm.tir.trace_callback3"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, c])
+        f = tvm.build(te.create_prim_func([a, b, c]))
         xnd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype)))
         ynd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype)))
         znd = tvm.nd.array(np.zeros((n, n, n), dtype=c.dtype))
@@ -105,8 +102,7 @@ def check_expr_sum(dtype):
             + tvm.tir.trace([i, j, k, d[i][j][k]], "tvm.tir.trace_silent")
             + tvm.tir.trace([i, j, k, e[i][j][k]], "tvm.tir.trace_silent"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, d, e, c])
+        f = tvm.build(te.create_prim_func([a, b, d, e, c]))
         a_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype)))
         b_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype)))
         d_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=d.dtype)))
@@ -135,8 +131,7 @@ def check_expr_sum_custom(dtype):
             lambda i, j: tvm.tir.trace([a[i][j]], "tvm.tir.trace_callback4")
             + tvm.tir.trace([b[i][j]], "tvm.tir.trace_callback4"),
         )
-        s = te.create_schedule(c.op)
-        f = tvm.build(s, [a, b, c])
+        f = tvm.build(te.create_prim_func([a, b, c]))
         npa = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype)
         npb = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype)
         xnd = tvm.nd.array(npa)
@@ -163,8 +158,7 @@ def check_assign(dtype):
         x = te.placeholder((n,), name="X", dtype=dtype)
         y = te.compute(x.shape, lambda i: tvm.tir.trace([x[i]], "tvm.tir.trace_change_int_first"))
         z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_int_second"))
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]))
 
         xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
@@ -195,8 +189,7 @@ def check_assign(dtype):
         z = te.compute(
             x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_float_second")
         )
-        s = te.create_schedule(z.op)
-        f = tvm.build(s, [x, y, z], "llvm")
+        f = tvm.build(te.create_prim_func([x, y, z]), target="llvm")
 
         xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype))
         ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype))
diff --git a/tests/python/target/test_target_target.py b/tests/python/target/test_target_target.py
index cda228939f31..b99834aef35a 100644
--- a/tests/python/target/test_target_target.py
+++ b/tests/python/target/test_target_target.py
@@ -578,7 +578,7 @@ def func():
     func = func.with_attr("Target", target)
     target2 = tvm.ir.load_json(tvm.ir.save_json(target))
     mod = tvm.IRModule({"main": func})
-    lib = tvm.build({target2: mod}, target_host=target)
+    lib = tvm.build(mod, target=target2)
     lib["func"]()
 
 
diff --git a/tests/python/te/test_te_autodiff.py b/tests/python/te/test_te_autodiff.py
deleted file mode 100644
index a5995ff0337f..000000000000
--- a/tests/python/te/test_te_autodiff.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import pytest
-import tvm
-from tvm import te, topi
-from tvm.testing import assert_allclose
-from tvm.topi.utils import get_const_tuple
-
-
-def check_grad(
-    out, inputs, args=[], data_range=(-10, 10), desired_grads=None, assert_no_jacobian=True
-):
-    inputs = inputs if isinstance(inputs, list) else [inputs]
-
-    def check_device(device, host="llvm"):
-        dev = tvm.device(device, 0)
-        if not tvm.testing.device_enabled(host):
-            return
-
-        sout = te.create_schedule(out.op)
-        mout = tvm.build(sout, [out] + inputs + args)
-        out_shape = get_const_tuple(out.shape)
-
-        l, h = data_range
-        input_data = [
-            tvm.nd.array(
-                np.random.uniform(l, h, size=get_const_tuple(input.shape)).astype(input.dtype)
-            )
-            for input in inputs
-        ]
-        arg_vals = [
-            tvm.nd.array(np.random.uniform(l, h, size=get_const_tuple(arg.shape)).astype(arg.dtype))
-            for arg in args
-        ]
-
-        ones = topi.full_like(out, 1.0)
-        # we provide head to sum and reduce the output dimension,
-        # which equals to grad(out.sum(), inputs)
-        grads = te.gradient(out, inputs, head=ones)
-        grad_sched = te.create_schedule([grad.op for grad in grads])
-        mgrad = tvm.build(grad_sched, list(grads) + inputs + args)
-        if assert_no_jacobian:
-            # TODO(yzhliu): it is better to visit the expression and do assertion
-            lowered_ir = str(tvm.lower(grad_sched, list(grads) + inputs + args, simple_mode=True))
-            assert "jacobian" not in lowered_ir, lowered_ir
-
-        grad_data = [tvm.nd.empty(get_const_tuple(i.shape), g.dtype) for i, g in zip(inputs, grads)]
-
-        mgrad(*grad_data, *input_data, *arg_vals)
-        g_res = [g.numpy() for g in grad_data]
-
-        if desired_grads:
-            assert isinstance(desired_grads, list)
-            for actual, desired in zip(g_res, desired_grads):
-                assert_allclose(actual, desired, rtol=0.1, atol=1e-2)
-        else:
-
-            def forward(*in_data):
-                out_data = tvm.nd.empty(out_shape, out.dtype)
-                mout(out_data, *[tvm.nd.array(d) for d in list(in_data)])
-                return out_data.numpy().sum()
-
-            tvm.testing.check_numerical_grads(
-                forward, [d.numpy() for d in input_data + arg_vals], g_res
-            )
-
-    check_device("cpu")
-
-
-def test_basic_operation():
-    np.random.seed(0)
-    shape = (10, 10)
-    x = te.var("x", dtype="float32")
-    k = te.reduce_axis((0, 10), name="k")
-    l = te.reduce_axis((0, 10), name="l")
-    A0 = te.placeholder(shape, name="A0")
-    A1 = te.placeholder(shape, name="A1")
-    zeros = np.zeros(shape)
-
-    B = te.compute(shape, lambda i, j: A0[i, j], name="B")
-    check_grad(B, [A0])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name="B")
-    check_grad(B, [A0, A1])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name="B")
-    check_grad(B, A0, desired_grads=[zeros])
-
-    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), name="B")
-    check_grad(B, A0, data_range=(0.1, 10))
-
-    B = te.compute(shape, lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), name="B")
-    check_grad(B, A0, data_range=(-4, 4))
-
-    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name="B")
-    check_grad(B, A0)
-
-    B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), name="B")
-    check_grad(B, A0)
-
-    B = te.compute(shape, lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), name="B")
-    check_grad(B, [A0, A1])
-
-    B = te.compute(
-        shape, lambda i, j: te.sum(A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k), name="B"
-    )
-    check_grad(B, A0)
-
-    def fcombine(x, y):
-        return x * y
-
-    def fidentity(t0):
-        return tvm.tir.const(1, t0)
-
-    prod = te.comm_reducer(fcombine, fidentity, name="prod")
-    B = te.compute((10, 10), lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), name="B")
-    check_grad(B, A0)
-
-    X = te.placeholder((10,), name="X")
-    A = te.compute((10,), lambda i: X[i] + X[9 - i])
-    B = te.compute((10,), lambda i: X[i] * X[9 - i])
-    Y = topi.tensordot(A, B, 1)
-    check_grad(Y, X)
-
-    X = te.placeholder((3, 3), name="X")
-    Y = topi.einsum("ii->i", (X))
-    check_grad(Y, X)
-
-
-def test_topi():
-    X = te.placeholder((1, 2, 4, 4), name="X")
-    W = te.placeholder((5, 2, 3, 3), name="W")
-    W1 = te.placeholder((2, 5, 3, 3), name="W1")
-    W2 = te.placeholder((1,), name="W2")
-
-    R = topi.nn.conv2d(X, W, 1, 1, 1)
-    check_grad(R, [X, W])
-
-    R1 = topi.nn.conv2d(topi.nn.relu(R), W1, 1, 0, 1)
-    check_grad(R1, [X, W, W1])
-
-    R = topi.broadcast_to(W2, (5, 2, 3, 3))
-    check_grad(R, [W2])
-
-    R = topi.nn.conv2d(X, topi.broadcast_to(W2, (5, 2, 3, 3)), 1, 1, 1)
-    check_grad(R, [X, W2])
-
-    R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "avg")
-    check_grad(R, X)
-
-    R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(R, X)
-
-    X = te.placeholder((1, 2, 5, 5), name="X")
-    R = topi.reshape(X, (1, 32))
-    check_grad(R, [X])
-
-    X = te.placeholder((1, 2, 5, 5), name="X")
-    W = te.placeholder((2, 2, 3, 3), name="W")
-
-    S = topi.reshape(X, (1, 50))
-    check_grad(S, [X])
-
-    R = X + topi.nn.conv2d(X + topi.nn.conv2d(X, W, 1, 1, 1), W, 1, 1, 1)
-    check_grad(R, [X, W])
-
-    S = topi.nn.softmax(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.sigmoid(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.tanh(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-
-    S = topi.nn.log_softmax(topi.reshape(R, (1, 50)))
-    check_grad(S, [X, W])
-    check_grad(S, [W], [X])
-
-    X = te.placeholder((1, 2, 3, 5), name="X")
-    Y = te.placeholder((1, 2, 7, 5), name="Y")
-    S = topi.concatenate((X, Y), 2)
-    check_grad(S, [X, Y])
-
-    X = te.placeholder((1, 2, 6, 5), name="X")
-    (S, R) = topi.split(X, 2, 2)
-    check_grad(S, [X])
-    check_grad(R, [X])
-    R1 = topi.concatenate((S, R), 2)
-    check_grad(R1, [X])
-    R2 = topi.concatenate((R, S), 2)
-    check_grad(R2, [X])
-
-    X = te.placeholder((4, 5), name="X")
-    I = te.placeholder((100,), name="I", dtype="int32")
-    R = topi.take(X, topi.abs(I))
-    check_grad(R, [X], [I])
-
-    W = te.placeholder((5, 5), name="W")
-    exps = topi.exp(topi.nn.dense(X, W))
-    sumexps = topi.sum(exps, axis=-1, keepdims=True)
-    R = exps / sumexps
-    check_grad(R, [X, W], data_range=(-1, 1))
-
-
-def test_stride_dilation():
-    X = te.placeholder((1, 2, 10, 10), name="X")
-    W = te.placeholder((2, 2, 1, 1), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    W = te.placeholder((2, 2, 2, 2), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    W = te.placeholder((2, 2, 3, 3), name="W")
-
-    Y = topi.nn.conv2d(X, W, 1, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 1)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 2)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 1, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 2, 0, 3)
-    check_grad(Y, [X, W])
-    Y = topi.nn.conv2d(X, W, 3, 0, 3)
-    check_grad(Y, [X, W])
-
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [1, 1], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [2, 2], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [2, 2], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-    Y = topi.nn.pool2d(X, [3, 3], [1, 1], [3, 3], [0, 0, 0, 0], "max")
-    check_grad(Y, [X])
-
-
-@pytest.mark.xfail
-def test_reduction_init():
-    np.random.seed(0)
-    shape = (10, 10)
-    k = te.reduce_axis((0, 10), name="k")
-    A0 = te.placeholder(shape, name="A0")
-
-    B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k, init=0.0), name="B")
-    check_grad(B, A0)
-
-
-if __name__ == "__main__":
-    test_basic_operation()
-    test_topi()
-    test_stride_dilation()
diff --git a/tests/python/te/test_te_build_lower.py b/tests/python/te/test_te_build_lower.py
deleted file mode 100644
index 50d5119b43a0..000000000000
--- a/tests/python/te/test_te_build_lower.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_lower_rfactor():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
-    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    fapi = tvm.lower(s, [A, B])
-
-
-def test_dependent_output_shape():
-    n, m, x = te.size_var("n"), te.size_var("m"), te.size_var("x")
-    A = te.placeholder((n, m))
-    B = te.compute((m, n // x), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    mod = tvm.build(s, [A, B, x])
-
-
-def test_split_uneven_unique_likely():
-    a = te.placeholder(
-        (16, 16),
-    )
-    b = te.placeholder(
-        (16, 16),
-    )
-    c = te.compute((16, 16), lambda x, y: a[x, y] + b[x, y])
-
-    x, y = c.op.axis
-    sch = te.create_schedule(c.op)
-    xo, xi = sch[c].split(x, 5)
-    stmt = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.stmt.IfThenElse)
-
-
-if __name__ == "__main__":
-    test_lower_rfactor()
-    test_dependent_output_shape()
-    test_split_uneven_unique_likely()
diff --git a/tests/python/te/test_te_group.py b/tests/python/te/test_te_group.py
deleted file mode 100644
index e57040abc085..000000000000
--- a/tests/python/te/test_te_group.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test group effect"""
-import tvm
-from tvm import te
-
-
-def test_scan_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i])
-
-    s_update1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i])
-    s_update2 = te.compute((m, n), lambda t, i: s_update1[t, i] + 1)
-    s_update3 = te.compute((m, n), lambda t, i: s_update2[t, i] + 1)
-    res = tvm.te.scan(s_init, s_update3, s_state, inputs=x)
-
-    s = te.create_schedule(res.op)
-    assert s[s_update1].group is not None
-    assert s[s_update2].group == s[s_update1].group
-    # Assign within group, is valid
-    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
-    # create a new group, for [s_update2 and s_update1]
-    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
-    assert g2.group is not None
-    assert g2.group == s[s_update3].group
-    assert s[s_update2].group == g2
-    assert s[s_update1].group == g2
-    g2.compute_at(s[s_update3], s_update3.op.axis[1])
-    assert g2.attach_stage == s[s_update3]
-    try:
-        # compute outside group error.
-        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
-        assert False
-    except tvm.error.TVMError:
-        pass
-
-
-def test_compute_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert s[x1].group == g
-    assert s[x].group == g
-    g.compute_at(s[x2], x2.op.axis[1])
-    assert g.attach_stage == s[x2]
-    assert g.num_child_stages == 2
-
-
-def test_nest_group():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g1 = s.create_group(outputs=x1, inputs=x)
-    g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert set(s.groups) == set([g1, g2])
-    assert s[x].group == g2
-    assert s[x1].group == g1
-    assert g1.group == g2
-    assert g2.num_child_stages == 2
-    assert g1.num_child_stages == 1
-
-
-if __name__ == "__main__":
-    test_nest_group()
-    test_compute_group()
-    test_scan_group()
diff --git a/tests/python/te/test_te_hybrid_script.py b/tests/python/te/test_te_hybrid_script.py
deleted file mode 100644
index 862e80ffb6ce..000000000000
--- a/tests/python/te/test_te_hybrid_script.py
+++ /dev/null
@@ -1,872 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm, inspect, sys, traceback, numpy, pytest, types, os
-
-from tvm import te
-from tvm.contrib import utils
-from tvm.te.hybrid import script
-from tvm.te.hybrid.runtime import HYBRID_GLOBALS
-
-import tvm.testing
-
-
-@pytest.mark.skip
-def run_and_check(func, args, var_dict={}, target="llvm", sch=None, outs=None):
-    def tvm_val_2_py_val(val):
-        val = tvm.tir.stmt_functor.substitute(val, var_dict)
-        val = tvm.arith.Analyzer().simplify(val)
-        assert isinstance(val, (tvm.tir.IntImm,))
-        return val.value
-
-    dev = tvm.device(target, 0)
-    op = None
-
-    if sch is None:
-        outs = func(*tuple(tvm.runtime.convert(i) if isinstance(i, list) else i for i in args))
-        op = outs[0].op if isinstance(outs, list) else outs.op
-        sch = te.create_schedule(op)
-    else:
-        assert outs is not None
-        assert isinstance(outs, list)
-        op = outs[0].op
-
-    emu_args = []
-    nd_args = []
-    for i in args:
-        if isinstance(i, te.tensor.Tensor):
-            shape = [tvm_val_2_py_val(j) for j in i.shape]
-            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
-            nd_args.append(tvm.nd.array(emu_args[-1], dev))
-        elif isinstance(i, tvm.tir.Var):
-            emu_args.append(tvm_val_2_py_val(i))
-            nd_args.append(emu_args[-1])
-        else:
-            assert isinstance(i, list)
-            emu_args.append(numpy.array(i))
-
-    compile_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] + (
-        outs if isinstance(outs, list) else [outs]
-    )
-    module = tvm.build(sch, compile_args, target=target)
-    assert module
-
-    out_tensors = []
-    for i in range(op.num_outputs):
-        output = op.output(i)
-        shape = [tvm_val_2_py_val(j) for j in output.shape]
-        nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), dev))
-        out_tensors.append(nd_args[-1])
-
-    ref_data = func(*emu_args)
-    if isinstance(ref_data, numpy.ndarray):
-        ref_data = [ref_data]
-
-    module(*nd_args)
-
-    for nd, np in zip(out_tensors, ref_data):
-        tvm.testing.assert_allclose(nd.numpy(), np, rtol=1e-5, atol=1e-5)
-
-    module_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))]
-    module_outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-    h_module = te.hybrid.build(sch, module_args, module_outs)
-
-    return h_module, module_args, module_outs
-
-
-@script
-def outer_product(n, m, a, b):
-    """This is a simple outer product.
-    Actually this function is not required to be documented.
-    I write this docstring to test skipping docstring functionality.
-    """
-    c = output_tensor((n, m), a.dtype)
-    for i in range(n):
-        for j in range(m):
-            assert i < n and j < m, "index out of range!"
-            c[i, j] = a[i] * b[j]
-    return c
-
-
-@tvm.testing.skip_if_wheel_test
-# Test global function
-# Test bridge between frontend and backend
-def test_outer_product():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    a = te.placeholder((n,), name="a")
-    b = te.placeholder((m,), name="b")
-
-    try:
-        c = outer_product(n, m, a, b)
-        ir = c.op.body
-    except IOError as err:
-        assert sys.version_info[0] == 2 and str(err) == "could not get source code"
-        return
-
-    # Check for i in (0, n)
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i"
-    assert ir.min.value == 0
-    assert ir.extent.name == "n"
-    ibody = ir.body
-    assert isinstance(ibody, tvm.tir.For)
-    # Check for j in (0, m)
-    assert ibody.loop_var.name == "j"
-    assert ibody.min.value == 0
-    assert ibody.extent.name == "m"
-    # Check loop body
-    jblock = ibody.body
-    assert isinstance(jblock, tvm.tir.SeqStmt)
-    jbody = jblock[0]
-    assert isinstance(jbody, tvm.tir.AssertStmt)
-    assert isinstance(jbody.message, tvm.tir.StringImm)
-    assert jbody.message.value == "index out of range!"
-    jbody = jblock[1]
-    assert isinstance(jbody, tvm.tir.ProducerStore)
-    assert jbody.producer.op.name == "c"
-    assert len(jbody.indices) == 2
-    assert jbody.indices[0].name == "i"
-    assert jbody.indices[1].name == "j"
-    assert isinstance(jbody.value, tvm.tir.Mul)
-    mul = jbody.value
-    assert isinstance(mul.a, tvm.tir.ProducerLoad)
-    assert mul.a.producer.name == "a"
-    assert mul.b.producer.name == "b"
-
-    func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})
-    temp = utils.tempdir()
-    path = temp.relpath("%s.py" % func.name)
-    func.save(path)
-    func_ = te.hybrid.HybridModule()
-    func_.load(path)
-    run_and_check(func_, ins, {n: 99, m: 101}, outs=outs)
-
-    for key, _ in HYBRID_GLOBALS.items():
-        assert key not in globals().keys()
-        assert key not in outer_product.__globals__.keys()
-
-
-@tvm.testing.skip_if_wheel_test
-# Test local function
-# Test allocation of local variable
-def test_fanout():
-    @script
-    def fanout(n, a):
-        three = 3.0
-        b = output_tensor((a.shape[0] - 3,), a.dtype)
-        for i in range(a.shape[0] - 3):
-            sigma = 0.0
-            for j in range(3):
-                sigma += a[i + j]
-            sigma = sigma / three
-            b[i] = sigma
-        return b
-
-    n = te.size_var("n")
-    a = te.placeholder((n,), "float32", name="a")
-    try:
-        b = fanout(n, a)
-        ir = b.op.body
-    except IOError as err:
-        assert sys.version_info[0] == 2 and str(err) == "could not get source code"
-        return
-
-    # Check for i in (0, n-3)
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i"
-    assert ir.min.value == 0
-    tvm.ir.assert_structural_equal(ir.extent, n - 3)
-    # Check loopbody
-    abody = ir.body
-    assert isinstance(abody, tvm.tir.ProducerRealize)
-    assert abody.bounds[0].min.value == 0
-    assert abody.bounds[0].extent.value == 1
-    assert abody.producer.op.name == "sigma"
-    # Check i loop body
-    rbody = abody.body
-    assert isinstance(rbody[0], tvm.tir.ProducerStore)
-    assert rbody[0].producer.op.name == "sigma"
-    assert len(rbody[0].indices) == 1
-    assert rbody[0].indices[0].value == 0
-    # Check fanout loop
-    jloop = rbody[1]
-    assert jloop.loop_var.name == "j"
-    assert jloop.min.value == 0
-    assert jloop.extent.value == 3
-    jbody = jloop.body
-    assert isinstance(jbody, tvm.tir.ProducerStore)
-    assert len(jbody.indices) == 1
-    assert jbody.indices[0].value == 0
-    assert jbody.producer.op.name == "sigma"
-    assert isinstance(jbody.value, tvm.tir.Add)
-    value = jbody.value
-    assert isinstance(value.a, tvm.tir.ProducerLoad)
-    assert value.a.producer.name == "sigma"
-    assert len(value.a.indices) == 1
-    assert value.a.indices[0].value == 0
-    assert value.b.producer.name == "a"
-    assert len(value.b.indices) == 1
-    tvm.ir.assert_structural_equal(value.b.indices[0], ir.loop_var + jloop.loop_var)
-    divide = rbody[2]
-    assert isinstance(divide, tvm.tir.ProducerStore)
-    assert len(divide.indices) == 1
-    assert divide.indices[0].value == 0
-    value = divide.value
-    assert isinstance(value, tvm.tir.Mul)
-    assert value.a.producer.name == "sigma"
-    assert len(value.a.indices) == 1
-    assert value.a.indices[0].value == 0
-    assert abs(value.b.value - (1 / 3.0)) < 1e-5
-    write = rbody[3]
-    assert isinstance(write, tvm.tir.ProducerStore)
-    assert write.producer.op.name == "b"
-    assert write.value.producer.name == "sigma"
-    assert len(write.value.indices) == 1
-    assert write.value.indices[0].value == 0
-
-    func, ins, outs = run_and_check(fanout, [n, a], {n: 10})
-    run_and_check(func, ins, {n: 10}, outs=outs)
-
-
-def test_looptype():
-    @script
-    def looptype(a, b, c):
-        d = output_tensor((16,), "int32")
-        e = output_tensor((16,), "int32")
-        f = output_tensor((16,), "int32")
-        for i in parallel(16):
-            d[i] = a[i]
-        for j in vectorize(16):
-            e[j] = b[j]
-        for k in unroll(16):
-            f[k] = c[k]
-        return d, e, f
-
-    a = te.placeholder((16,), name="a", dtype="int32")
-    b = te.placeholder((16,), name="b", dtype="int32")
-    c = te.placeholder((16,), name="c", dtype="int32")
-    try:
-        d, e, f = looptype(a, b, c)
-        ir = d.op.body
-    except:
-        return
-    iloop = ir[0]
-    jloop = ir[1]
-    kloop = ir[2]
-    assert iloop.kind == tvm.tir.ForKind.PARALLEL
-    assert jloop.kind == tvm.tir.ForKind.VECTORIZED
-    assert kloop.kind == tvm.tir.ForKind.UNROLLED
-
-    func, ins, outs = run_and_check(looptype, [a, b, c])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_if():
-    @script
-    def if_then_else(a):
-        b = output_tensor((10,), "int32")
-        c = output_tensor((10,), "int32")
-        for i in range(10):
-            if i % 2 == 0:
-                c[i] = a[i]
-            else:
-                c[i] = b[i]
-        for i in unroll(10):
-            b[i] = -1 if i % 2 == 0 else 1
-        return b, c
-
-    a = te.placeholder((10,), dtype="int32", name="a")
-
-    func, ins, outs = run_and_check(if_then_else, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @script
-    def if_triple_condition(a):
-        b = output_tensor((10,), "int32")
-        for i in range(10):
-            if 0 <= i < 5:
-                b[i] = a[i]
-            else:
-                b[i] = a[i] + 1
-        return b
-
-    func, ins, outs = run_and_check(if_triple_condition, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @script
-    def if_and(a):
-        b = output_tensor((10,), "int32")
-        for i in range(10):
-            if i >= 0 and i < 5:
-                b[i] = a[i]
-            else:
-                b[i] = a[i] + 1
-        return b
-
-    func, ins, outs = run_and_check(if_and, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_bind():
-    @script
-    def vec_add(a, b):
-        c = output_tensor((1000,), "float32")
-        for tx in bind("threadIdx.x", 1000):
-            c[tx] = a[tx] + b[tx]
-        return c
-
-    a = te.placeholder((1000,), dtype="float32", name="a")
-    b = te.placeholder((1000,), dtype="float32", name="b")
-    func, ins, outs = run_and_check(vec_add, [a, b], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @script
-    def raw(a, b):
-        c = output_tensor((1000,), "float32")
-        for i in range(1000):
-            c[i] = a[i] + b[i]
-        return c
-
-    c = raw(a, b)
-    sch = te.create_schedule(c.op)
-    x = te.thread_axis("threadIdx.x")
-    sch[c].bind(c.op.axis[0], x)
-    func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @te.hybrid.script
-    def foo(a):
-        c = output_tensor((a.shape[0],), a.dtype)
-        total = allocate((1,), a.dtype, "local")
-        len_i = a.shape[0]
-        len_j = a.shape[1]
-        for i in bind("threadIdx.x", len_i):
-            total[0] = 0.0
-            for k in const_range(len_j):
-                total[0] += a[i, k]
-            c[i] = total[0]
-
-        return c
-
-    a = te.placeholder((8, 4), "float32")
-    c = foo(a)
-    s = te.create_schedule(c.op)
-    ir = tvm.lower(s, [a, c])
-
-    func, ins, outs = run_and_check(foo, [a], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-    @te.hybrid.script
-    def max_threads(a):
-        b = output_tensor(a.shape, a.dtype)
-        n = a.shape[0]
-        m = max_num_threads(True)
-        for i in bind("threadIdx.x", m):
-            for j in bind("blockIdx.x", ceil_div(n, m)):
-                if i * m + j < n:
-                    b[i * m + j] = a[i * m + j] + a[i * m + j]
-        return b
-
-    a = te.placeholder((10000,), "float32")
-    with tvm.target.Target("cuda"):
-        func, ins, outs = run_and_check(max_threads, [a], target="cuda")
-        run_and_check(func, ins, outs=outs, target="cuda")
-
-
-@tvm.testing.skip_if_wheel_test
-def test_math_intrin():
-    @script
-    def intrin_real(a):
-        b = output_tensor((8,), "float32")
-        b[0] = sqrt(a[0])
-        b[1] = log(a[1])
-        b[2] = exp(a[2])
-        b[3] = sigmoid(a[3])
-        b[4] = power(a[4], a[5])
-        b[5] = tanh(a[5])
-        b[6] = min(a[4], a[5])
-        b[7] = max(a[5], a[6])
-        return b
-
-    a8 = te.placeholder((8,), dtype="float32", name="a")
-    b8 = intrin_real(a8)
-    sch = te.create_schedule(b8.op)
-    func = tvm.build(sch, [a8, b8])
-    assert func
-    a = numpy.arange(2, 10).astype("float32")
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(numpy.zeros((8,), dtype="float32"))
-    b = intrin_real(a)
-    func(tvm_a, tvm_b)
-    tvm.testing.assert_allclose(b, tvm_b.numpy(), rtol=1e-5)
-
-    @script
-    def intrin_int(a):
-        b = output_tensor((1,), "int32")
-        b[0] = popcount(a[0])
-        return b
-
-    a1 = te.placeholder((1,), dtype="int32")
-    b1 = intrin_int(a1)
-    sch = te.create_schedule(b1.op)
-    func = tvm.build(sch, [a1, b1])
-    assert func
-    a = numpy.array([114514]).astype("int32")
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(numpy.array([0]).astype("int32"))
-    b = intrin_int(a)
-    func(tvm_a, tvm_b)
-    assert tvm_b.numpy()[0] == b[0]
-
-
-@tvm.testing.skip_if_wheel_test
-# test non caconical loops
-def test_non_zero():
-    @te.hybrid.script
-    def blur(a):
-        b = output_tensor((30, 30), "float32")
-        for i in range(2, 32):
-            for j in range(2, 32):
-                s = 0.0
-                for di in range(3):
-                    for dj in range(3):
-                        s += a[i - di, j - dj]
-                b[i - 2, j - 2] = s / 9.0
-        return b
-
-    a = te.placeholder((32, 32), "float32", "a")
-    func, ins, outs = run_and_check(blur, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def triangle(a, b):
-        c = output_tensor((10, 10), dtype="float32")
-        for i in range(10):
-            for j in range(i, 10):
-                c[i, j] = a[i] * b[j]
-        return c
-
-    a = te.placeholder((10,), dtype="float32", name="a")
-    b = te.placeholder((10,), dtype="float32", name="b")
-
-    func, ins, outs = run_and_check(triangle, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_allocate():
-    @te.hybrid.script
-    def blur2d(a):
-        b = output_tensor((30, 30), "float32")
-        for i in range(30):
-            ha = allocate((3, 30), "float32")
-            for j in range(3):
-                for k in range(30):
-                    ha[j, k] = a[i + j, k] + a[i + j, k + 1] + a[i + j, k + 2]
-            for j in range(30):
-                b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
-        return b
-
-    a = te.placeholder((32, 32), "float32", "a")
-    b = blur2d(a)
-    sch = te.create_schedule(b.op)
-    func, ins, outs = run_and_check(blur2d, [a])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def share_vec_add(a, b):
-        c = output_tensor((256,), "float32")
-        shared = allocate((256,), "float32", "shared")
-        for i in bind("threadIdx.x", 256):
-            shared[i] = a[i]
-        local = allocate((256,), "float32", "local")
-        for i in bind("threadIdx.x", 256):
-            local[i] = b[i]
-        for i in bind("threadIdx.x", 256):
-            c[i] = shared[i] + local[i]
-        return c
-
-    a = te.placeholder((256,), dtype="float32", name="a")
-    b = te.placeholder((256,), dtype="float32", name="b")
-    c = share_vec_add(a, b)
-    func, ins, outs = run_and_check(share_vec_add, [a, b], target="cuda")
-    run_and_check(func, ins, outs=outs, target="cuda")
-
-
-@tvm.testing.skip_if_wheel_test
-def test_upstream():
-    @te.hybrid.script
-    def upstream(a):
-        b = output_tensor((20,), "float32")
-        for i in range(20):
-            b[i] = a[i] * i
-        return b
-
-    a = te.placeholder((20,), "float32")
-    b = te.placeholder((20,), "float32")
-    c = te.compute((20,), lambda x: a[x] + b[x])
-    d = upstream(c)
-    sch = te.create_schedule([c.op, d.op])
-    ir = tvm.lower(sch, [a, b, d])
-    func = tvm.build(sch, [a, b, d])
-    assert func
-
-    a = numpy.random.randn(20).astype("float32")
-    b = numpy.random.randn(20).astype("float32")
-    ref = numpy.zeros((20,), "float32")
-    for i in range(20):
-        ref[i] = (a[i] + b[i]) * i
-
-    tvm_a = tvm.nd.array(a)
-    tvm_b = tvm.nd.array(b)
-    tvm_d = tvm.nd.array(numpy.zeros((20,)).astype("float32"))
-
-    func(tvm_a, tvm_b, tvm_d)
-    tvm.testing.assert_allclose(tvm_d.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_downstream():
-    @te.hybrid.script
-    def downstream(a):
-        b = output_tensor((20,), "float32")
-        for i in range(20):
-            b[i] = a[i] * i
-        return b
-
-    a = te.placeholder((20,), "float32")
-    b = downstream(a)
-    c = te.compute((20,), lambda x: b[x] + 1.0)
-
-    sch = te.create_schedule(c.op)
-    module = tvm.build(sch, [a, c])
-    assert module
-
-    a = numpy.random.randn(20).astype("float32")
-    ref = numpy.zeros((20,)).astype("float32")
-    for i in range(20):
-        ref[i] = (a[i] * i) + 1.0
-
-    tvm_a = tvm.nd.array(a)
-    tvm_c = tvm.nd.array(numpy.zeros((20,)).astype("float32"))
-    module(tvm_a, tvm_c)
-    tvm.testing.assert_allclose(tvm_c.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_const_param():
-    @te.hybrid.script
-    def add_something(a, b):
-        c = output_tensor((11,), "int32")
-        for i in range(11):
-            c[i] = a[i] + b
-        return c
-
-    a = te.placeholder((11,), dtype="int32", name="a")
-    b = tvm.tir.const(11, "int32")
-    c = add_something(a, b)
-    sch = te.create_schedule(c.op)
-    module = tvm.build(sch, [a, c], "llvm")
-    assert module
-
-    np_a = numpy.arange(11).astype("int32")
-    np_b = 11
-    np_c = numpy.zeros((11,)).astype("int32")
-
-    nd_a = tvm.nd.array(np_a)
-    nd_c = tvm.nd.array(numpy.zeros((11,)).astype("int32"))
-    module(nd_a, nd_c)
-    ref = add_something(np_a, 11)
-
-    tvm.testing.assert_allclose(nd_c.numpy(), ref, 1e-5, 1e-5)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_value_index():
-    @te.hybrid.script
-    def kernel_a(a):
-        b = output_tensor((16,), "int32")
-        c = output_tensor((4, 4), "int32")
-        for i in range(16):
-            b[i] = a[i] + 2
-            c[i // 4, i % 4] = a[i] + 1
-        return b, c
-
-    @te.hybrid.script
-    def kernel_b(b, a):
-        c = output_tensor((4, 4), "int32")
-        for i in range(4):
-            for j in range(4):
-                c[i, j] = a[i * 4 + j] * b[i, j]
-        return c
-
-    a = te.placeholder((16,), "int32")
-    b, c = kernel_a(a)
-    d = kernel_b(c, b)
-    sch = te.create_schedule(d.op)
-    module = tvm.build(sch, [a, d])
-    assert module
-
-    np_a = numpy.arange(16).astype("int32")
-    np_b, np_c = kernel_a(np_a)
-    ref = kernel_b(np_c, np_b)
-
-    res = tvm.nd.array(numpy.zeros((4, 4)).astype("int32"))
-    module(tvm.nd.array(np_a), res)
-    tvm.testing.assert_allclose(res.numpy(), ref)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_func_call():
-    @te.hybrid.script
-    def foo(a, b):
-        for i in range(len(a)):
-            a[i] = i + 1.0
-        for i in range(len(a)):
-            b[i] = i + 1.0
-        c = outer_product(10, 10, a, b)
-        d = output_tensor(c.shape, c.dtype)
-        for i in range(10):
-            for j in range(10):
-                d[i, j] = c[i, j] + i * j
-        return d
-
-    a = te.placeholder((10,), name="a")
-    b = te.placeholder((10,), name="b")
-    func, ins, outs = run_and_check(foo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_bool():
-    @te.hybrid.script
-    def foo(a):
-        b = output_tensor(a.shape, a.dtype)
-        b[0] = 1.2
-        for i in range(1, a.shape[0] - 1):
-            if a[i] * a[i - 1] < a[i] or a[i] * a[i - 1] < a[i - 1] or i * a[i] == a[i]:
-                b[i] = a[i]
-            else:
-                b[i] = 0.0
-        return b
-
-    a = te.placeholder((10,), name="a")
-    func, ins, outs = run_and_check(foo, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_const_range():
-    @te.hybrid.script
-    def foo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        d = output_tensor(a.shape, "int32")
-
-        for i in const_range(2):
-            for j in const_range(5):
-                c[i, j] = float32(int32(a[i, j]) + b[i, j])
-
-        for i in const_range(len(b)):
-            for j in const_range(len(b[0])):
-                d[i, j] = int32(a[i, j] + b[i, j])
-
-        return c, d
-
-    a = te.placeholder((2, 5), name="a", dtype="float32")
-    b = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]
-    func, ins, outs = run_and_check(foo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def goo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        len_b = len(b)
-        for i in const_range(len_b * 2):
-            if i < len_b:
-                c[i] = a[i] + b[i]
-            else:
-                c[i - len_b] = a[i - len_b] + b[i - len_b]
-        return c
-
-    a = te.placeholder((5,), name="a", dtype="int32")
-    b = [1, 2, 3, 4, 5]
-    c = goo(a, tvm.runtime.convert(b))
-    sch = te.create_schedule(c.op)
-    func, ins, outs = run_and_check(goo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-    @te.hybrid.script
-    def hoo(a, b):
-        c = output_tensor(a.shape, a.dtype)
-        len_b = len(b)
-        for i in range(a.shape[0]):
-            for j in const_range(len(b)):
-                d = a[i] * b[j]
-                d += a[i] + b[j]
-                c[i] = d
-        return c
-
-    a = te.placeholder((5,), name="a", dtype="int32")
-    b = [1, 2, 3, 4, 5]
-    func, ins, outs = run_and_check(hoo, [a, b])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_schedule():
-    @script
-    def outer_product(a, b):
-        c = output_tensor((64, 64), a.dtype)
-        for i in range(64):
-            for j in range(64):
-                c[i, j] = a[i] * b[j]
-        return c
-
-    a = te.placeholder((64,), name="a", dtype="float32")
-    b = te.placeholder((64,), name="b", dtype="float32")
-    c = outer_product(a, b)
-
-    # Test perfect loop split
-    # Test loop reorder
-    # Test loop annotation
-    sch = te.create_schedule(c.op)
-    i, j = c.op.axis
-    io, ii = sch[c].split(i, 4)
-    sch[c].parallel(ii)
-    jo, ji = sch[c].split(j, 4)
-    joo, joi = sch[c].split(jo, 4)
-    sch[c].vectorize(ji)
-    sch[c].reorder(ii, io, joo, joi, ji)
-    ir = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(ir, tvm.tir.AttrStmt)
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.inner"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.outer"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "j.outer.outer"
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "j.outer.inner"
-    ir = ir.body
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test fuse
-    sch = te.create_schedule(c.op)
-    sch[c].fuse(c.op.axis[0], c.op.axis[1])
-    ir = tvm.lower(sch, [a, b, c])["main"].body
-    assert isinstance(ir, tvm.tir.AttrStmt)
-    ir = ir.body
-    assert isinstance(ir, tvm.tir.For)
-    assert ir.loop_var.name == "i.j.fused"
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test imperfect loop split
-    sch = te.create_schedule(c.op)
-    sch[c].split(c.op.axis[0], 3)
-    ir = tvm.lower(sch, [a, b, c], simple_mode=True)
-    func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
-    run_and_check(func, ins, outs=outs)
-
-    # Test loop binds
-
-
-@tvm.testing.skip_if_wheel_test
-def test_capture():
-    n = 8
-
-    constant_tuple = (10, n)
-    constant_list = [[1, 2], [3, n]]
-    const_value = 1
-
-    @te.hybrid.script
-    def add_something(a):
-        c = output_tensor((constant_tuple[1],), "int32")
-        for i in range(constant_tuple[1]):
-            c[i] = a[i] + constant_list[1][const_value]
-        return c
-
-    a = te.placeholder((n,), dtype="int32", name="a")
-
-    func, ins, outs = run_and_check(add_something, [a])
-    run_and_check(func, ins, outs=outs)
-
-
-@tvm.testing.skip_if_wheel_test
-def test_array_inputs():
-    @script
-    def sum_array(inputs):
-        out = output_tensor((10,), inputs[0].dtype)
-        n = len(inputs)
-        for i in range(10):
-            for j in const_range(n):
-                out[i] += inputs[j][i]
-        return out
-
-    n = 5
-    inputs = []
-    for i in range(n):
-        inputs.append(te.placeholder((10,), name="t%s" % i, dtype="float32"))
-
-    out = sum_array(tvm.runtime.convert(inputs))
-    assert len(out.op.inputs) == n
-
-    sch = te.create_schedule(out.op)
-    mod = tvm.build(sch, inputs + [out], target="llvm")
-    assert mod
-
-    input_nd = []
-    out_ref = numpy.zeros((10,))
-    for _ in range(n):
-        arr = numpy.random.uniform(size=(10,)).astype("float32")
-        input_nd.append(tvm.nd.array(arr))
-        out_ref += arr
-    out_nd = tvm.nd.array(numpy.zeros((10,), "float32"))
-    mod(*input_nd, out_nd)
-    tvm.testing.assert_allclose(out_nd.numpy(), out_ref)
-
-
-if __name__ == "__main__":
-    test_outer_product()
-    test_fanout()
-    test_looptype()
-    test_if()
-    test_bind()
-    test_math_intrin()
-    test_non_zero()
-    test_allocate()
-    test_upstream()
-    test_downstream()
-    test_const_param()
-    test_value_index()
-    test_func_call()
-    test_bool()
-    test_const_range()
-    test_schedule()
-    test_capture()
-    test_array_inputs()
-    # TODO:
-    # test_inplace()
diff --git a/tests/python/te/test_te_schedule.py b/tests/python/te/test_te_schedule.py
deleted file mode 100644
index d46db2b702c0..000000000000
--- a/tests/python/te/test_te_schedule.py
+++ /dev/null
@@ -1,382 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pickle as pkl
-
-import pytest
-import tvm
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_schedule_create():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.placeholder((n, l), name="B")
-    AA = te.compute((m, l), lambda i, j: A[i, j])
-    T = te.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
-    s = te.create_schedule(T.op)
-    s[AA].set_scope("shared")
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    xi1, xi2 = s[T].split(xi, factor=2)
-    s[AA].compute_at(s[T], xi1)
-    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
-    s[T].reorder(xi2, xi1)
-    assert T.op.axis[1] in s[T].leaf_iter_vars
-
-    # save load json
-    json_str = tvm.ir.save_json(s)
-    s_loaded = tvm.ir.load_json(json_str)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)
-
-    # pickle unpickle
-    dump = pkl.dumps(s)
-    s_loaded = pkl.loads(dump)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-    assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body)
-
-
-def test_reorder():
-    m = te.size_var("m")
-    A = te.placeholder((m,), name="A")
-    T = te.compute(m, lambda i: A[i + 1])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    xi1, xi2 = s[T].split(xi, factor=2)
-    order = (xi2, xi1, xo)
-    assert tuple(s[T].leaf_iter_vars) != order
-    s[T].reorder(*order)
-    assert tuple(s[T].leaf_iter_vars) == order
-    try:
-        # pass duplicate IterVar
-        # must raise an error
-        s[T].reorder(xi2, xi1, xi2)
-        assert False
-    except tvm.error.TVMError:
-        pass
-
-
-def test_split():
-    m = te.size_var("m")
-    A = te.placeholder((m,), name="A")
-    T = te.compute((m,), lambda i: A[i])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    assert tuple(s[T].leaf_iter_vars) == (xo, xi)
-
-
-def test_tile():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    assert tuple(s[T].leaf_iter_vars) == (xo, yo, xi, yi)
-
-
-def test_fuse():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    fused = s[T].fuse(xo, yo)
-    assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (fused, xi, yi)
-
-
-def test_fuse_with_split():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    fused = s[T].fuse(xi, y)
-    assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (xo, fused)
-
-
-def test_fuse_with_out_of_order_axis():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-
-    with pytest.raises(RuntimeError):
-        fused = s[T].fuse(xo, y)  # should throw here
-
-
-def test_fuse_with_out_of_order_axis_with_reorder():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].reorder(y, xo, xi)
-    fused = s[T].fuse(y, xo)  # should be ok
-
-    s = te.create_schedule(T.op)
-    y = T.op.axis[1]
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].reorder(y, xo, xi)
-
-    with pytest.raises(RuntimeError):
-        fused = s[T].fuse(y, xi)  # should throw here
-
-
-def test_singleton():
-    A = te.placeholder((), name="A")
-    T = te.compute((), lambda: A() + 1)
-    s = te.create_schedule(T.op)
-    fused = s[T].fuse()
-    assert any(isinstance(x, tvm.te.schedule.Singleton) for x in s[T].relations)
-    assert tuple(s[T].leaf_iter_vars) == (fused,)
-    dump = pkl.dumps(s)
-    s_loaded = pkl.loads(dump)
-    assert isinstance(s_loaded, tvm.te.schedule.Schedule)
-
-
-def test_vectorize():
-    m = te.size_var("m")
-    n = te.size_var("n")
-    A = te.placeholder((m, n), name="A")
-    T = te.compute((m, n), lambda i, j: A[i, j])
-
-    s = te.create_schedule(T.op)
-    xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5)
-    s[T].vectorize(yi)
-    s[T].unroll(xi)
-    UNROLL = tvm.te.schedule.IterVar.Unrolled
-    VECTORIZE = tvm.te.schedule.IterVar.Vectorized
-    assert s[T].iter_var_attrs[xi].iter_type == UNROLL
-    assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE
-
-
-def test_vectorize_commreduce():
-    V = te.placeholder((128,), name="V")
-    ax = te.reduce_axis((0, 128), name="ax")
-    O = te.compute((1,), lambda _: te.sum(V[ax], axis=[ax]))
-    s = te.create_schedule(O.op)
-    with pytest.raises(RuntimeError):
-        s[O].vectorize(ax)  # should throw here
-
-
-def test_pragma():
-    m = 100
-    A = te.placeholder((m,), name="A")
-    T = te.compute((m,), lambda i: A[i])
-
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=10)
-    s[T].pragma(xo, "pragma1")
-    s[T].pragma(xi, "vectorize")
-    VECTORIZE = tvm.te.schedule.IterVar.Vectorized
-    assert s[T].iter_var_attrs[xo].pragma_keys[0].value == "pragma1"
-    assert s[T].iter_var_attrs[xi].iter_type == VECTORIZE
-
-
-def test_rfactor():
-    n = te.size_var("n")
-    k1 = te.reduce_axis((0, n), name="k1")
-    k2 = te.reduce_axis((0, n), name="k2")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.compute((n,), lambda i: te.sum(A[i, k1, k2], axis=[k1, k2]))
-    # normal schedule
-    s = te.create_schedule(B.op)
-    BF = s.rfactor(B, k1)
-    assert tuple(BF.shape) == (n, n)
-    assert set(BF.op.body[0].axis) == set([k2])
-    assert s[B].op.body[0].axis[0].dom.extent == n
-    assert len(s[B].all_iter_vars) == 2
-    # schedule with split
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(k1, factor=4)
-    xo, xi = s[B].split(B.op.axis[0], factor=8)
-    BF = s.rfactor(B, ki)
-    assert BF.shape[0].value == 4
-    assert BF.shape[1] == n
-    assert BF.op.body[0].axis[0] == k2
-    assert BF.op.body[0].axis[1].var == ko.var
-    assert s[B].op.body[0].axis[0].dom.extent.value == 4
-    # schedule with factor_axis
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(k1, factor=4)
-    xo, xi = s[B].split(B.op.axis[0], factor=8)
-    BF = s.rfactor(B, ki, 1)
-    assert n == BF.shape[0]
-    assert BF.shape[1].value == 4
-    assert BF.op.body[0].axis[0] == k2
-    assert BF.op.body[0].axis[1].var == ko.var
-    assert s[B].op.body[0].axis[0].dom.extent.value == 4
-
-
-def test_tensor_intrin():
-    n = 16
-    x = te.placeholder((n,), name="x")
-    y = te.placeholder((n,), name="y")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-
-    def intrin_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
-        assert ins[0].shape[0].value == n
-        return tvm.tir.call_packed("vadd", ins[0].data, outs[0].data, ins[0].shape[0])
-
-    intrin = te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-    assert intrin.op == z.op
-    assert intrin.reduce_init is None
-    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
-    assert intrin.buffers[0].shape[0].value == n
-    m = 32
-    X = te.placeholder((m,), name="X")
-    Y = te.placeholder((m,), name="Y")
-    Z = te.compute(X.shape, lambda i: X[i] + Y[i], name="Z")
-    s = te.create_schedule(Z.op)
-    xo, xi = s[Z].split(Z.op.axis[0], factor=n)
-    s[Z].tensorize(xi, intrin)
-    stmt = tvm.lower(s, [X, Y, Z])["main"].body
-    assert isinstance(stmt.body, tvm.tir.Evaluate)
-    assert str(stmt.body.value.args[0]) == '"vadd"'
-    assert str(stmt.body.value.args[1]) == "X"
-    assert str(stmt.body.value.args[2]) == "Z"
-    assert s[Z].iter_var_attrs[xi].tensor_intrin == intrin
-    assert s[Z].iter_var_attrs[xi].iter_type == tvm.te.schedule.IterVar.Tensorized
-
-
-def test_tensor_intrin_scalar_params():
-    n = te.size_var("n")
-    x = te.placeholder((n,), name="x")
-    v = te.size_var("v")
-    w = te.size_var("w")
-    z = te.compute((n,), lambda i: x[i] * v + w, name="z")
-
-    def intrin_func(ins, outs, sp):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
-        assert ins[0].shape[0] == n
-        assert sp[0] == v
-        assert sp[1] == w
-        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])
-
-    intrin = te.decl_tensor_intrin(
-        z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1}
-    )
-    assert intrin.op == z.op
-    assert intrin.reduce_init is None
-    assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
-    assert intrin.buffers[0].shape[0] == n
-    assert tuple(intrin.scalar_params) == tuple((v, w))
-
-    A = te.placeholder((10, 10), name="A")
-    # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
-    C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C")
-    s = te.create_schedule(C.op)
-    stmt = tvm.lower(s, [A, C])["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.Evaluate)
-    assert len(stmt.body.body.value.args) == 5
-    assert str(stmt.body.body.value.args[3]) == "i * i"
-    assert str(stmt.body.body.value.args[4]) == "i + j"
-
-
-def test_legalize_invalid_attach():
-    A = te.compute((10, 10), lambda i, j: 1.0, name="A")
-    B = te.compute((10, 10), lambda i, j: A[i][j], name="B")
-
-    # Case 1: Split an axis which is the target of a compute_at
-    s = te.create_schedule([B.op])
-    s[A].compute_at(s[B], B.op.axis[1])
-    s[B].split(B.op.axis[1], 2)
-
-    stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body
-    assert isinstance(stmt.body.body, tvm.tir.stmt.For)
-
-    # Case 2: Fuse an axis which is the target of a compute_at
-    s = te.create_schedule([B.op])
-    s[A].compute_at(s[B], B.op.axis[1])
-    s[B].fuse(B.op.axis[0], B.op.axis[1])
-    stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body
-    assert isinstance(stmt, tvm.tir.stmt.For)
-
-
-def test_compute_at():
-    def add():
-        shape = (16, 16)
-        A = tvm.te.compute(shape, lambda *i: 1.0, name="A")
-        B = tvm.te.compute(shape, lambda *i: 2.0, name="B")
-        C = tvm.te.compute(shape, lambda *i: A(*i) + B(*i), name="C")
-        return A, B, C
-
-    def invalid_compute_at_self():
-        A, B, C = add()
-        s = tvm.te.create_schedule(C.op)
-        s[C].compute_at(s[C], C.op.axis[0])
-        with pytest.raises(RuntimeError):
-            tvm.lower(s, [A, B], simple_mode=True)
-
-    def invalid_compute_at_loop():
-        A, B, C = add()
-        s = tvm.te.create_schedule(C.op)
-        s[A].compute_at(s[C], C.op.axis[0])
-        s[C].compute_at(s[A], A.op.axis[0])
-        with pytest.raises(RuntimeError):
-            tvm.lower(s, [C], simple_mode=True)
-
-    invalid_compute_at_self()
-    invalid_compute_at_loop()
-
-
-@pytest.mark.parametrize("split_factor", [4, 4 * tvm.tir.vscale()])
-@pytest.mark.parametrize("disable_predication", [True, False])
-def test_split_disable_predicate(split_factor, disable_predication):
-    A = te.placeholder((43,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 2, name="C")
-
-    sch = te.create_schedule(B.op)
-    (i,) = sch[B].op.axis
-    _, _ = sch[B].split(i, factor=split_factor, disable_predication=disable_predication)
-
-    mod = schedule_to_module(sch, [A, B], "main")
-
-    predicates = []
-
-    def _find_predicates(stmt):
-        if isinstance(stmt, tvm.tir.stmt.IfThenElse):
-            predicates.append(stmt)
-
-    tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _find_predicates)
-
-    assert bool(len(predicates)) != disable_predication
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/te/test_te_schedule_bound_inference.py b/tests/python/te/test_te_schedule_bound_inference.py
deleted file mode 100644
index c246ee9f4109..000000000000
--- a/tests/python/te/test_te_schedule_bound_inference.py
+++ /dev/null
@@ -1,512 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-
-
-def test_bound1():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule([A2.op])
-    xo, xi = s[A2].split(s[A2].op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 8
-
-
-def test_bound2():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-    s = te.create_schedule(A2.op)
-    xo, yo, xi, yi = s[A2].tile(A2.op.axis[0], A2.op.axis[1], 8, 8)
-    # test normalize not affecting schedule
-    _ = s.normalize()
-    s[A1].compute_at(s[A2], yo)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 8
-    assert bounds[A1.op.axis[1]].extent.value == 8
-
-
-def test_bound3():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].set_scope("shared")
-    xo, xi = s[A2].split(A2.op.axis[0], 32)
-    xi0, xi1 = s[A2].split(xi, nparts=16)
-    s[A2].bind(xi0, te.thread_axis("threadIdx.x"))
-    yo, yi = s[A2].split(A2.op.axis[1], 16)
-    # test normalize not affecting schedule
-    _ = s.normalize()
-    s[A2].reorder(xo, xi0, yo, xi1, yi)
-    s[A1].compute_at(s[A2], yo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 32
-    assert bounds[A1.op.axis[1]].extent.value == 16
-
-
-def test_bound_split_ext_less_than_factor():
-    m = 8
-    I = te.placeholder((m,), name="I")
-    EF = te.compute((m,), lambda i: I[i] * 2, name="EF")
-    E = te.compute((m,), lambda i: EF[i] * 2, name="E")
-    s = te.create_schedule([E.op])
-    xo, xi = s[E].split(s[E].op.axis[0], factor=32)
-    s[EF].compute_at(s[E], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xi].extent.value == m
-
-
-def test_bound_split_ext_less_than_naprts():
-    m = 8
-    I = te.placeholder((m,), name="I")
-    EF = te.compute((m,), lambda i: I[i] * 2, name="EF")
-    E = te.compute((m,), lambda i: EF[i] * 2, name="E")
-    s = te.create_schedule([E.op])
-    xo, xi = s[E].split(s[E].op.axis[0], nparts=32)
-    s[EF].compute_at(s[E], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent.value == m
-
-
-def test_bound_split_divisible():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((8 * m, l), name="A")
-    B = te.compute((8 * m, l), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], 8)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent == m
-    assert bounds[xi].extent.value == 8
-
-
-def test_bound_tile_divisible():
-    m = te.var("m")
-    l = te.var("l")
-    shape = (8 * m, 32 * l)
-    A = te.placeholder(shape, name="A")
-    B = te.compute(shape, lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], 8, 32)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[xo].extent == m
-    assert bounds[xi].extent.value == 8
-    assert bounds[yo].extent == l
-    assert bounds[yi].extent.value == 32
-
-
-def test_bound_fusesplit1():
-    m = te.var("m")
-    l = te.var("l")
-    split1 = te.var("s")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1])
-    xo, xi = s[A2].split(fused_axes, split1)
-    s[A1].compute_at(s[A2], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    idxdiv = tvm.tir.indexdiv
-    tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[0]].min, idxdiv(xo * split1, l))
-
-    expected_extent = idxdiv((xo + 1) * split1 - 1, l) - idxdiv(xo * split1, l) + 1
-    for i in range(1, 6):
-        for j in range(1, 6):
-            for k in range(1, 6):
-                vars = tvm.runtime.convert(
-                    {
-                        split1: tvm.tir.const(i, "int32"),
-                        l: tvm.tir.const(j, "int32"),
-                        xo.var: tvm.tir.const(k, "int32"),
-                    }
-                )
-                tvm.testing.assert_prim_expr_equal(
-                    tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars),
-                    tvm.tir.stmt_functor.substitute(expected_extent, vars),
-                )
-
-    tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[1]].extent, l)
-
-
-def test_bound_fusesplit2():
-    m = te.var("m")
-    l = tvm.runtime.convert(6)
-    split = tvm.runtime.convert(3)
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1])
-    xo, xi = s[A2].split(fused_axes, split)
-    s[A1].compute_at(s[A2], xo)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    vars = tvm.runtime.convert({xo.var: tvm.tir.const(5, "int32")})
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].min, vars), 2
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].min, vars), 3
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars), 1
-    )
-    tvm.testing.assert_prim_expr_equal(
-        tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].extent, vars), 3
-    )
-
-
-def test_bound_warp():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].set_scope("warp")
-    xo, xi = s[A2].split(A2.op.axis[0], 32)
-    xi0, xi1 = s[A2].split(xi, factor=16)
-    tx = te.thread_axis("threadIdx.x")
-    s[A2].bind(xi1, tx)
-    s[A2].bind(xi0, te.thread_axis("threadIdx.y"))
-    y = s[A2].op.axis[1]
-    s[A1].compute_at(s[A2], y)
-    xo, xi = s[A1].split(s[A1].op.axis[0], factor=16)
-    s[A1].bind(xi, tx)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[A1.op.axis[0]].extent.value == 16
-
-
-def test_bound_scan():
-    m = te.var("m")
-    n = te.var("n")
-    X = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: X[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
-    s_scan = tvm.te.scan(s_init, s_update, s_state)
-
-    assert tuple(s_scan.shape) == (m, n)
-    s = te.create_schedule(s_scan.op)
-    XX = s.cache_read(X, "local", s_update)
-    xo, xi = s[s_update].split(s_update.op.axis[1], factor=4)
-    s[XX].compute_at(s[s_update], xo)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    assert bounds[XX.op.axis[1]].extent.value == 4
-
-
-def test_bound_conv1d():
-    n = te.var("n")
-    A = te.compute((n + 2), lambda i: 1, name="A")
-
-    def computeB(ii):
-        i = ii + 1
-        return A[i - 1] + A[i] + A[i + 1]
-
-    B = te.compute(n, computeB, name="B")
-    s = te.create_schedule(B.op)
-    s[A].compute_at(s[B], B.op.axis[0])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A.op.axis[0]].extent.value == 3
-
-
-def test_bound_blur():
-    n = tvm.runtime.convert(12)
-    A = te.compute((n, n), lambda i, j: 1, name="A")
-
-    def computeB(ii, jj):
-        # set the correct center
-        i = ii + 1
-        j = jj + 1
-        return A[i][j] + A[i - 1][j] + A[i + 1][j] + A[i][j + 1] + A[i][j - 1]
-
-    B = te.compute((n - 2, n - 2), computeB, name="B")
-    s = te.create_schedule(B.op)
-    s[A].compute_at(s[B], B.op.axis[1])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A.op.axis[0]].extent.value == 3
-    assert bounds[A.op.axis[1]].extent.value == 3
-
-
-def test_bound_rfactor():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    k = te.reduce_axis((0, n))
-    B = te.compute((1,), lambda i: te.sum(A[k], axis=k, where=(i > 1)), name="B")
-    # schedule
-    s = te.create_schedule(B.op)
-    kf, ki = s[B].split(k, nparts=4)
-    BF = s.rfactor(B, kf)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-
-    assert bounds[BF.op.axis[0]].extent.value == 4
-    assert bounds[BF.op.axis[1]].extent.value == 1
-
-
-def test_bound_group_schedule():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    g.compute_at(s[x2], x2.op.axis[0])
-    assert s[x1].group == g
-    assert s[x].group == g
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[x.op.axis[0]].extent.value == 1
-    assert bounds[x.op.axis[1]].extent == n
-
-
-def test_bound_nest_group():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = te.create_schedule(x2.op)
-    g1 = s.create_group(outputs=x, inputs=x, include_inputs=True)
-    g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True)
-    assert s[x].group == g1
-    assert s[x1].group == g2
-    g2.compute_at(s[x2], x2.op.axis[0])
-    g1.compute_at(s[x1], s[x1].op.axis[1])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[x.op.axis[0]].extent.value == 1
-    assert bounds[x.op.axis[1]].extent.value == 1
-    assert bounds[x1.op.axis[0]].extent.value == 1
-    assert bounds[x1.op.axis[1]].extent == n
-
-
-def test_bound_nest_thread():
-    m = te.var("m")
-    A = te.placeholder((m), name="A")
-    A1 = te.compute((m,), lambda i: A[i], name="A1")
-    A2 = te.compute((m,), lambda i: A1[i] + 2, name="A2")
-    A3 = te.compute((m,), lambda i: A2[i] + 3, name="A3")
-
-    s = te.create_schedule(A3.op)
-    s[A2].set_scope("shared")
-    s[A1].set_scope("local")
-
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    bx, tx = s[A3].split(A3.op.axis[0], factor=32)
-    s[A3].bind(bx, block_x)
-    s[A3].bind(tx, thread_x)
-    s[A2].compute_at(s[A3], tx)
-    _, xi = s[A2].split(A2.op.axis[0], nparts=1)
-    s[A2].bind(xi, thread_x)
-    s[A1].compute_at(s[A3], tx)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[A1.op.axis[0]].extent.value == 1
-    assert bounds[A2.op.axis[0]].extent.value == 32
-    assert bounds[A3.op.axis[0]].extent == m
-
-
-def test_gemm_bound():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((n, n), name="B")
-    k = te.reduce_axis((0, n), name="k")
-    C = te.compute((n, n), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC")
-    # schedule
-    s = te.create_schedule(C.op)
-    xtile, ytile = 32, 32
-    scale = 8
-    num_thread = 8
-    block_factor = scale * num_thread
-    block_x = te.thread_axis("blockIdx.x")
-    thread_x = te.thread_axis("threadIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    thread_y = te.thread_axis("threadIdx.y")
-
-    CC = s.cache_write(C, "local")
-    AA = s.cache_read(A, "shared", [CC])
-    BB = s.cache_read(B, "shared", [CC])
-    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
-    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
-    s[C].reorder(by, bx, yi, xi)
-    s[C].bind(by, block_y)
-    s[C].bind(bx, block_x)
-    ty, yi = s[C].split(yi, nparts=num_thread)
-    tx, xi = s[C].split(xi, nparts=num_thread)
-    s[C].reorder(ty, tx, yi, xi)
-    s[C].bind(ty, thread_y)
-    s[C].bind(tx, thread_x)
-    yo, xo = CC.op.axis
-    s[CC].reorder(k, yo, xo)
-
-    s[CC].compute_at(s[C], tx)
-    s[AA].compute_at(s[CC], k)
-    s[BB].compute_at(s[CC], k)
-
-    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
-    tx, xi = s[AA].split(xi, nparts=num_thread)
-    s[AA].bind(ty, thread_y)
-    s[AA].bind(tx, thread_x)
-
-    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
-    tx, xi = s[BB].split(xi, nparts=num_thread)
-    s[BB].bind(ty, thread_y)
-    s[BB].bind(tx, thread_x)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[BB.op.axis[0]].extent.value == 64
-    assert bounds[AA.op.axis[0]].extent.value == 64
-    assert bounds[CC.op.axis[0]].extent.value == 8
-    assert bounds[CC.op.axis[1]].extent.value == 8
-
-
-def test_bound_tensor_compute_op():
-    def intrin_test():
-        m1 = te.var("m1")
-        n1 = te.var("n1")
-        a = te.placeholder((m1, n1), name="a")
-        c = te.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name="c")
-
-        Ab = tvm.tir.decl_buffer(a.shape, name="Abuf", offset_factor=1)
-        Cb = tvm.tir.decl_buffer(c.shape, name="Cbuf", offset_factor=1)
-
-        def intrin_func(ins, outs):
-            aa = ins[0]
-            cc = outs[0]
-
-            def _body():
-                ib = tvm.tir.ir_builder.create()
-                ib.emit(
-                    tvm.tir.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r"))
-                )
-                return ib.get()
-
-            return _body()
-
-        return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb})
-
-    test_func = intrin_test()
-    A = te.placeholder((20, 20), name="A")
-    B = te.compute(A.shape, lambda i, j: A[i, j], name="B")
-    C = te.compute((10, 20), lambda i: test_func(B[i:10, 0:20]), name="C")
-    s = te.create_schedule(C.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    assert bounds[B.op.axis[0]].extent.value == 10
-
-
-def test_bound_simplification_failure():
-    # Check that the bounds are not expanded
-    A = te.compute((2,), lambda j: j, "A")
-
-    def _check(B, A=A):
-        s = te.create_schedule(B.op)
-        s = s.normalize()
-        bounds = tvm.te.schedule.InferBound(s)
-        stmt = tvm.lower(s, [B, A], simple_mode=True)
-        if not bounds[A.op.axis[0]].extent.value <= 2:
-            print(stmt)
-            assert bounds[A.op.axis[0]].extent.value <= 2
-
-    tdiv = tvm.tir.truncdiv
-    # These are hard to simplify, moreover we don't simplify them
-    _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.min(-3 * i, -2 * i)]))
-    _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.max(-3 * i, -4 * i)]))
-    _check(te.compute((10,), lambda i: A[-2 * tdiv(i, 2) - tvm.te.min(i, 0 - i)]))
-    _check(te.compute((10,), lambda i: A[i + (0 - i)]))
-    # This would cause out of bounds, but we nevertheless include it
-    _check(te.compute((10,), lambda i: A[i]))
-
-
-def test_bound_block():
-    def _check(shape, expected, block_size=4):
-        N, C, H, W = shape
-        tail = C % block_size
-        chunks = C // block_size
-        if tail != 0:
-            chunks += 1
-        A = te.placeholder((N, C, H, W), name="A")
-        pad_value = tvm.tir.const(0, A.dtype)
-
-        def _reorder_data_nchw(*indices):
-            condition = []
-            condition.append(indices[1] == chunks - 1)
-            condition.append(indices[4] >= tail)
-            condition = tvm.tir.all(*condition)
-            return tvm.tir.if_then_else(
-                condition,
-                pad_value,
-                A[indices[0], indices[1] * block_size + indices[4], indices[2], indices[3]],
-            )
-
-        repack = te.compute((N, chunks, H, W, block_size), _reorder_data_nchw, name="repack")
-        B = te.compute(
-            (N, C, H, W),
-            lambda n, c, h, w: repack[n, c // block_size, h, w, c % block_size],
-            name="back_repack",
-        )
-        s = te.create_schedule([B.op])
-        bounds = tvm.te.schedule.InferBound(s)
-        # Block for intermediate compute function should be equal to 4 for all cases except than number of channels is less than 4
-        assert bounds[repack.op.axis[4]].extent.value == expected
-
-    _check((1, 4, 6, 6), 4)
-    _check((1, 7, 6, 6), 4)
-    _check((1, 3, 6, 6), 3)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/te/test_te_schedule_bound_inference_tiling.py b/tests/python/te/test_te_schedule_bound_inference_tiling.py
deleted file mode 100644
index 039fe08cd328..000000000000
--- a/tests/python/te/test_te_schedule_bound_inference_tiling.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_bound_tile_mod():
-    def compute(M_tiles, N_tiles, factor, dtype):
-        # Algo
-        M = M_tiles * factor
-        N = N_tiles * factor
-
-        A = tvm.te.placeholder((N, M), name="A", dtype=dtype)
-        C = tvm.te.compute((N, M), lambda n, m: A[n, m], name="C")
-        s = tvm.te.create_schedule(C.op)
-
-        return s, A, C
-
-    def schedule(s, factor, padding, A, C):
-        C_local = s.cache_write(C, "local")
-
-        n, m = C.op.axis
-        bn, bm, ni, mi = s[C].tile(n, m, factor, factor)
-        nio, nii = s[C].split(ni, 2)
-        n = s[C].fuse(nii, mi)
-        C_shared = s.cache_write(C, "shared")
-        bn, bm, ni, mi = C_shared.op.axis
-        s[C_shared].storage_align(ni, factor * 2, padding)
-
-        n, m = s[C].op.axis
-        bn, bm, ni, mi = s[C].tile(n, m, factor, factor)
-        s[C].set_scope("global")
-        niio, niii = s[C].split(ni, 32)
-        s[C_shared].compute_at(s[C], niio)
-
-        return s
-
-    s, A, C = compute(2, 2, 128, "float16")
-    s = schedule(s, 128, 8, A, C)
-    bounds = tvm.te.schedule.InferBound(s)
-    check = bounds[s.stages[2].op.axis[2]].extent == 16
-    if not check:
-        print(tvm.lower(s, [A, C], simple_mode=True))
-    assert check
-
-
-if __name__ == "__main__":
-    test_bound_tile_mod()
diff --git a/tests/python/te/test_te_schedule_graph.py b/tests/python/te/test_te_schedule_graph.py
deleted file mode 100644
index 05ca9fdbf8a8..000000000000
--- a/tests/python/te/test_te_schedule_graph.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_scan():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i], name="s_init")
-    x_trans = te.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
-    s_up1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
-    s_update = te.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
-    s_scan = tvm.te.scan(s_init, s_update, s_state)
-
-    def test_getbody():
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])
-
-    def test_attach_path():
-        s = te.create_schedule(s_scan.op)
-        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
-        apath = tvm.te.schedule.CreateAttachPath(s)
-        assert tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis])
-        assert tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis])
-
-    def test_fix_pt():
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.spatial_axis_[0]].value != 0
-
-
-def test_scan_fix_point():
-    m = te.var("m")
-    n = te.var("n")
-    l = te.var("l")
-    x = te.compute((l, m, n), lambda *i: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((l, m, n))
-    s_init = te.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init")
-
-    def test_scan0():
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 1
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 1
-
-    def test_scan1():
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, j, i], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 0
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan3_not_exact_reach():
-        s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, i, j], name="h1")
-        s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, 10] * 2, name="h1")
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        body = tvm.te.schedule.ScanGetBody(s_scan.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 1
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan4_reach_other():
-        s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, j, j], name="h1")
-        s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, j] * 2, name="h1")
-        s_update = te.compute(
-            (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update"
-        )
-        s_scan = tvm.te.scan(s_init, s_update, s_state)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op)
-        assert fxpt[s_scan.op.spatial_axis_[0]].value == 0
-        assert fxpt[s_scan.op.spatial_axis_[1]].value == 0
-
-    def test_scan5_multi_output():
-        m = te.var("m")
-        n = te.var("n")
-        x1 = te.placeholder((m, n))
-        s1 = te.placeholder((m, n))
-        x2 = te.placeholder((m, n))
-        s2 = te.placeholder((m, n))
-        s1_init = te.compute((1, n), lambda _, i: x1[0, i])
-        s2_init = te.compute((1, n), lambda _, i: x2[0, i])
-        s1_update = te.compute((m, n), lambda t, i: s1[t - 1, i] + x1[t, i])
-        s2_update = te.compute((m, n), lambda t, i: x2[t, i] + s2[t - 1, i])
-        r0, r1 = tvm.te.scan([s1_init, s2_init], [s1_update, s2_update], [s1, s2])
-        body = tvm.te.schedule.ScanGetBody(r0.op)
-        fxpt = tvm.te.schedule.ScanFixPointAnalysis(r0.op)
-        assert fxpt[r1.op.spatial_axis_[0]].value == 1
-
-    test_scan0()
-    test_scan1()
-    test_scan3_not_exact_reach()
-    test_scan4_reach_other()
-    test_scan5_multi_output()
-
-
-def test_create_read_graph():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j])
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3)
-
-    g = tvm.te.schedule.CreateReadGraph([A2.op])
-
-    assert g[A2.op][0] == A1
-    assert g[A1.op][0] == A
-    post_order = tvm.te.schedule.PostDFSOrder([A2.op], g)
-    assert post_order[0] == A.op
-    assert post_order[1] == A1.op
-
-
-if __name__ == "__main__":
-    test_scan()
-    test_create_read_graph()
-    test_scan_fix_point()
diff --git a/tests/python/te/test_te_schedule_lstm.py b/tests/python/te/test_te_schedule_lstm.py
deleted file mode 100644
index abdf81d3795d..000000000000
--- a/tests/python/te/test_te_schedule_lstm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-
-
-def test_lstm_cell_inline():
-    num_step = 128
-    num_input = 256
-    num_hidden = 1152
-    batch_size = 4
-    # Global transition matrix
-    X = te.placeholder((num_step - 1, batch_size, num_input), name="X")
-    Wi2h = te.placeholder((4, num_hidden, num_input), name="Wi2h")
-    Wh2h = te.placeholder((4, num_hidden, num_hidden), name="Wh2h")
-    # h: output hidden state, c: cell state.
-    s_state_h = te.placeholder((num_step, batch_size, num_hidden))
-    s_state_c = te.placeholder((num_step, batch_size, num_hidden))
-    s_init_c = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c")
-    s_init_h = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h")
-    # LSTM transition
-    k = te.reduce_axis((0, num_input), name="ki2h")
-    s_i2h = te.compute(
-        (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: te.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
-        name="s_i2h",
-    )
-    k = te.reduce_axis((0, num_hidden), name="ki2h")
-    s_h2h = te.compute(
-        (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
-        name="s_h2h",
-    )
-    # Gate rules
-    gates = te.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates")
-    gshape = (num_step, batch_size, num_hidden)
-    in_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 0, i, j]), name="in_gate")
-    in_transform = te.compute(
-        gshape, lambda t, i, j: te.tanh(gates[t, 1, i, j]), name="in_transform"
-    )
-    forget_gate = te.compute(
-        gshape, lambda t, i, j: te.sigmoid(gates[t, 2, i, j]), name="forget_gate"
-    )
-    out_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 3, i, j]), name="out_gate")
-    next_c = te.compute(
-        gshape,
-        lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j]
-        + in_gate[t, i, j] * in_transform[t, i, j],
-        name="next_c",
-    )
-    next_h = te.compute(
-        gshape, lambda t, i, j: out_gate[t, i, j] * te.tanh(next_c[t, i, j]), name="next_h"
-    )
-    update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c")
-    update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h")
-    # schedule
-    scan_h, scan_c = tvm.te.scan(
-        [s_init_h, s_init_c],
-        [update_h, update_c],
-        [s_state_h, s_state_c],
-        inputs=[X],
-        name="lstm_scan",
-    )
-    # schedule
-    s = te.create_schedule(scan_h.op)
-    # Inline gate computations
-    s[gates].compute_inline()
-    s[in_gate].compute_inline()
-    s[in_transform].compute_inline()
-    s[forget_gate].compute_inline()
-    s[out_gate].compute_inline()
-    # verify we can lower correctly
-    tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c])
-
-
-if __name__ == "__main__":
-    test_lstm_cell_inline()
diff --git a/tests/python/te/test_te_schedule_ops.py b/tests/python/te/test_te_schedule_ops.py
deleted file mode 100644
index 1ff0297539ce..000000000000
--- a/tests/python/te/test_te_schedule_ops.py
+++ /dev/null
@@ -1,695 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_const():
-    x = tvm.te.const(1, "int32")
-    assert x.dtype == "int32"
-    assert isinstance(x, tvm.tir.IntImm)
-
-
-def test_schedule0():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    s = te.create_schedule(A1.op)
-
-    mod = schedule_to_module(s, [A, A1])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule1():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-
-    s = te.create_schedule(A1.op)
-    xo, xi = s[A1].split(A1.op.axis[0], 8)
-    s[A1].pragma(xo, "auto_unroll_max_step", 10)
-
-    mod = schedule_to_module(s, [A, A1])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule2():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-
-    mod = schedule_to_module(s, [A, A2])
-    assert isinstance(mod["main"], tvm.tir.PrimFunc)
-
-
-def test_schedule_scan():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state = te.placeholder((m, n))
-    s_init = te.compute((1, n), lambda _, i: x[0, i])
-    s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i])
-    res = tvm.te.scan(s_init, s_update, s_state)
-
-    assert tuple(res.shape) == (m, n)
-    s = te.create_schedule(res.op)
-    s = s.normalize()
-    ir = tvm.lower(s, [s_state], simple_mode=True)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert bounds[res.op.scan_axis].min.value == 1
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_inline_multi_reduce():
-    def argmax_comp(x, y):
-        idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
-        val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
-        return idx, val
-
-    def argmax_init(idx_typ, val_typ):
-        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)
-
-    argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax")
-    m = te.var("m")
-    n = te.var("n")
-    val = te.placeholder((m, n), name="val", dtype="float32")
-    val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1")
-    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2")
-    k = te.reduce_axis((0, n), "k")
-    T_idx, T_val = te.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k), name="T")
-    s = te.create_schedule(T_idx.op)
-    s[val1].compute_inline()
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_auto_inline():
-    def elemwise():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((m, n), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def broadcast():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((1,), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(0) * B(i, j), name="T1", tag="broadcast")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def injective():
-        m = te.var("m")
-        n = te.var("n")
-        A = te.placeholder((m,), name="A")
-        B = te.placeholder((m, n), name="B")
-        C = te.placeholder((m, n), name="C")
-        T1 = te.compute((m, n), lambda i, j: A(i) * B(i, j), name="T1")
-        T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2")
-
-        return te.create_schedule(T2.op), T1
-
-    def check_auto_inline(schedule_func, auto_inline_func):
-        s, T1 = schedule_func()
-        # before auto inline the attach type is AttachType.kGroupRoot
-        assert s[T1].attach_type == 1
-        auto_inline_func(s)
-        # after auto inline the attach type is AttachType.kInline
-        assert s[T1].attach_type == 2
-        s = s.normalize()
-        bounds = tvm.te.schedule.InferBound(s)
-        stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    check_auto_inline(elemwise, tvm.te.schedule.AutoInlineElemWise)
-    check_auto_inline(broadcast, tvm.te.schedule.AutoInlineBroadcast)
-    check_auto_inline(injective, tvm.te.schedule.AutoInlineInjective)
-
-
-def test_schedule_const_bound():
-    n = 128
-    A = te.placeholder((n,), name="A")
-    A1 = te.compute((n,), lambda i: A[i] + 1, name="A1")
-    s = te.create_schedule(A1.op)
-    xo, xi = s[A1].split(A1.op.axis[0], 8)
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_inline_mixed():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    A1 = te.compute(A.shape, lambda *i: A(*i) + 1, name="A1")
-    A2 = te.compute(A.shape, lambda *i: A1(*i) + 2, name="A2")
-    C = te.compute((n,), lambda i: A2[i] + A1[i], name="C")
-
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=8)
-    s[A1].compute_at(s[C], xo)
-    s[A2].compute_inline()
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    def check(x):
-        if isinstance(x, tvm.tir.Call):
-            assert x.func != A2
-
-    tvm.tir.stmt_functor.post_order_visit(s[C].op.body[0], check)
-
-
-def test_scan_inline1():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state1 = te.placeholder((m, n))
-    s_state2 = te.placeholder((m, n))
-    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
-    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
-    s_x1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="x1")
-    s_x2 = te.compute((m, n), lambda t, i: s_state2[t - 1, i] + 1, name="x2")
-    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
-    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
-    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2])
-    s = te.create_schedule(res1.op)
-    s[s_x1].compute_inline()
-    stmt = tvm.lower(s, [x, res1, res2])
-
-
-def test_scan_inline2():
-    m = te.var("m")
-    n = te.var("n")
-    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
-    s_state1 = te.placeholder((m, n))
-    s_state2 = te.placeholder((m, n))
-    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
-    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
-    s_xx = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="xx")
-    s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1")
-    s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t - 1, 2], name="x2")
-    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
-    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
-    res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2])
-    s = te.create_schedule(res1.op)
-    s[s_xx].compute_inline()
-    s[s_x1].compute_inline()
-    s[s_x2].compute_inline()
-    stmt = tvm.lower(s, [x, res1, res2])
-
-
-def test_schedule_cache():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "shared", readers=[C])
-    CC = s.cache_write(C, "shared")
-    s[AA].compute_at(s[CC], CC.op.axis[0])
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_middle_cache():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-    D = te.compute((m, n), lambda i, j: C(i, j), name="D")
-
-    s = te.create_schedule(D.op)
-    AA = s.cache_read(A, "local", readers=[C])
-    BB = s.cache_read(B, "local", readers=[C])
-    CC = s.cache_read(C, "local", readers=[D])
-    DD = s.cache_write(D, "local")
-    # s[AA].compute_at(s[CC], CC.op.axis[0])
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout1():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m, n), name="A")
-    B = te.placeholder((m, n), name="B")
-    C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C")
-
-    s = te.create_schedule(C.op)
-    s[C].reorder(C.op.axis[1], C.op.axis[0])
-    CC = s.cache_write(C, "global")
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout2():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    C = te.compute(A.shape, lambda i, j: A(i, j) * B(i, j), name="C")
-    s = te.create_schedule(C.op)
-    x, y = C.op.axis
-    xo, xi = s[C].split(x, factor=4)
-    s[C].reorder(xo, y, xi)
-    CC = s.cache_write(C, "global")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout3():
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    k = te.reduce_axis((0, n), "k")
-    C = te.compute((A.shape[0],), lambda i: te.sum(A(i, k) * B(i, k), axis=k), name="C")
-    s = te.create_schedule(C.op)
-    x = C.op.axis[0]
-    xo, xi = s[C].split(x, factor=4)
-    CC = s.cache_write(C, "global")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_cache_relayout4():
-    def _compute(*indice):
-        return A(*indice) + 1, B(*indice) / 2
-
-    m = te.var("m")
-    n = te.var("n")
-    A = te.placeholder((m * 4, n), name="A")
-    B = te.placeholder((m * 4, n), name="B")
-    C1, C2 = te.compute(A.shape, _compute, name="C")
-    s = te.create_schedule([C1.op, C2.op])
-    C1_cache, C2_cache = s.cache_write([C1, C2], "local")
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def intrin_gemv(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.tir.call_packed("fill_zero", zz_ptr, n)
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, reset, update
-
-    buffer_params = {"data_alignment": 16, "offset_factor": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def test_schedule_tensor_compute1():
-    # basic: split, reorder, tile
-    M, N, L = 2048, 1024, 512
-    factor, rfactor = 16, 16
-    A = te.placeholder((N // factor, L // rfactor, factor, rfactor), name="A")
-    B = te.placeholder((M, L // rfactor, rfactor), name="B")
-    k = te.reduce_axis((0, L // rfactor), name="k")
-
-    gemv = intrin_gemv(factor, rfactor)
-    C = te.compute(
-        (N, M // factor, factor),
-        lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k),
-        name="C",
-    )
-
-    s = te.create_schedule(C.op)
-    ai, aj, ax = s[C].op.axis
-    aio, aii = s[C].split(ai, 16)
-    s[C].reorder(aio, aj, aii)
-    aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4)
-
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def intrin_vadd(n, cache_read=False, cache_write=False):
-    scope_ubuf = "local"
-    dtype = "float32"
-    x = te.placeholder((n,), dtype=dtype, name="vx")
-    y = te.placeholder((n,), dtype=dtype, name="vy")
-    z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-    s = te.create_schedule(z.op)
-
-    def create_buffer(t):
-        return tvm.tir.decl_buffer(
-            t.shape, t.dtype, name="W" + t.name, scope=scope_ubuf, offset_factor=16
-        )
-
-    binds = {}
-    if cache_read:
-        binds[x] = create_buffer(x)
-        binds[y] = create_buffer(y)
-    if cache_write:
-        binds[z] = create_buffer(z)
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-        ib.emit(
-            tvm.tir.call_extern(
-                outs[0].dtype,
-                "vadd",
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-                outs[0].access_ptr("wr"),
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds=binds, default_buffer_params={"offset_factor": 16}
-    )
-
-
-def test_schedule_tensor_compute2():
-    # cache_read, cache_write
-    M = 1024
-    factor = 16
-    dtype = "float32"
-    scope_ubuf = "local"
-
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-
-    vadd = intrin_vadd(factor, True, True)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C")
-
-    s = te.create_schedule(C.op)
-    AL = s.cache_read(A, scope_ubuf, C)
-    BL = s.cache_read(B, scope_ubuf, C)
-    CL = s.cache_write(C, scope_ubuf)
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_schedule_tensor_compute3():
-    # compute_at
-    M = 1024
-    factor = 16
-    dtype = "float32"
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-    Bi = te.compute((M // factor, factor), lambda i, j: B[i, j] + 5, name="Bi")
-
-    vadd = intrin_vadd(factor)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), name="C")
-    s = te.create_schedule(C.op)
-    s[Bi].compute_at(s[C], C.op.axis[0])
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_loop_dep_reduce():
-    X = te.placeholder(shape=(10,), name="x")
-
-    def f(n):
-        rv = te.reduce_axis((0, n))
-        return te.sum(X[rv], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    f = tvm.build(s, [X, Y])
-
-
-def test_loop_dep_reduce_cache_write():
-    X = te.placeholder(shape=(10,), name="x")
-
-    def f(n):
-        rv = te.reduce_axis((0, n))
-        init = lambda dtype: tvm.tir.Select(n > 1, tvm.tir.const(0, dtype), n.astype(dtype))
-        sum = te.comm_reducer(lambda x, y: tvm.te.max(x + y, n.astype("float32")), init, name="sum")
-        return sum(X[rv], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    s.cache_write(Y, "local")
-    f = tvm.build(s, [X, Y])
-
-
-def test_reduction_and_dummy_fuse_split():
-    n = 10
-    X = te.placeholder(shape=(n,), dtype="int32", name="X")
-    k = te.reduce_axis((0, n))
-    Y = te.compute((), lambda: te.sum(X[k], k), name="Y")
-    s = te.create_schedule([Y.op])
-    ax = s[Y.op].fuse(*Y.op.axis)
-    axo, axi = s[Y.op].split(ax, nparts=20)
-    f = tvm.build(s, [Y, X])
-
-    args = [tvm.nd.empty((), "int32")] + [tvm.nd.array(np.ones((n,), dtype="int32"))]
-    f(*args)
-    assert args[0].numpy() == n
-
-    n = 10
-    X = te.placeholder(shape=(n,), dtype="int32", name="X")
-    k = te.reduce_axis((0, n))
-    Y = te.compute((n,), lambda i: te.sum(X[k], k), name="Y")
-    s = te.create_schedule([Y.op])
-    ax = s[Y.op].fuse(*(list(Y.op.axis) + list(Y.op.reduce_axis)))
-    f = tvm.build(s, [Y, X])
-
-    args = [tvm.nd.array(np.ones((n,), dtype="int32"))] + [
-        tvm.nd.array(np.ones((n,), dtype="int32"))
-    ]
-    f(*args)
-    assert np.all(args[0].numpy() == n)
-
-
-def test_schedule_compute_inline():
-    shape = [10, 1024]
-    A = te.placeholder(shape, name="A")
-    B = te.placeholder(shape, name="B")
-    C = te.compute(shape, lambda *index: A(*index) + B(*index), name="C")
-
-    def _compute(*index):
-        return C(*index), C(*index) * B(*index)
-
-    F, E = te.compute(shape, _compute, name="F")
-
-    s = te.create_schedule([F.op, E.op])
-    AL = s.cache_read(A, "local", [C])
-    BL = s.cache_read(B, "local", [C, E])
-    CL = s.cache_write(C, "local")
-    FL, EL = s.cache_write([F, E], "local")
-    s[C].compute_inline()
-
-    s = s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-
-def test_local_stage_predicate():
-    m = 1
-    n = 3
-    p = 2
-    A = tvm.te.placeholder((m, n, p), name="A")
-    B = tvm.te.compute((m, n, p), lambda bi, bj, bk: A[bi, bj, bk], name="B")
-    C = tvm.te.compute((m, n, p), lambda ci, cj, ck: B[ci, cj, ck], name="C")
-    by = tvm.te.thread_axis("blockIdx.y")
-    tx = tvm.te.thread_axis("threadIdx.x")
-    vx = tvm.te.thread_axis("vthread")
-
-    def schedule(thread_tag, mem_scope):
-        s = tvm.te.create_schedule(C.op)
-        s[B].compute_at(s[C], s[C].op.axis[0])
-        s[B].set_scope(mem_scope)
-        bno, bni = s[B].split(s[B].op.axis[1], n)
-        bx = tvm.te.thread_axis("blockIdx.x")
-        s[C].bind(s[C].op.axis[0], bx)
-        s[C].bind(s[C].op.axis[1], thread_tag)
-        s[B].bind(bni, thread_tag)
-        return s
-
-    def collect_visit(stmt, f):
-        ret = []
-        tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-        return ret
-
-    # local vs. threadIdx
-    s = schedule(tx, "local")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    # local vs. vthread
-    s = schedule(vx, "local")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    # shared vs. blockIdx
-    s = schedule(by, "shared")
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_local_stage_predicate2():
-    A = tvm.te.placeholder((128,), name="A")
-    B = tvm.te.compute((128,), lambda bi: A[bi] + 1, name="B")
-    C = tvm.te.compute((128,), lambda ci: B[ci] + 2, name="C")
-    s = tvm.te.create_schedule(C.op)
-    AA = s.cache_read(A, "local", [B])
-    s[B].set_scope("shared")
-    block_x = tvm.te.thread_axis("blockIdx.x")
-    thread_x = tvm.te.thread_axis((0, 32), "threadIdx.x")
-    oc, ic = s[C].split(s[C].op.axis[0], factor=64)
-    ooc, ioc = s[C].split(oc, factor=2)
-    oic, iic = s[C].split(ic, factor=32)
-    s[C].bind(ooc, block_x)
-    s[C].bind(iic, thread_x)
-    s[B].compute_at(s[C], ioc)
-    ob, ib = s[B].split(s[B].op.axis[0], factor=32)
-    s[B].bind(ib, thread_x)
-    s[AA].compute_root()
-    s[AA].compute_at(s[C], ooc)
-    oaa, iaa = s[AA].split(s[AA].op.axis[0], factor=32)
-    s[AA].bind(iaa, thread_x)
-    lowered_body = tvm.lower(s, [A, C])["main"].body
-
-    def collect_visit(stmt, f):
-        ret = []
-        tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-        return ret
-
-    def visit_stmt(op):
-        if isinstance(op, tvm.tir.Allocate):
-            return op.extents[0].value == 97
-        return False
-
-    assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    assert any(collect_visit(lowered_body, visit_stmt))
-
-
-def test_schedule_record_gemm():
-    with tvm.transform.PassContext(config={"te.keep_schedule_record": True}):
-        M, K, N = 1024, 1024, 1024
-        k = te.reduce_axis((0, K), "k")
-        A = te.placeholder((M, K), name="A")
-        B = te.placeholder((K, N), name="B")
-        C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")
-        s = te.create_schedule(C.op)
-        # currently there are no other applied primitives
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-        # apply sequential optimizatoin primitives
-        block_size, factor = 32, 8
-        # tile -> split + split + reorder
-        mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], block_size, block_size)
-        ko, ki = s[C].split(k, factor=factor)
-        s[C].reorder(mo, ko, no, mi, ki, ni)
-        s[C].vectorize(ni)
-        s[C].parallel(mo)
-        assert len(s.schedule_record) == 8
-        # compare primitive names
-        expected_names = [
-            "vanilla",
-            "split",
-            "split",
-            "reorder",
-            "split",
-            "reorder",
-            "vectorize",
-            "parallel",
-        ]
-        for i in range(len(s.schedule_record)):
-            assert s.primitive_record[i] == expected_names[i]
-
-
-def test_schedule_record_misc():
-    s = te.create_schedule([])
-    # size of schedule record is expected to be 0 (no storing behavior)
-    assert len(s.schedule_record) == 0
-
-    with tvm.transform.PassContext(config={"te.keep_schedule_record": True}):
-        s = te.create_schedule([])
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-
-        stg = te.compute((), lambda *args: 0, name="empty_op")
-        s = te.create_schedule(stg.op)
-        # size of schedule record is expected to be 1 (vanilla schedule)
-        assert len(s.schedule_record) == 1
-
-
-if __name__ == "__main__":
-    test_loop_dep_reduce()
-    test_loop_dep_reduce_cache_write()
-    test_schedule_middle_cache()
-    test_inline_multi_reduce()
-    test_schedule_cache_relayout4()
-    test_schedule_cache_relayout3()
-    test_schedule_cache_relayout2()
-    test_schedule_cache_relayout1()
-    test_schedule_const_bound()
-    test_scan_inline1()
-    test_scan_inline2()
-    test_inline_mixed()
-    test_auto_inline()
-    test_schedule_scan()
-    test_schedule0()
-    test_schedule1()
-    test_schedule2()
-    test_schedule_cache()
-    test_schedule_tensor_compute1()
-    test_schedule_tensor_compute2()
-    test_schedule_tensor_compute3()
-    test_reduction_and_dummy_fuse_split()
-    test_schedule_compute_inline()
-    test_local_stage_predicate()
-    test_local_stage_predicate2()
-    test_schedule_record_gemm()
-    test_schedule_record_misc()
diff --git a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py
deleted file mode 100644
index 83584ad56400..000000000000
--- a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm import topi
-import numpy as np
-import tvm.testing
-
-
-def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96):
-    A = te.placeholder((n, l), name="A", dtype="float16")
-    B = te.placeholder((l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (n, m), lambda i, j: te.sum(A[i, k].astype("float32") * B[k, j].astype("float32"), axis=k)
-    )
-    s = te.create_schedule(C.op)
-    y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    AL = s.cache_read(AA, "local", [C])
-    BB = s.cache_read(B, "shared", [C])
-    BL = s.cache_read(BB, "local", [C])
-    CL = s.cache_write(C, "local")
-
-    bx = 4
-    by = 32
-    step_k = 8
-    v = 4
-    TX = 8
-    TY = 1
-    tile_x = bx * TX
-    tile_y = by * TY
-    WX = min(warp_tile_m, tile_x)
-    tile_k = 16
-    vthread = 1
-
-    yo, ty = s[C].split(y, tile_y * vthread)
-    vy, ty = s[C].split(ty, tile_y)
-    ty, yi = s[C].split(ty, TY)
-
-    xo, xi = s[C].split(x, tile_x)
-    tz, xi = s[C].split(xi, WX)
-    tx, xi = s[C].split(xi, TX)
-    ko, ki = s[CL].split(k, step_k * tile_k)
-    kl, ki = s[CL].split(ki, tile_k)
-
-    s[C].reorder(yo, xo, tz, ty, tx, yi, xi)
-    s[C].bind(yo, te.thread_axis("blockIdx.y"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
-    s[CL].compute_at(s[C], tx)
-    yo, xo = CL.op.axis
-    s[CL].reorder(ko, kl, ki, yo, xo)
-
-    s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v)
-    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[AA].split(tx, factor=v)
-    fused = s[AA].fuse(s[AA].op.axis[0], xo)
-    _, ty = s[AA].split(fused, factor=by)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(vec)
-
-    s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v)
-    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[BB].split(tx, factor=v)
-    fused = s[BB].fuse(s[BB].op.axis[0], xo)
-    _, ty = s[BB].split(fused, factor=by)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(vec)
-
-    s[AL].compute_at(s[CL], kl)
-    s[BL].compute_at(s[CL], kl)
-
-    s[CL].pragma(ko, "tensor_core")
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
-    b_np = np.random.uniform(size=(l, m)).astype(B.dtype)
-    c_np = np.zeros((n, m), dtype=np.float32)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3))
-
-    c_np = np.dot(a_np, b_np)
-    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
-
-
-def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2):
-    A = te.placeholder((batch, n, l), name="A", dtype="float16")
-    B = te.placeholder((batch, l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (batch, n, m), lambda b, i, j: te.sum((A[b, i, k] * B[b, k, j]).astype("float32"), axis=k)
-    )
-    s = te.create_schedule(C.op)
-    z, y, x = s[C].op.axis
-    k = s[C].op.reduce_axis[0]
-
-    AA = s.cache_read(A, "shared", [C])
-    AL = s.cache_read(AA, "local", [C])
-    BB = s.cache_read(B, "shared", [C])
-    BL = s.cache_read(BB, "local", [C])
-    CL = s.cache_write(C, "local")
-
-    bx = 2
-    by = 32
-    step_k = 8
-    v = 4
-    TX = 8
-    TY = 1
-    tile_x = bx * TX
-    tile_y = by * TY
-    WX = min(warp_tile_m, tile_x)
-    tile_k = 16
-    vthread = 1
-
-    yo, ty = s[C].split(y, tile_y * vthread)
-    vy, ty = s[C].split(ty, tile_y)
-    ty, yi = s[C].split(ty, TY)
-
-    xo, xi = s[C].split(x, tile_x)
-    tz, xi = s[C].split(xi, WX)
-    tx, xi = s[C].split(xi, TX)
-    ko, ki = s[CL].split(k, step_k * tile_k)
-    kl, ki = s[CL].split(ki, tile_k)
-
-    s[C].reorder(z, yo, xo, tz, ty, tx, yi, xi)
-    s[C].bind(z, te.thread_axis("blockIdx.z"))
-    s[C].bind(yo, te.thread_axis("blockIdx.y"))
-    s[C].bind(xo, te.thread_axis("blockIdx.x"))
-    s[C].bind(ty, te.thread_axis("threadIdx.y"))
-    s[C].bind(tz, te.thread_axis("threadIdx.z"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy"))
-    s[CL].compute_at(s[C], tx)
-    zo, yo, xo = CL.op.axis
-    s[CL].reorder(ko, kl, ki, zo, yo, xo)
-
-    s[AA].compute_at(s[CL], ko)
-    xo, xi = s[AA].split(s[AA].op.axis[2], factor=bx * v)
-    tz, tx = s[AA].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[AA].split(tx, factor=v)
-    fused = s[AA].fuse(s[AA].op.axis[1], xo)
-    _, ty = s[AA].split(fused, factor=by)
-    s[AA].bind(ty, te.thread_axis("threadIdx.y"))
-    s[AA].bind(tz, te.thread_axis("threadIdx.z"))
-    s[AA].bind(tx, te.thread_axis("threadIdx.x"))
-    s[AA].vectorize(vec)
-
-    s[BB].compute_at(s[CL], ko)
-    xo, xi = s[BB].split(s[BB].op.axis[2], factor=bx * v)
-    tz, tx = s[BB].split(xi, factor=(WX // TX) * v)
-    tx, vec = s[BB].split(tx, factor=v)
-    fused = s[BB].fuse(s[BB].op.axis[1], xo)
-    _, ty = s[BB].split(fused, factor=by)
-    s[BB].bind(ty, te.thread_axis("threadIdx.y"))
-    s[BB].bind(tz, te.thread_axis("threadIdx.z"))
-    s[BB].bind(tx, te.thread_axis("threadIdx.x"))
-    s[BB].vectorize(vec)
-
-    s[AL].compute_at(s[CL], kl)
-    s[BL].compute_at(s[CL], kl)
-
-    s[CL].pragma(ko, "tensor_core")
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype)
-    b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype)
-    c_np = np.zeros((batch, n, m), dtype=np.float32)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print(
-        "batch gemm m=%d n=%d k=%d batch=%d: %f ms"
-        % (m, n, l, batch, evaluator(a, b, c).mean * 1e3)
-    )
-
-    for bs in range(batch):
-        c_np[bs, :, :] = np.dot(a_np[bs, :, :], b_np[bs, :, :])
-    np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3)
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_matmul():
-    tensor_core_matmul(16)  # test with warp_tile 16x16x16
-    tensor_core_matmul(8)  # test with warp_tile 8x32x16
-    tensor_core_matmul(32)  # test with warp_tile 32x8x16
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_matmul():
-    tensor_core_batch_matmul()
-
-
-if __name__ == "__main__":
-    test_tensor_core_matmul()
-    test_tensor_core_batch_matmul()
diff --git a/tests/python/te/test_te_schedule_tensor_core.py b/tests/python/te/test_te_schedule_tensor_core.py
deleted file mode 100644
index d86b05ad83f1..000000000000
--- a/tests/python/te/test_te_schedule_tensor_core.py
+++ /dev/null
@@ -1,461 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-import numpy as np
-from tvm.topi.testing import conv2d_nhwc_python
-import tvm.testing
-
-VERIFY = True
-
-
-def intrin_wmma_load_matrix(shape, scope):
-    n, m, l = shape
-    if scope == "wmma.matrix_a":
-        row, col = n, l
-    elif scope == "wmma.matrix_b":
-        row, col = l, m
-    A = te.placeholder((row, col), name="A", dtype="float16")
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=row * col
-    )
-    C = te.compute((row, col), lambda i, j: A[i, j], name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=row * col
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_load_matrix_sync",
-                BC.data,
-                n,
-                m,
-                l,
-                BC.elem_offset // (row * col),
-                BA.access_ptr("r"),
-                col,
-                "row_major",
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-def intrin_wmma_gemm(shape):
-    n, m, l = shape
-    A = te.placeholder((n, l), name="A", dtype="float16")
-    B = te.placeholder((l, m), name="B", dtype="float16")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute(
-        (n, m),
-        lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
-        name="C",
-    )
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=n * l
-    )
-    BB = tvm.tir.decl_buffer(
-        B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=l * m
-    )
-    BC = tvm.tir.decl_buffer(
-        C.shape,
-        C.dtype,
-        name="BC",
-        scope="wmma.accumulator",
-        data_alignment=32,
-        offset_factor=n * m,
-    )
-
-    def intrin_func(ins, outs):
-        BA, BB = ins
-        (BC,) = outs
-
-        def init():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_fill_fragment",
-                    BC.data,
-                    n,
-                    m,
-                    l,
-                    BC.elem_offset // (n * m),
-                    0.0,
-                )
-            )
-            return ib.get()
-
-        def update():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_intrin(
-                    "handle",
-                    "tir.tvm_mma_sync",
-                    BC.data,
-                    BC.elem_offset // (n * m),
-                    BA.data,
-                    BA.elem_offset // (n * l),
-                    BB.data,
-                    BB.elem_offset // (l * m),
-                    BC.data,
-                    BC.elem_offset // (n * m),
-                )
-            )
-            return ib.get()
-
-        return update(), init(), update()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
-
-
-def intrin_wmma_store_matrix(shape):
-    n, m, l = shape
-    A = te.placeholder((n, m), name="A", dtype="float32")
-    BA = tvm.tir.decl_buffer(
-        A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=n * m
-    )
-    C = te.compute((n, m), lambda i, j: A[i, j], name="C")
-    BC = tvm.tir.decl_buffer(
-        C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=n * m
-    )
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(
-            tvm.tir.call_intrin(
-                "handle",
-                "tir.tvm_store_matrix_sync",
-                BA.data,
-                n,
-                m,
-                l,
-                BA.elem_offset // (n * m),
-                BC.access_ptr("w"),
-                m,
-                "row_major",
-            )
-        )
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_matmal():
-    batch_size = 4
-    n = 512
-    m, l = n, n
-    assert n % 32 == 0
-    assert m % 8 == 0
-    assert l % 16 == 0
-    nn, mm, ll = n // 32, m // 8, l // 16
-    A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16")
-    B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16")
-    k1 = te.reduce_axis((0, ll), name="k1")
-    k2 = te.reduce_axis((0, 16), name="k2")
-    C = te.compute(
-        (batch_size, nn, mm, 32, 8),
-        lambda b, i, j, ii, jj: te.sum(
-            A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2]
-        ),
-        name="Fragment_C",
-    )
-    s = te.create_schedule(C.op)
-
-    warp_size = 32
-    kernel_size = 16
-    block_row_warps = 2
-    block_col_warps = 4
-    warp_row_tiles = 4
-    warp_col_tiles = 2
-    chunk = 4
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    AS = s.cache_read(A, "shared", [C])
-    BS = s.cache_read(B, "shared", [C])
-    AF = s.cache_read(AS, "wmma.matrix_a", [C])
-    BF = s.cache_read(BS, "wmma.matrix_b", [C])
-    CF = s.cache_write(C, "wmma.accumulator")
-
-    b, i, j, kernel_i, kernel_j = s[C].op.axis
-    i, ii = s[C].split(i, factor=warp_row_tiles)
-    block_i, i = s[C].split(i, factor=block_row_warps)
-    j, jj = s[C].split(j, factor=warp_col_tiles)
-    block_j, j = s[C].split(j, factor=block_col_warps)
-    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
-    s[C].bind(b, block_z)
-    s[C].bind(block_i, block_x)
-    s[C].bind(block_j, block_y)
-    s[C].bind(i, thread_y)
-    s[C].bind(j, thread_z)
-
-    s[CF].compute_at(s[C], j)
-    b, warp_i, warp_j, _i, _j = s[CF].op.axis
-    k, _k = CF.op.reduce_axis
-    ko, ki = s[CF].split(k, factor=chunk)
-    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)
-
-    s[AF].compute_at(s[CF], ki)
-    s[BF].compute_at(s[CF], ki)
-
-    s[AS].compute_at(s[CF], ko)
-    b, xo, yo, xi, yi = AS.op.axis
-    tx, xo = s[AS].split(xo, nparts=block_row_warps)
-    ty, yo = s[AS].split(yo, nparts=block_col_warps)
-    t = s[AS].fuse(xi, yi)
-    to, ti = s[AS].split(t, nparts=warp_size)
-    s[AS].bind(tx, thread_y)
-    s[AS].bind(ty, thread_z)
-    s[AS].bind(to, thread_x)
-
-    s[BS].compute_at(s[CF], ko)
-    b, xo, yo, xi, yi = BS.op.axis
-    tx, xo = s[BS].split(xo, nparts=block_row_warps)
-    ty, yo = s[BS].split(yo, nparts=block_col_warps)
-    t = s[BS].fuse(xi, yi)
-    to, ti = s[BS].split(t, nparts=warp_size)
-    s[BS].bind(tx, thread_y)
-    s[BS].bind(ty, thread_z)
-    s[BS].bind(to, thread_x)
-
-    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a"))
-    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b"))
-    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
-    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))
-
-    func = tvm.build(s, [A, B, C], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
-    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
-    a = tvm.nd.array(a_np, dev)
-    b = tvm.nd.array(b_np, dev)
-    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev)
-    func(a, b, c)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))
-
-    if VERIFY:
-        func(a, b, c)
-        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        c_np = c.numpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
-        np.testing.assert_allclose(
-            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4
-        )
-
-
-@tvm.testing.requires_tensorcore
-def test_tensor_core_batch_conv():
-    # The sizes of inputs and filters
-    batch_size = 32
-    height = 14
-    width = 14
-    in_channels = 32
-    out_channels = 64
-    kernel_h = 3
-    kernel_w = 3
-    pad_h = 1
-    pad_w = 1
-    stride_h = 1
-    stride_w = 1
-    block_size = 16
-
-    block_row_warps = 2
-    block_col_warps = 4
-    warp_row_tiles = 4
-    warp_col_tiles = 2
-    warp_size = 32
-    chunk = 2
-
-    # Input feature map: (N, H, W, IC, n, ic)
-    data_shape = (
-        batch_size // block_size,
-        height,
-        width,
-        in_channels // block_size,
-        block_size,
-        block_size,
-    )
-    # Kernel: (H, W, IC, OC, ic, oc)
-    kernel_shape = (
-        kernel_h,
-        kernel_w,
-        in_channels // block_size,
-        out_channels // block_size,
-        block_size,
-        block_size,
-    )
-
-    # Output feature map: (N, H, W, OC, n, oc)
-    output_shape = (
-        batch_size // block_size,
-        height,
-        width,
-        out_channels // block_size,
-        block_size,
-        block_size,
-    )
-
-    assert batch_size % block_size == 0
-    assert in_channels % block_size == 0
-    assert out_channels % block_size == 0
-
-    kh = te.reduce_axis((0, kernel_h), name="kh")
-    kw = te.reduce_axis((0, kernel_w), name="kw")
-    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
-    ii = te.reduce_axis((0, block_size), name="ii")
-
-    # Algorithm
-    A = te.placeholder(data_shape, name="A", dtype="float16")
-    W = te.placeholder(kernel_shape, name="W", dtype="float16")
-    Apad = te.compute(
-        (
-            batch_size // block_size,
-            height + 2 * pad_h,
-            width + 2 * pad_w,
-            in_channels // block_size,
-            block_size,
-            block_size,
-        ),
-        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
-            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
-            A[n, h - pad_h, w - pad_w, i, nn, ii],
-            tvm.tir.const(0.0, "float16"),
-        ),
-        name="Apad",
-    )
-    Conv = te.compute(
-        output_shape,
-        lambda n, h, w, o, nn, oo: te.sum(
-            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
-            * W[kh, kw, ic, o, ii, oo].astype("float32"),
-            axis=[ic, kh, kw, ii],
-        ),
-        name="Conv",
-    )
-
-    s = te.create_schedule(Conv.op)
-    s[Apad].compute_inline()
-
-    AS = s.cache_read(Apad, "shared", [Conv])
-    WS = s.cache_read(W, "shared", [Conv])
-    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
-    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
-    ConvF = s.cache_write(Conv, "wmma.accumulator")
-
-    block_x = te.thread_axis("blockIdx.x")
-    block_y = te.thread_axis("blockIdx.y")
-    block_z = te.thread_axis("blockIdx.z")
-    thread_x = te.thread_axis("threadIdx.x")
-    thread_y = te.thread_axis("threadIdx.y")
-    thread_z = te.thread_axis("threadIdx.z")
-
-    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
-    block_k = s[Conv].fuse(hc, wc)
-    s[Conv].bind(block_k, block_z)
-    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
-    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
-    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
-    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
-    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
-    s[Conv].bind(block_i, block_x)
-    s[Conv].bind(block_j, block_y)
-    s[Conv].bind(nc, thread_y)
-    s[Conv].bind(oc, thread_z)
-
-    s[ConvF].compute_at(s[Conv], oc)
-    n, h, w, o, nnf, oof = ConvF.op.axis
-    ko, ki = s[ConvF].split(ic, factor=chunk)
-    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)
-
-    s[AF].compute_at(s[ConvF], kw)
-    s[WF].compute_at(s[ConvF], kw)
-
-    s[WS].compute_at(s[ConvF], kh)
-    s[AS].compute_at(s[ConvF], kh)
-
-    n, h, w, i, nn, ii = AS.op.axis
-    tx, xo = s[AS].split(n, nparts=block_row_warps)
-    ty, yo = s[AS].split(xo, nparts=block_col_warps)
-    t = s[AS].fuse(nn, ii)
-    to, ti = s[AS].split(t, factor=warp_size)
-    s[AS].bind(tx, thread_y)
-    s[AS].bind(ty, thread_z)
-    s[AS].bind(ti, thread_x)
-
-    kh, kw, ic, o, ii, oo = WS.op.axis
-    tx, xo = s[WS].split(o, nparts=block_row_warps)
-    ty, yo = s[WS].split(xo, nparts=block_col_warps)
-    t = s[WS].fuse(ii, oo)
-    to, ti = s[WS].split(t, nparts=warp_size)
-    s[WS].bind(tx, thread_y)
-    s[WS].bind(ty, thread_z)
-    s[WS].bind(to, thread_x)
-    s[WS].vectorize(ti)
-
-    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
-    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
-    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
-    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))
-
-    func = tvm.build(s, [A, W, Conv], "cuda")
-
-    dev = tvm.cuda(0)
-    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
-    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
-    a = tvm.nd.array(a_np, dev)
-    w = tvm.nd.array(w_np, dev)
-    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
-    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
-    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))
-
-    if VERIFY:
-        func(a, w, c)
-        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
-        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(
-            kernel_h, kernel_w, in_channels, out_channels
-        )
-        c_np = (
-            c.numpy().transpose((0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width, out_channels)
-        )
-        c_std = conv2d_nhwc_python(
-            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)
-        ).astype(Conv.dtype)
-        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
-
-
-if __name__ == "__main__":
-    test_tensor_core_batch_matmal()
-    test_tensor_core_batch_conv()
diff --git a/tests/python/te/test_te_schedule_tensorize.py b/tests/python/te/test_te_schedule_tensorize.py
deleted file mode 100644
index 419d3edb5c3d..000000000000
--- a/tests/python/te/test_te_schedule_tensorize.py
+++ /dev/null
@@ -1,392 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-from tvm.script import tir as T
-
-
-def intrin_vadd(xo, m, n):
-    x = te.placeholder((n,), name="vx")
-    y = te.placeholder((n,), name="vy")
-    if m % n == 0:
-        body = lambda i: x[i] + y[i]
-    else:
-        body = lambda i: tvm.tir.Select(
-            xo * n + i < m, x[i] + y[i], tvm.tir.const(0, dtype=x.dtype)
-        )
-    z = te.compute(x.shape, body, name="z")
-
-    def intrin_func(ins, outs):
-        xx, yy = ins
-        zz = outs[0]
-        # special handle needed to tackle tail loop part when m % n != 0
-        # here is tvm.min(n, m - xo * n)
-        return tvm.tir.call_packed("vadd", xx, yy, zz)
-
-    buffer_params = {"offset_factor": 16}
-    return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params=buffer_params)
-
-
-def intrin_gemv(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.tir.call_packed("fill_zero", zz_ptr, n)
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, reset, update
-
-    buffer_params = {"offset_factor": 16, "data_alignment": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def intrin_gemv_no_reset(m, n):
-    w = te.placeholder((m, n), name="w")
-    x = te.placeholder((n,), name="x")
-    k = te.reduce_axis((0, n), name="k")
-    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
-    Wb = tvm.tir.decl_buffer(
-        w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1]
-    )
-
-    def intrin_func(ins, outs):
-        ww, xx = ins
-        zz = outs[0]
-        ww_ptr = ww.access_ptr("r")
-        xx_ptr = xx.access_ptr("r")
-        zz_ptr = zz.access_ptr("w")
-        body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        return body, None, update
-
-    buffer_params = {"offset_factor": 16, "data_alignment": 16}
-    return te.decl_tensor_intrin(
-        z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params
-    )
-
-
-def test_tensorize_vadd():
-    def add(m):
-        x = te.placeholder((m,), name="x")
-        y = te.placeholder((m,), name="y")
-        z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-        return x, y, z
-
-    def check(m, factor):
-        x, y, z = add(m)
-        factor = T.int32(factor)
-        s = te.create_schedule(z.op)
-        xo, xi = s[z].split(z.op.axis[0], factor=factor)
-        vadd = intrin_vadd(xo, m, factor)
-        s[z].tensorize(xi, vadd)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[z], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].min, xo * factor)
-        tvm.ir.assert_structural_equal(in_dom.items()[0][1][0].extent, factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[z], out_dom, in_dom, vadd)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(vadd.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [x, y, z])
-
-    def check_cache_write(m, factor):
-        x, y, z = add(m)
-        s = te.create_schedule(z.op)
-        _, _ = s[z].split(z.op.axis[0], factor=factor)
-
-        z_global = s.cache_write(z, "global")
-        xo, xi = z_global.op.axis
-
-        vadd = intrin_vadd(xo, m, factor)
-        s[z_global].tensorize(xi, vadd)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[z_global], dom_map)
-        # outer loop var will be rebased, so min value is the new loop var and extent is 1
-        tvm.ir.assert_structural_equal(out_dom[xo].extent, T.int32(1))
-        assert isinstance(out_dom[xo].min, tvm.tir.Var)
-        assert xo.var.name == out_dom[xo].min.name
-
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[z_global], out_dom, in_dom, vadd)[0]
-        ana = tvm.arith.Analyzer()
-        vars = tvm.runtime.convert({xo.var: out_dom[xo].min})
-        vadd_body = tvm.tir.stmt_functor.substitute(vadd.op.body[0], vars)
-        tvm.ir.assert_structural_equal(ana.simplify(body), ana.simplify(vadd_body))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [x, y, z])
-
-    def check_compute_reuse():
-        x, y, z = add(32)
-
-        def _intrin_vadd():
-            def _intrin_func(ins, outs):
-                return tvm.tir.call_packed("vadd", ins[0], ins[1], outs[0])
-
-            return tvm.te.decl_tensor_intrin(z.op, _intrin_func)
-
-        s = tvm.te.create_schedule(z.op)
-        s[z].tensorize(z.op.axis[0], _intrin_vadd())
-        tvm.lower(s, [x, y, z])
-
-    check(128, 16)
-    check_cache_write(129, 16)
-    check_compute_reuse()
-
-
-def test_tensorize_matmul():
-    n = 1024
-    m = n
-    l = n
-    A = te.placeholder((n, l), name="A")
-    B = te.placeholder((m, l), name="B")
-    k = te.reduce_axis((0, l), name="k")
-    C = te.compute((n, m), lambda i, j: te.sum(B[j, k] * A[i, k], axis=k), name="C")
-
-    def check(factor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        yo, yi = s[C].split(y, factor=factor)
-        gemv = intrin_gemv(factor, l)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        s[C].reorder(yo, ro, yi, ri)
-        gemv = intrin_gemv(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor_no_reset(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        s[C].reorder(yo, ro, yi, ri)
-        gemv = intrin_gemv_no_reset(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    def check_rfactor_no_reset_multi_reduction(factor, rfactor):
-        s = te.create_schedule(C.op)
-        x, y = C.op.axis
-        rk = C.op.reduce_axis[0]
-        yo, yi = s[C].split(y, factor=factor)
-        ro, ri = s[C].split(rk, factor=rfactor)
-        roo, roi = s[C].split(ro, factor=2)
-        s[C].reorder(yo, roo, roi, yi, ri)
-        gemv = intrin_gemv_no_reset(factor, rfactor)
-        s[C].tensorize(yi, gemv)
-        s = s.normalize()
-        dom_map = tvm.te.schedule.InferBound(s)
-        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
-        out_dom, in_dom = finfer(s[C], dom_map)
-        tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1))
-        tvm.ir.assert_structural_equal(out_dom[y].extent, factor)
-        tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor)
-        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
-        body = fmatch(s[C], out_dom, in_dom, gemv)
-        ana = tvm.arith.Analyzer()
-        tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0]))
-        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-        tvm.lower(s, [A, B, C])
-
-    check(T.int32(16))
-    check_rfactor(T.int32(16), T.int32(16))
-    check_rfactor_no_reset(T.int32(16), T.int32(16))
-    check_rfactor_no_reset_multi_reduction(T.int32(16), T.int32(16))
-
-
-# This tests whether algorithm and intrinsics expressions are simplified
-# as much as possible first and then checked for equality. See Issue #696
-def test_tensorize_op():
-    idxd = tvm.tir.indexdiv
-    idxm = tvm.tir.indexmod
-
-    def op_intrin():
-        bh = 9
-        bw = 9
-        x = te.placeholder((5, 5), name="A")
-        y = te.compute((bh, bw), lambda i, j: x[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)])
-
-        def intrin_func(ins, outs):
-            (xx,) = ins
-            zz = outs[0]
-            return tvm.tir.call_packed("op", xx, zz)
-
-        return te.decl_tensor_intrin(y.op, intrin_func, default_buffer_params={"offset_factor": 2})
-
-    A = te.placeholder((5, 5), name="A")
-    B = te.compute((9, 9), lambda i, j: A[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)])
-    bt = op_intrin()
-    s = te.create_schedule(B.op)
-
-    x, y = B.op.axis
-    s[B].tensorize(x, bt)
-    s = s.normalize()
-    tvm.lower(s, [A, B])
-
-
-# This test asserts that tensorize does not have any effect on
-# TensorComputeOp operations
-def test_tensorize_tensor_compute_op():
-    # an intrinsic called "multivadd" whose definition (pattern)
-    # is a loop of another intrinsic called "vadd"
-    def intrin_multivadd(n):
-        n_a = te.var("n_a")
-        Ab = tvm.tir.decl_buffer((n,), "float32", strides=[n_a])
-
-        n_b = te.var("n_b")
-        Bb = tvm.tir.decl_buffer((n,), "float32", strides=[n_b])
-
-        n_c = te.var("n_c")
-        Cb = tvm.tir.decl_buffer((n,), "float32", strides=[n_c])
-
-        z = te.compute(
-            (n,),
-            lambda i: tvm.tir.call_extern(
-                "float32",
-                "vadd",
-                Ab.access_ptr("w", offset=n_a * i),
-                Bb.access_ptr("r", offset=n_b * i),
-                Cb.access_ptr("r", offset=n_c * i),
-            ),
-        )
-
-        # replace the pattern with the multivadd call. I need to figure out
-        # how to pass it the right parameters.
-        def intrin_func(ins, outs):
-            return tvm.tir.call_packed("multivadd")
-
-        return te.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
-
-    def intrin_vadd(n):
-        dtype = "float32"
-        x = te.placeholder((n,), dtype=dtype, name="vx")
-        y = te.placeholder((n,), dtype=dtype, name="vy")
-        z = te.compute(x.shape, lambda i: x[i] + y[i], name="z")
-        s = te.create_schedule(z.op)
-
-        def create_buffer(t):
-            return tvm.tir.decl_buffer(t.shape, t.dtype, name="W" + t.name, offset_factor=16)
-
-        def intrin_func(ins, outs):
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    "float32",
-                    "vadd",
-                    ins[0].access_ptr("r"),
-                    ins[1].access_ptr("r"),
-                    outs[0].access_ptr("wr"),
-                )
-            )
-            return ib.get()
-
-        return te.decl_tensor_intrin(
-            z.op, intrin_func, binds={x: create_buffer(x), y: create_buffer(y), z: create_buffer(z)}
-        )
-
-    # cache_read, cache_write
-    M = 1024
-    factor = 16
-    dtype = "float32"
-
-    A = te.placeholder((M // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((M // factor, factor), name="B", dtype=dtype)
-
-    vadd = intrin_vadd(factor)
-    C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C")
-
-    s = te.create_schedule(C.op)
-    multivadd = intrin_multivadd(64)
-    s[C].tensorize(C.op.axis[0], multivadd)
-    s = s.normalize()
-    dom_map = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-    # The loop that we tried to tensorize still exists in the code
-    # That means tensorize didn't work as expected
-    assert isinstance(stmt.body, tvm.tir.For)
-    assert stmt.body.loop_var.name == C.op.axis[0].var.name
-
-
-if __name__ == "__main__":
-    test_tensorize_vadd()
-    test_tensorize_matmul()
-    test_tensorize_op()
-    test_tensorize_tensor_compute_op()
diff --git a/tests/python/te/test_te_tensor.py b/tests/python/te/test_te_tensor.py
index 6958888e9bb6..31d6b1f4eb3a 100644
--- a/tests/python/te/test_te_tensor.py
+++ b/tests/python/te/test_te_tensor.py
@@ -128,91 +128,6 @@ def fidentity(t0, t1):
     T0, T1 = te.compute((m,), lambda i: mysum((idx[i, k], val[i, k]), axis=k, where=cond), name="T")
 
 
-def test_tensor_compute1():
-    m = 1024
-    factor = 16
-    dtype = "float32"
-
-    def intrin_vadd(n):
-        x = te.placeholder((n,))
-        y = te.placeholder((n,))
-        z = te.compute(x.shape, lambda i: x[i] + y[i])
-
-        def intrin_func(ins, outs):
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(
-                tvm.tir.call_extern(
-                    outs[0].dtype,
-                    "vadd",
-                    ins[0].access_ptr("r"),
-                    ins[1].access_ptr("r"),
-                    outs[0].access_ptr("wr"),
-                )
-            )
-            return ib.get()
-
-        return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-
-    vadd = intrin_vadd(factor)
-
-    A = te.placeholder((m // factor, factor), name="A", dtype=dtype)
-    B = te.placeholder((m // factor, factor), name="B", dtype=dtype)
-    C = te.compute((m // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]))
-
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        stmt = tvm.lower(s, [A, B, C])["main"].body
-    assert isinstance(stmt.body, tvm.tir.Evaluate)
-
-
-def test_tensor_compute2():
-    M = 2048
-    N = 1024
-    L = 1024
-    factor = 16
-    factor1 = 32
-    factor2 = 32
-    dtype = "float32"
-
-    def intrin_gemm(m, n, l):
-        k = te.reduce_axis((0, l))
-        x = te.placeholder((m, l))
-        y = te.placeholder((n, l))
-        # in theory, no relation
-        z = te.compute((m, n), lambda i, j: te.sum(x[i][k] * y[j][k], axis=k))
-
-        def intrin_func(ins, outs):
-            x_ptr = ins[0].access_ptr("r")
-            y_ptr = ins[1].access_ptr("r")
-            z_ptr = outs[0].access_ptr("w")
-            body = tvm.tir.call_packed("gemv", x_ptr, y_ptr, z_ptr, m, n, l)
-            reset = tvm.tir.call_packed("fill_zero", z_ptr, m, n)
-            update = tvm.tir.call_packed("gemv_add", x_ptr, y_ptr, z_ptr, m, n, l)
-            return body, reset, update
-
-        return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n})
-
-    vgemm = intrin_gemm(factor1, factor2, factor)
-
-    A = te.placeholder((M // factor1, L // factor, factor1, factor), name="A", dtype=dtype)
-    B = te.placeholder((N // factor2, L // factor, factor2, factor), name="B", dtype=dtype)
-    k = te.reduce_axis((0, L // factor), name="k")
-    C = te.compute(
-        (M // factor1, N // factor2, factor1, factor2),
-        lambda i, j: vgemm(
-            A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k
-        ),
-    )
-
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        stmt = tvm.lower(s, [A, B, C])["main"].body
-    assert isinstance(stmt.body.body[0], tvm.tir.Evaluate)
-    assert isinstance(stmt.body.body[1].body, tvm.tir.Evaluate)
-
-
 def test_tensor_scan():
     m = te.size_var("m")
     n = te.size_var("n")
@@ -251,7 +166,7 @@ def test_extern():
     A = te.placeholder((m,), name="A")
 
     def extern_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
+        assert isinstance(ins[0], tvm.tir.Buffer)
         return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, m)
 
     B = te.extern((m,), [A], extern_func)
@@ -264,7 +179,7 @@ def test_extern_multi_out():
     B = te.compute((m,), lambda i: A[i] * 10)
 
     def extern_func(ins, outs):
-        assert isinstance(ins[0], tvm.te.schedule.Buffer)
+        assert isinstance(ins[0], tvm.tir.Buffer)
         return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, outs[1].data, m)
 
     res = te.extern([A.shape, A.shape], [A, B], extern_func)
@@ -278,13 +193,7 @@ def test_tuple_inputs():
     A0 = te.placeholder((m, n), name="A0")
     A1 = te.placeholder((m, n), name="A1")
     T0, T1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="T")
-    s = te.create_schedule(T0.op)
-
-    for i in range(len(T0.shape)):
-        assert T0.shape[i] == T1.shape[i]
-    assert T0.op == T1.op
-    assert T0.value_index == 0
-    assert T1.value_index == 1
+    s = te.create_prim_func([A0, A1, T0])
 
 
 def test_tuple_with_different_deps():
@@ -295,25 +204,7 @@ def test_tuple_with_different_deps():
     B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="B")
     C = te.compute((m, n), lambda i, j: B0[i, j] + 4, name="C")
 
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=10)
-    s[B0.op].compute_at(s[C], xo)
-    sch = s.normalize()
-    bounds = tvm.te.schedule.InferBound(sch)
-    stmt = tvm.te.schedule.ScheduleOps(sch, bounds)
-
-    def get_B1_realize(x):
-        if (
-            isinstance(x, tvm.tir.ProducerRealize)
-            and x.producer.op == B1.op
-            and x.producer.value_index == 1
-        ):
-            ret.append(x)
-
-    ret = []
-    tvm.tir.stmt_functor.post_order_visit(stmt, get_B1_realize)
-
-    assert stmt.producer == C and len(ret) == 1
+    te.create_prim_func([A0, A1, C])
 
 
 def test_tensor_inputs():
@@ -322,91 +213,6 @@ def test_tensor_inputs():
     assert tuple(y.op.input_tensors) == (x,)
 
 
-def test_tensor_pool():
-    def intrin_pool():
-        A = te.placeholder((64, 16, 16), name="A")
-        kh = te.reduce_axis((0, 3), name="kh")
-        kw = te.reduce_axis((0, 3), name="kw")
-        P = te.compute(
-            (64, 14, 14),
-            lambda c, oh, ow: tvm.te.max(A[c, oh + kh, ow + kw], axis=[kh, kw]),
-            name="p",
-        )
-
-        def intrin_func(ins, outs):
-            dinp = ins[0]
-            dout = outs[0]
-            return tvm.tir.call_packed("op", dinp, dout)
-
-        return te.decl_tensor_intrin(P.op, intrin_func, default_buffer_params={"offset_factor": 1})
-
-    A = te.placeholder((1, 64, 16, 16), name="A")
-    P = pool2d(
-        data=A, kernel=(3, 3), stride=(1, 1), dilation=(1, 1), padding=(0, 0, 0, 0), pool_type="max"
-    )
-    s = te.create_schedule(P.op)
-    _, oh, _, _ = P.op.axis
-    intrin = intrin_pool()
-    s[P].tensorize(oh, intrin)
-    tvm.lower(s, [A, P])
-
-
-def test_tensor_scalar_mixed():
-    # test te with tensor and scalar
-    a = np.array(np.random.uniform(size=(10,)), "float32")
-    b = np.array(np.random.uniform(size=(1))[0], "float32")
-    c = np.array(np.random.uniform(size=(10,)), "float32")
-
-    @tvm.register_func("tvm.test_tensor_scalar_scale")
-    def my_scale(tensor, scalar, out):
-        out_np = tensor.numpy() * scalar.numpy()
-        tvm.nd.array(out_np).copyto(out)
-
-    A = te.placeholder(a.shape, name="A")
-    B = te.placeholder(b.shape, name="B")
-    C = te.extern(
-        a.shape,
-        [A, B],
-        lambda ins, outs: tvm.tir.call_packed(
-            "tvm.test_tensor_scalar_scale", ins[0], ins[1], outs[0]
-        ),
-        name="C",
-    )
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A, B, C], "llvm")
-
-    ta = tvm.nd.array(a)
-    tb = tvm.nd.array(b)
-    tc = tvm.nd.array(c)
-    f(ta, tb, tc)
-    tvm.testing.assert_allclose(a * b, tc.numpy())
-
-
-def test_tensor_scalar():
-    # test te with scalar shape
-    a = np.array(np.random.uniform(size=(1))[0], "float32")
-    b = np.array(0.0, "float32")
-
-    @tvm.register_func("tvm.test_tensor_scalar_copy")
-    def mycopy(x, y):
-        x.copyto(y)
-
-    A = te.placeholder(a.shape, name="A")
-    B = te.extern(
-        a.shape,
-        [A],
-        lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_copy", ins[0], outs[0]),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-    f = tvm.build(s, [A, B], "llvm")
-
-    ta = tvm.nd.array(a)
-    tb = tvm.nd.array(b)
-    f(ta, tb)
-    tvm.testing.assert_allclose(ta.numpy(), tb.numpy())
-
-
 if __name__ == "__main__":
     test_tensor()
     test_rank_zero()
@@ -426,6 +232,3 @@ def mycopy(x, y):
     test_tuple_inputs()
     test_tuple_with_different_deps()
     test_tensor_inputs()
-    test_tensor_pool()
-    test_tensor_scalar_mixed()
-    test_tensor_scalar()
diff --git a/tests/python/te/test_te_transform_layout.py b/tests/python/te/test_te_transform_layout.py
deleted file mode 100644
index 375fe4a24d57..000000000000
--- a/tests/python/te/test_te_transform_layout.py
+++ /dev/null
@@ -1,592 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import functools
-import sys
-import pytest
-
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.tir.stmt_functor import post_order_visit
-from tvm.driver.build_module import schedule_to_module
-
-dtype = tvm.testing.parameter("int32")
-
-
-def flatten_all_indices(preflatten_shape):
-    def mapping(*indices):
-        output = 0
-        for index, size in zip(indices, preflatten_shape):
-            output = output * size + index
-        return [output]
-
-    return mapping
-
-
-def unpack_flattened_indices(preflatten_shape):
-    def mapping(i):
-        output = []
-        for dim in reversed(preflatten_shape):
-            output.append(i % dim)
-            i //= dim
-        return output[::-1]
-
-    return mapping
-
-
-def traverse(s, op, callback):
-    visited = set()
-
-    def _traverse(op):
-        if op in visited:
-            return
-        visited.add(op)
-        for tensor in op.input_tensors:
-            _traverse(tensor.op)
-        callback(op)
-
-    _traverse(op)
-
-
-class TestCompareAgainstExplicitReshape:
-    A_definition_style = tvm.testing.parameter(
-        "explicit_reshape",
-        "transform_layout",
-    )
-    B_definition_style = tvm.testing.parameter(
-        "explicit_reshape",
-        "transform_layout",
-    )
-
-    reordered_shape = tvm.testing.parameter((2, 3, 4))
-
-    @tvm.testing.fixture
-    def n_items(self, reordered_shape):
-        return functools.reduce(lambda x, y: x * y, reordered_shape, 1)
-
-    @tvm.testing.fixture
-    def fphysical_layout(self, reordered_shape):
-        return unpack_flattened_indices(reordered_shape)
-
-    @tvm.testing.fixture
-    def fcompute(self, A_definition_style, B_definition_style, reordered_shape, n_items, dtype):
-        assert A_definition_style in ["explicit_reshape", "transform_layout"]
-        assert B_definition_style in ["explicit_reshape", "transform_layout"]
-
-        def func():
-            if A_definition_style == "explicit_reshape":
-                A_input = te.placeholder(shape=reordered_shape, name="A_input", dtype=dtype)
-                A = te.compute(
-                    shape=(n_items,),
-                    fcompute=lambda i: A_input[
-                        i // (reordered_shape[1] * reordered_shape[2]),
-                        (i // reordered_shape[2]) % reordered_shape[1],
-                        i % reordered_shape[2],
-                    ],
-                    name="A",
-                )
-
-            elif A_definition_style == "transform_layout":
-                A = te.placeholder(shape=(n_items,), name="A", dtype=dtype)
-                A_input = A
-
-            B = te.compute(shape=A.shape, fcompute=lambda i: A[i], name="B")
-
-            if B_definition_style == "explicit_reshape":
-                B_output = te.compute(
-                    shape=reordered_shape,
-                    fcompute=lambda i, j, k: B[
-                        i * reordered_shape[1] * reordered_shape[2] + j * reordered_shape[2] + k
-                    ],
-                    name="B_output",
-                )
-            elif B_definition_style == "transform_layout":
-                B_output = B
-
-            return A_input, B_output
-
-        return func
-
-    @tvm.testing.fixture
-    def fschedule(self, A_definition_style, B_definition_style, fphysical_layout):
-        def func(outs):
-            outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
-            s = te.create_schedule([x.op for x in outs])
-
-            def callback(op):
-                if (op.name == "A" and A_definition_style == "transform_layout") or (
-                    op.name == "B" and B_definition_style == "transform_layout"
-                ):
-                    s[op].transform_layout(fphysical_layout)
-
-            traverse(s, outs[0].op, callback)
-            return s
-
-        return func
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_external_reshape(
-        self, target, dev, fcompute, fschedule, n_items, reordered_shape, dtype
-    ):
-        A, B = fcompute()
-        s = fschedule(B)
-
-        func = tvm.build(s, [A, B], target=target, name="copy_reshape")
-
-        a_np = np.arange(n_items).reshape(reordered_shape).astype(dtype)
-        b_np = np.arange(n_items).reshape(reordered_shape).astype(dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev)
-
-        func(a, b)
-
-        tvm.testing.assert_allclose(b.numpy(), b_np)
-
-    @tvm.testing.parametrize_targets("llvm")
-    def test_internal_reshape(self, target, dev, n_items, reordered_shape, dtype, fphysical_layout):
-        # The reshaping of the buffer gets flattened away in
-        # StorageFlatten.  Therefore, testing the behavior by running only
-        # ApplyLayoutTransforms.
-        logical_shape = (n_items,)
-        A = te.placeholder(logical_shape, name="A", dtype=dtype)
-        B = te.compute(shape=logical_shape, fcompute=lambda i: A[i], name="B")
-        C = te.compute(shape=logical_shape, fcompute=lambda i: B[i], name="C")
-
-        s = te.create_schedule(C.op)
-        s[B].transform_layout(fphysical_layout)
-
-        mod = schedule_to_module(s, [A, C])
-        body = mod["main"].body
-
-        def walk_buffer_interactions(stmt, callback):
-            buffer_classes = [
-                tvm.tir.BufferLoad,
-                tvm.tir.BufferStore,
-                tvm.tir.BufferRealize,
-            ]
-
-            def inner(node):
-                if (type(node) in buffer_classes) and node.buffer.name == "B":
-                    callback(node)
-
-            post_order_visit(stmt, inner)
-
-        # All references to the buffer are the same object
-        def check_references():
-            buffer_object = None
-
-            def inner(node):
-                nonlocal buffer_object
-                if buffer_object is None:
-                    buffer_object = node.buffer
-                else:
-                    assert node.buffer.same_as(buffer_object)
-
-            return inner
-
-        # The buffer has the expected shape.
-        def check_shape(expected_shape):
-            def inner(node):
-                assert tuple(node.buffer.shape) == expected_shape
-
-            return inner
-
-        # Before the transform, the buffer should be in the logical shape.
-        walk_buffer_interactions(body, check_references())
-        walk_buffer_interactions(body, check_shape(logical_shape))
-
-        mod = tvm.tir.transform.ApplyLayoutTransforms()(mod)
-        body = mod["main"].body
-
-        # After the transform, the buffer should be in the physical shape.
-        walk_buffer_interactions(body, check_references())
-        walk_buffer_interactions(body, check_shape(reordered_shape))
-
-
-class Test2DPhysicalLayout:
-    transform_A = tvm.testing.parameter(
-        "1d_A",
-        "2d_A",
-        "2d_rev_A",
-        "3d_A",
-    )
-    transform_B = tvm.testing.parameter(
-        "1d_B",
-        "2d_B",
-        "2d_rev_B",
-        "3d_B",
-    )
-
-    @staticmethod
-    def extract_logical_indices(stmt):
-        output = {}
-
-        # Since the for loops can be reordered by the layout
-        # transformation, identify the loop corresponding to each
-        # pre-transformation axis based on the iteration extent.
-        def callback(node):
-            if isinstance(node, tvm.tir.For):
-                output[node.loop_var] = node.extent.value
-
-        post_order_visit(stmt, callback)
-        return sorted(output, key=output.get)
-
-    def get_transform(self, name):
-        name = name[:-2]
-        if name == "1d":
-            return None
-        elif name == "2d":
-            return lambda i, j, k: [i, j, te.AXIS_SEPARATOR, k]
-        elif name == "2d_rev":
-            return lambda i, j, k: [k, j, te.AXIS_SEPARATOR, i]
-        elif name == "3d":
-            return lambda i, j, k: [i, te.AXIS_SEPARATOR, j, te.AXIS_SEPARATOR, k]
-        else:
-            raise ValueError(f"Unknown transformation: {name}")
-
-    def transform_indices(self, name, logical_shape, logical_index_vars):
-        name = name[:-2]
-
-        i, j, k = logical_index_vars
-
-        if name == "1d":
-            return [i * (logical_shape[1] * logical_shape[2]) + j * logical_shape[2] + k]
-        elif name == "2d":
-            return [i * logical_shape[1] + j, k]
-        elif name == "2d_rev":
-            return [k * logical_shape[1] + j, i]
-        elif name == "3d":
-            return [i, j, k]
-        else:
-            raise ValueError(f"Unknown transformation: {name}")
-
-    def test_2d_physical(self, dtype, transform_A, transform_B):
-        logical_shape = (2, 3, 4)
-        A = te.placeholder(shape=logical_shape, dtype=dtype, name="A")
-        B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B")
-
-        s = te.create_schedule(B.op)
-
-        func = self.get_transform(transform_A)
-        if func:
-            s[A].transform_layout(func)
-
-        func = self.get_transform(transform_B)
-        if func:
-            s[B].transform_layout(func)
-
-        # If the two buffers are accessed with the same indices, CSE
-        # will replace them with a Let binding.  Since this makes it
-        # harder to test what the transformed indices are, disabling
-        # the CSE pass for this test.
-        with tvm.transform.PassContext(disabled_pass=["tir.CommonSubexprElimTIR"]):
-            mod = tvm.lower(s, [A, B])
-
-        logical_index_vars = self.extract_logical_indices(mod["main"].body)
-        expected_indices_A = self.transform_indices(transform_A, logical_shape, logical_index_vars)
-        expected_indices_B = self.transform_indices(transform_B, logical_shape, logical_index_vars)
-
-        def callback(node):
-            if type(node) in [tvm.tir.BufferLoad, tvm.tir.BufferStore]:
-                name = node.buffer.name
-                if name == "A":
-                    expected_indices = expected_indices_A
-                elif name == "B":
-                    expected_indices = expected_indices_B
-                else:
-                    raise RuntimeError(f"Unexpected buffer: {name}")
-
-                tvm.ir.assert_structural_equal(expected_indices, node.indices)
-
-        post_order_visit(mod["main"].body, callback)
-
-
-class TestTransformedSchedules:
-    logical_shape = tvm.testing.parameter((4, 6, 40))
-
-    transform_names = [
-        None,
-        "reverse",
-        "flatten_all",
-        "factor_last_by_4",
-    ]
-
-    transform_A = tvm.testing.parameter(by_dict={f"A_{t}": t for t in transform_names})
-    transform_B = tvm.testing.parameter(
-        by_dict={f"B_{t}": t for t in transform_names if t is not None}
-    )
-
-    after_transform = tvm.testing.parameter(None)
-
-    def make_transform(self, logical_shape, transform_name):
-        if transform_name is None:
-            return lambda *indices: indices
-        elif transform_name == "reverse":
-            return lambda *indices: indices[::-1]
-        elif transform_name == "flatten_all":
-            return flatten_all_indices(logical_shape)
-        elif transform_name == "factor_last_by_4":
-            return lambda *indices, n: [*indices, n // 4, n % 4]
-        else:
-            raise NotImplementedError(f"Unknown transformation {transform_name}")
-
-    def make_transformed_shape(self, logical_shape, transform_name):
-        if transform_name is None:
-            return logical_shape
-        elif transform_name == "reverse":
-            return logical_shape[::-1]
-        elif transform_name == "flatten_all":
-            num_elements = functools.reduce(lambda x, y: x * y, logical_shape, 1)
-            return [num_elements]
-        elif transform_name == "factor_last_by_4":
-            *indices, n = logical_shape
-            return [*indices, n // 4, 4]
-        else:
-            raise NotImplementedError(f"Unknown transformation {transform_name}")
-
-    @tvm.testing.fixture
-    def expected_loop_order(self, logical_shape, transform_B, after_transform):
-        shape = self.make_transformed_shape(logical_shape, transform_B)
-
-        if after_transform == "reorder":
-            shape = shape[::-1]
-
-        elif after_transform == "split":
-            shape = [
-                *shape[:-1],
-                2,
-                shape[-1] // 2,
-            ]
-
-        elif after_transform == "fuse":
-            fused_size = shape[0] if transform_B == "flatten_all" else shape[0] * shape[1]
-            shape = [fused_size, *shape[2:]]
-
-        return shape
-
-    @tvm.testing.fixture
-    def schedule(self, logical_shape, dtype, transform_A, transform_B, after_transform):
-        A = te.placeholder(shape=logical_shape, dtype=dtype, name="A")
-        B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B")
-
-        s = te.create_schedule(B.op)
-
-        if transform_A:
-            s[A].transform_layout(self.make_transform(logical_shape, transform_A))
-
-        iter_vars = s[B].transform_layout(self.make_transform(logical_shape, transform_B))
-        iter_vars = list(iter_vars)
-
-        if after_transform == "reorder":
-            s[B].reorder(*iter_vars[::-1])
-
-        elif after_transform == "split":
-            s[B].split(iter_vars[-1], nparts=2)
-
-        elif after_transform == "fuse":
-            to_fuse = iter_vars[:2]
-            s[B].fuse(*iter_vars[:2])
-
-        return {
-            "schedule": s,
-            "tensors": [A, B],
-            "iter_vars": iter_vars,
-        }
-
-    def compare_tir_loop_order(self, stmt, expected_loop_order):
-        def collect_loops(node):
-            output = []
-
-            def callback(node):
-                if isinstance(node, tvm.tir.For):
-                    output.append(node)
-
-            post_order_visit(node, callback)
-            return output[::-1]
-
-        loops = collect_loops(stmt)
-        loop_order = [loop.extent for loop in loops]
-
-        np.testing.assert_array_equal(loop_order, expected_loop_order)
-
-    def test_tir_loop_order(self, schedule, expected_loop_order):
-        func = tvm.lower(schedule["schedule"], schedule["tensors"])["main"]
-        self.compare_tir_loop_order(func.body, expected_loop_order)
-
-    def test_te_loop_order(self, schedule, expected_loop_order):
-        s = schedule["schedule"]
-        A, B = schedule["tensors"]
-        iter_vars = schedule["iter_vars"]
-
-        # No reduction axis, so all leaf_iter_vars are over the data
-        # array, and should have the new iteration variables.
-        extents = [int(iter_var.dom.extent) for iter_var in s[B].leaf_iter_vars]
-        np.testing.assert_array_equal(extents, expected_loop_order)
-
-        # layout_transform should return the new iteration variables.
-        extents = [int(iter_var.dom.extent) for iter_var in iter_vars]
-        np.testing.assert_array_equal(extents, expected_loop_order)
-
-    @pytest.mark.parametrize("after_transform", ["reorder", "split", "fuse"])
-    def test_use_transformed_axes(
-        self, schedule, expected_loop_order, transform_A, transform_B, after_transform
-    ):
-        s = schedule["schedule"]
-        A, B = schedule["tensors"]
-
-        func = tvm.lower(s, [A, B])["main"]
-        self.compare_tir_loop_order(func.body, expected_loop_order)
-
-
-class TestTransformCache:
-    A_size = tvm.testing.parameter(16)
-
-    transform_A = tvm.testing.parameter(by_dict={"transformA": True, "": False})
-    transform_B = tvm.testing.parameter(by_dict={"transformB": True, "": False})
-    cache_A = tvm.testing.parameter(by_dict={"cacheA": True, "": False})
-    cache_B = tvm.testing.parameter(by_dict={"cacheB": True, "": False})
-
-    @tvm.testing.fixture
-    def schedule_args(self, target, A_size, transform_A, transform_B, cache_A, cache_B, dtype):
-        A = te.placeholder(shape=[A_size], dtype=dtype, name="A")
-        B = te.compute(A.shape, lambda i: A[i], name="B")
-        s = te.create_schedule(B.op)
-
-        requires_thread_bind = "gpu" in tvm.target.Target(target).keys
-        thread_x = te.thread_axis("threadIdx.x")
-        thread_y = te.thread_axis("threadIdx.y")
-        thread_z = te.thread_axis("threadIdx.z")
-
-        if cache_A:
-            AA = s.cache_read(A, "shared", [B])
-            if requires_thread_bind:
-                s[AA].bind(AA.op.axis[0], thread_x)
-
-        if cache_B:
-            BB = s.cache_write(B, "shared")
-            if requires_thread_bind:
-                s[BB].bind(BB.op.axis[0], thread_y)
-
-        if transform_A:
-            A_axis = s[A].transform_layout(lambda i: [i // 4, i % 4])
-
-        if transform_B:
-            B_axis = s[B].transform_layout(lambda i: [i // 4, i % 4])
-        else:
-            B_axis = B.op.axis
-
-        if requires_thread_bind:
-            s[B].bind(B_axis[0], thread_z)
-
-        return [s, [A, B]]
-
-    @tvm.testing.fixture
-    def ref_data(self, A_size, dtype, transform_A, transform_B):
-        a_np = (100 * np.random.uniform(size=A_size)).astype(dtype)
-        b_np = a_np
-
-        if transform_A:
-            a_np = a_np.reshape((-1, 4))
-
-        if transform_B:
-            b_np = b_np.reshape((-1, 4))
-
-        return a_np, b_np
-
-    def test_lower(self, schedule_args):
-        tvm.lower(*schedule_args)
-
-    def test_execute(self, target, dev, schedule_args, ref_data, dtype):
-        func = tvm.build(*schedule_args, target=target)
-
-        a_np, b_np = ref_data
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev)
-
-        func(a, b)
-
-        if "int" in dtype:
-            np.testing.assert_equal(b.numpy(), b_np)
-        else:
-            tvm.testing.assert_allclose(b.numpy(), b_np)
-
-
-def test_transform_with_reduction():
-    # To trigger this failure mode, the computation must use a
-    # reduction axis,
-    A = te.placeholder([16, 32, 64], dtype="float32", name="A")
-    k = te.reduce_axis((0, A.shape[-1]), name="k")
-    B = te.compute(A.shape[:-1], lambda i, j: te.sum(A[i, j, k], axis=[k]))
-    s = te.create_schedule(B.op)
-
-    # And the output of the computation must have a layout
-    # transformation applied.
-    s[B].transform_layout(lambda i, j: [j, i])
-
-    # When present, the failure occurred during tvm.lower, during the
-    # call to `tvm::te::PassDownBitMaskOr`.
-    tvm.lower(s, [A, B])
-
-
-shape, transform = tvm.testing.parameters(
-    ([1, 8], lambda n, i: [i, n]),
-    ([1, 1, 8], lambda i, j, k: [j, te.AXIS_SEPARATOR, i, k]),
-    ([1, 1, 8], lambda i, j, k: [i, te.AXIS_SEPARATOR, j, k]),
-)
-
-
-def test_size_one_buffer(shape, transform):
-    # This test is to catch a failure mode that occurred if a
-    # transformation were applied to a te.compute buffer, and one of
-    # the dimensions of the buffer was 1.  Prior to bugfix,
-    # arith::DetectIterMap would fold the variable as a constant,
-    # causing an error when attempting to solve for the variable using
-    # arith::InverseAffineIterMap.
-
-    dtype = "int8"
-    A = te.placeholder(shape, dtype, name="A")
-    B = te.compute(
-        shape=A.shape,
-        fcompute=lambda *indices: A[indices].astype(dtype),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-
-    # If layout transformation is on the output buffer, and any
-    # dimension of the output buffer is 1, failure occurs in
-    # CheckFusePattern.
-    s[B].transform_layout(transform)
-
-
-def test_non_divisible_transform_raises_error():
-    A = te.placeholder([1, 3, 8, 8])
-    B = te.compute(A.shape, lambda *indices: A[indices])
-    s = te.create_schedule(B.op)
-
-    transform = lambda n, c, h, w: [n, c // 4, h, w, c % 4]
-    # Error occurs here, because the transformation would introduce
-    # padding.  Padded transforms are supported in TIR-based
-    # schedules.
-    with pytest.raises(tvm.TVMError):
-        s[B].transform_layout(transform)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_usedef.py b/tests/python/tir-analysis/test_tir_analysis_usedef.py
deleted file mode 100644
index 940355e1415c..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_usedef.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import pytest
-import tvm
-from tvm import te
-
-
-@pytest.mark.xfail
-def test_loop_dependent_allocate():
-    N = te.size_var("N")
-    A = te.placeholder((2 * N,), "float32", "A")
-    C = te.compute((N,), lambda i: A[2 * i] + A[i + 1], name="C")
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "local", [C])
-    s[AA].compute_at(s[C], s[C].op.axis[0])
-    # this line should fail due to IRUseDefAnalysis sees an allocate statement
-    # referencing undefined variable
-    tvm.lower(s, [A, C])
-
-
-if __name__ == "__main__":
-    test_loop_dependent_allocate()
diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py b/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
deleted file mode 100644
index 45a8a8138bd5..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Test gpu code verifier"""
-import tvm
-from tvm import te
-from tvm import topi
-import tvm.testing
-
-
-def get_verify_pass(valid, **kwargs):
-    def _fverify(f, *_):
-        valid[0] = tvm.tir.analysis.verify_gpu_code(f, kwargs)
-        return f
-
-    return tvm.tir.transform.prim_func_pass(_fverify, opt_level=0)
-
-
-@tvm.testing.requires_gpu
-def test_shared_memory():
-    def check_shared_memory(storage_scope, dtype):
-        N = 1024
-        M = 128
-
-        tvm_type = tvm.runtime.DataType(dtype)
-        type_size = tvm_type.bits // 8 * tvm_type.lanes
-
-        A = te.placeholder((N,), name="A", dtype=dtype)
-        B = te.compute((N,), lambda i: A[i], name="B")
-
-        s = te.create_schedule([B.op])
-        AA = s.cache_read(A, storage_scope, [B])
-        o, i = s[B].split(s[B].op.axis[0], M)
-        s[AA].compute_at(s[B], o)
-        s[B].bind(o, te.thread_axis("blockIdx.x"))
-        s[B].bind(i, te.thread_axis("threadIdx.x"))
-
-        # shared memory usage: M * sizeof(dtype) Bytes
-        # thread usage: M
-
-        for target in ["opencl", "cuda"]:
-            if not tvm.testing.device_enabled(target):
-                continue
-            valid = [None]
-            with tvm.transform.PassContext(
-                config={
-                    "tir.add_lower_pass": [
-                        (
-                            2,
-                            get_verify_pass(
-                                valid,
-                                max_shared_memory_per_block=type_size * M - 1,
-                                max_threads_per_block=M,
-                            ),
-                        )
-                    ]
-                }
-            ):
-                tvm.build(s, [A, B], target)
-            assert not valid[0]
-
-            with tvm.transform.PassContext(
-                config={
-                    "tir.add_lower_pass": [
-                        (
-                            2,
-                            get_verify_pass(
-                                valid,
-                                max_shared_memory_per_block=type_size * M,
-                                max_threads_per_block=M,
-                            ),
-                        )
-                    ]
-                }
-            ):
-                tvm.build(s, [A, B], target)
-            assert valid[0]
-
-    check_shared_memory("shared", "float32")
-    check_shared_memory("shared", "int8x4")
-    check_shared_memory("shared.dyn", "float32")
-
-
-@tvm.testing.requires_gpu
-def test_local_memory():
-    N = 1024
-    M = 128
-
-    A = te.placeholder((N,), name="A", dtype="float32")
-    B = te.compute((N,), lambda i: A[i], name="B")
-
-    s = te.create_schedule([B.op])
-    AA = s.cache_read(A, "local", [B])
-    o, i = s[B].split(s[B].op.axis[0], M)
-    s[AA].compute_at(s[B], o)
-    s[B].bind(o, te.thread_axis("blockIdx.x"))
-
-    # local memory usage: M * 4B
-    # thread usage: M
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_local_memory_per_block=4 * M, max_threads_per_block=1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_num_thread():
-    N = 1024
-    M = 128
-
-    A = te.placeholder((N,), name="A", dtype="float32")
-    B = te.compute((N,), lambda i: A[i], name="B")
-
-    s = te.create_schedule([B.op])
-    o, i = s[B].split(s[B].op.axis[0], M)
-
-    s[B].bind(o, te.thread_axis("threadIdx.x"))
-    s[B].bind(i, te.thread_axis("threadIdx.y"))
-
-    # shared memory usage: 0
-    # thread usage: N
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid,
-                            max_shared_memory_per_block=0,
-                            max_threads_per_block=N,
-                            max_thread_y=M - 1,
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid,
-                            max_shared_memory_per_block=0,
-                            max_threads_per_block=N,
-                            max_thread_y=M,
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_multiple_kernels():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-    C = te.compute((N, N), lambda i, j: B[i, j])
-
-    s = te.create_schedule([C.op])
-
-    s[C].bind(s[C].op.axis[1], te.thread_axis("threadIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x"))
-
-    # shared memory usage: 0
-    # thread usage: N
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, C], target)
-        assert not valid[0]
-
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [
-                    (
-                        2,
-                        get_verify_pass(
-                            valid, max_shared_memory_per_block=0, max_threads_per_block=N
-                        ),
-                    )
-                ]
-            }
-        ):
-            tvm.build(s, [A, C], target)
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_wrong_bind():
-    N = 1024
-
-    A = te.placeholder((N, N - 1), name="A")
-    B = te.compute((N, N - 1), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    # bind a thread axis to two loop axes with different lengths
-    s[B].bind(s[B].op.axis[0], te.thread_axis("threadIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x"))
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={
-                "tir.add_lower_pass": [(2, get_verify_pass(valid, max_threads_per_block=N * N))]
-            }
-        ):
-            tvm.build(s, [A, B], target)
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=64)
-    s[B].bind(jo, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize_half():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A", dtype="float16")
-    B = te.compute((N, N), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=8)
-    s[B].bind(jo, te.thread_axis("threadIdx.x"))
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vectorize_strided():
-    N = 1024
-
-    A = te.placeholder((N, N), name="A", dtype="float16")
-    B = te.compute((N, N), lambda i, j: A[j, i])
-
-    s = te.create_schedule([B.op])
-
-    i, j = s[B].op.axis
-
-    s[B].bind(i, te.thread_axis("blockIdx.x"))
-    jo, ji = s[B].split(j, factor=8)
-    s[B].vectorize(ji)
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-        with tvm.transform.PassContext(
-            config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]}
-        ):
-            tvm.lower(s, [A, B])
-        assert not valid[0]
-
-
-@tvm.testing.requires_gpu
-def test_vthread():
-    N = 1024
-
-    A = te.placeholder((N, 16), name="A")
-    B = te.compute((N, 16), lambda i, j: A[i, j])
-
-    s = te.create_schedule([B.op])
-
-    s[B].bind(s[B].op.axis[0], te.thread_axis("blockIdx.x"))
-    s[B].bind(s[B].op.axis[1], te.thread_axis("vthread"))
-
-    for target in ["opencl", "cuda"]:
-        if not tvm.testing.device_enabled(target):
-            continue
-
-        valid = [None]
-
-        for phase in [1, 2]:
-            with tvm.transform.PassContext(
-                config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=16))]}
-            ):
-                tvm.build(s, [A, B], target)
-            assert valid[0]
-
-            with tvm.transform.PassContext(
-                config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=15))]}
-            ):
-                tvm.build(s, [A, B], target)
-            assert not valid[0]
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py b/tests/python/tir-analysis/test_tir_analysis_verify_memory.py
deleted file mode 100644
index 4c89ff1185f7..000000000000
--- a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import pytest
-from tvm import te
-import tvm.testing
-
-# The following DLDeviceType/TVMDeviceExtType values
-# are originally defined in dlpack.h and c_runtime_api.h.
-gpu_devices = ["cuda", "opencl", "metal", "vulkan"]
-other_devices = ["llvm", "ext_dev"]
-
-
-# All computations are bound.
-# So VerifyMemory pass is expected to succeed.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_all_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-
-    # B is bound to threads.
-    s = te.create_schedule(B.op)
-    bx, tx = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(bx, te.thread_axis("blockIdx.x"))
-    s[B].bind(tx, te.thread_axis("threadIdx.x"))
-
-    mod = tvm.lower(s, [A, B])
-
-    for dev_type in gpu_devices + other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-# Computations are not bound.
-# So VerifyMemory pass fails when device type is GPU.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_not_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-
-    # B is not bound to threads.
-    s = te.create_schedule(B.op)
-
-    mod = tvm.lower(s, [A, B])
-
-    for dev_type in gpu_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            with pytest.raises(RuntimeError):
-                tvm.tir.transform.VerifyMemory()(binded_mod)
-
-    for dev_type in other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-# Computations are partially bound.
-# So VerifyMemory pass fails when device type is GPU.
-#
-@tvm.testing.uses_gpu
-def test_verify_memory_partially_bind():
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
-    C = te.compute(B.shape, lambda i: B[i] + 2.0, name="C")
-    D = te.compute(C.shape, lambda i: C[i] + 2.0, name="D")
-
-    # C is bound to threads, but B and D are not.
-    s = te.create_schedule([B.op, C.op, D.op])
-    bx, tx = s[C].split(C.op.axis[0], factor=64)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-
-    mod = tvm.lower(s, [A, B, C, D])
-
-    for dev_type in gpu_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            with pytest.raises(RuntimeError):
-                tvm.tir.transform.VerifyMemory()(binded_mod)
-
-    for dev_type in other_devices:
-        if tvm.testing.device_enabled(dev_type):
-            binded_mod = tvm.tir.transform.Apply(
-                lambda f: f.with_attr("target", tvm.target.Target(dev_type))
-            )(mod)
-            tvm.tir.transform.VerifyMemory()(binded_mod)
-
-
-if __name__ == "__main__":
-    test_verify_memory_all_bind()
-    test_verify_memory_not_bind()
-    test_verify_memory_partially_bind()
diff --git a/tests/python/tir-base/test_lower_build.py b/tests/python/tir-base/test_lower_build.py
index 0e610cc1659b..edb3ed351e5d 100644
--- a/tests/python/tir-base/test_lower_build.py
+++ b/tests/python/tir-base/test_lower_build.py
@@ -18,7 +18,6 @@
 import numpy as np
 
 import tvm
-from tvm import te
 from tvm.ir.module import IRModule
 from tvm.script import tir as T
 import tvm.testing
@@ -94,22 +93,6 @@ def main(
                 )
 
 
-def test_lower_build_te_schedule():
-    m, n, k = 128, 128, 128
-    axis_k = te.reduce_axis((0, k), "k")
-    A = te.placeholder((m, k), name="A")
-    B = te.placeholder((k, n), name="B")
-    C = te.compute((m, n), lambda x, y: te.sum(A[x, axis_k] * B[y, axis_k], axis=axis_k), name="C")
-    s = te.create_schedule(C.op)
-    # check lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        ir_mod = tvm.lower(s, [A, B, C])
-    tvm.ir.assert_structural_equal(ir_mod, LoweredModule)
-    # check building
-    mod = tvm.build(s, [A, B, C], target="llvm")
-    _check_module_with_numpy(mod)
-
-
 def test_lower_build_tir_func():
     # check lowering with the CSE pass disabled as otherwise it would do some commoning
     with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
diff --git a/tests/python/tir-base/test_tir_buffer.py b/tests/python/tir-base/test_tir_buffer.py
index d706e65d8186..791de769955e 100644
--- a/tests/python/tir-base/test_tir_buffer.py
+++ b/tests/python/tir-base/test_tir_buffer.py
@@ -178,85 +178,6 @@ def assert_simplified_equal(index_simplified, index_direct):
     assert_simplified_equal(index_simplified2, index_direct)
 
 
-@tvm.testing.requires_llvm
-def test_buffer_broadcast():
-    m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2")
-    n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2")
-    o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2")
-
-    A = te.placeholder((m0, m1, m2), name="A")
-    B = te.placeholder((n0, n1, n2), name="B")
-
-    C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name="C")
-
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    s = te.create_schedule(C.op)
-
-    def check():
-        fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb})
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check()
-
-
-@tvm.testing.requires_llvm
-def test_buffer_broadcast_expr():
-    n0, m0, x = te.size_var("n0"), te.size_var("m0"), te.size_var("x")
-    n1, m1 = te.size_var("n1"), te.size_var("m1")
-    o0, o1 = te.size_var("o0"), te.size_var("o1")
-
-    A = te.placeholder((m0, n0), name="A")
-    B = te.placeholder((m1, n1), name="B")
-    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C")
-
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
-    s = te.create_schedule(C.op)
-
-    def check_stride():
-        fadd = tvm.build(
-            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
-        )
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    def check_no_stride():
-        fadd = tvm.build(
-            s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc}
-        )
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    def check_auto_bind():
-        # Let build bind buffers
-        fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add")
-        dev = tvm.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev)
-        fadd(a, b, c, 4, 1)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_stride()
-    check_no_stride()
-    check_auto_bind()
-
-
 def test_buffer_flatten():
     """A buffer should flatten to a 1-d shape"""
     buf = tvm.tir.decl_buffer([16, 32])
diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py
index 1ee709191c41..8ab18bc84855 100644
--- a/tests/python/tir-base/test_tir_intrin.py
+++ b/tests/python/tir-base/test_tir_intrin.py
@@ -31,13 +31,19 @@ def test_nearbyint():
     )
     A = te.placeholder((m,), name="A")
     A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name="A")
-    s = te.create_schedule(A_rounded.op)
-    f = tvm.build(s, [A, A_rounded], "llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, A_rounded])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target="llvm")
+
     dev = tvm.cpu(0)
     n = 10
     a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev)
     a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev)
-    f(a, a_rounded)
+    func(a, a_rounded)
     # Note that numpys rint rounds to nearest integer with
     # ties to halfway is broken by rounding to even.
     # So that 1.5 and 2.5 will round 2.
@@ -79,13 +85,19 @@ def run_test(tvm_intrin, np_func):
         )
         A = te.placeholder((m,), name="A")
         B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B")
-        s = te.create_schedule(B.op)
-        f = tvm.build(s, [A, B], "llvm")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        func = tvm.build(sch.mod, target="llvm")
+
         dev = tvm.cpu(0)
         n = 10
         a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        f(a, b)
+        func(a, b)
         tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-5, rtol=1e-5)
 
     for func in test_funcs:
@@ -107,14 +119,20 @@ def run_test(tvm_intrin, np_func):
         A = te.placeholder((m,), name="A")
         B = te.placeholder((m,), name="B")
         C = te.compute((m,), lambda *i: tvm_intrin(A(*i), B(*i)), name="C")
-        s = te.create_schedule(C.op)
-        f = tvm.build(s, [A, B, C], "llvm")
+
+        # Convert to TIR and create schedule
+        mod = te.create_prim_func([A, B, C])
+        sch = tir.Schedule(mod)
+
+        # Build from scheduled TIR
+        func = tvm.build(sch.mod, target="llvm")
+
         dev = tvm.cpu(0)
         n = 10
         a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
         b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev)
         c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        f(a, b, c)
+        func(a, b, c)
         tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
     for func in test_funcs:
@@ -128,14 +146,20 @@ def test_ldexp():
     A = te.placeholder((m,), name="A")
     B = te.placeholder((m,), name="B", dtype="int32")
     C = te.compute((m,), lambda *i: tvm.tir.ldexp(A(*i), B(*i)), name="C")
-    s = te.create_schedule(C.op)
-    f = tvm.build(s, [A, B, C], "llvm")
+
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B, C])
+    sch = tir.Schedule(mod)
+
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target="llvm")
+
     dev = tvm.cpu(0)
     n = 10
     a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
     b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev)
     c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-    f(a, b, c)
+    func(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
 
@@ -162,17 +186,23 @@ def clz_np(x, dtype):
     m = te.var("m")
     A = te.placeholder((m,), name="A", dtype=dtype)
     B = te.compute((m,), lambda *i: tvm.tir.clz(A(*i)), name="B")
-    s = te.create_schedule(B.op)
 
+    # Convert to TIR and create schedule
+    mod = te.create_prim_func([A, B])
+    sch = tir.Schedule(mod)
+
+    # Apply scheduling primitives if target is Vulkan
     if target.kind.name == "vulkan":
-        bx, tx = s[B].split(B.op.axis[0], factor=64)
+        block = sch.get_block("B")
+        loop = sch.get_loops(block)[0]
+        bx, tx = sch.split(loop, factors=[None, 64])
+        sch.bind(bx, "blockIdx.x")
+        sch.bind(tx, "threadIdx.x")
 
-        s[B].bind(bx, te.thread_axis("blockIdx.x"))
-        s[B].bind(tx, te.thread_axis("threadIdx.x"))
+    # Build from scheduled TIR
+    func = tvm.build(sch.mod, target=target)
 
-    f = tvm.build(s, [A, B], target)
     n = 10
-
     highs = [10, 100, 1000, 10000, 100000, 1000000]
 
     if dtype == "int64":
@@ -182,7 +212,7 @@ def clz_np(x, dtype):
         a_np = np.random.randint(1, high=high, size=(n,), dtype=dtype)
         a = tvm.nd.array(a_np, dev)
         b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev)
-        f(a, b)
+        func(a, b)
         ref = clz_np(a_np, dtype)
         np.testing.assert_equal(b.numpy(), ref)
 
diff --git a/tests/python/tir-base/test_tir_ir_builder.py b/tests/python/tir-base/test_tir_ir_builder.py
deleted file mode 100644
index 8a39337575a7..000000000000
--- a/tests/python/tir-base/test_tir_ir_builder.py
+++ /dev/null
@@ -1,565 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-from tvm import te
-import numpy as np
-import tvm.testing
-from tvm.topi.math import cast
-
-
-def test_for():
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-    A = ib.allocate("float32", n, name="A", scope="global")
-    with ib.for_range(0, n, name="i") as i:
-        A[i] = A[i] + 1
-        with ib.for_range(0, 10, name="j") as j:
-            A[j] = A[j] + 2
-
-    body = ib.get()
-    assert isinstance(body, tvm.tir.Allocate)
-    body = body.body
-    assert isinstance(body, tvm.tir.For)
-    body = body.body
-    assert isinstance(body, tvm.tir.SeqStmt)
-    assert isinstance(body[1], tvm.tir.For)
-
-
-def test_if():
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-    A = ib.pointer("float32", name="A")
-    tmod = tvm.tir.truncmod
-    with ib.for_range(0, n, name="i") as i:
-        with ib.if_scope(tmod(i, 2) == 0):
-            A[i] = A[i] + 1
-        with ib.else_scope():
-            A[0] = A[i] + 2
-
-    body = ib.get()
-    assert A == A
-    assert isinstance(body, tvm.tir.For)
-    body = body.body
-    assert isinstance(body, tvm.tir.IfThenElse)
-    assert isinstance(body.condition, tvm.tir.EQ)
-    assert isinstance(body.then_case.indices[0], tvm.tir.Var)
-    assert list(body.else_case.indices) == [0]
-
-
-def test_prefetch():
-    A = tvm.tir.decl_buffer((10, 20), name="A")
-    ib = tvm.tir.ir_builder.create()
-    n = te.size_var("n")
-
-    with ib.for_range(0, n, name="i") as i:
-        ib.emit(
-            tvm.tir.Prefetch(
-                A, [tvm.ir.Range.from_min_extent(i + 1, 2), tvm.ir.Range.from_min_extent(0, 20)]
-            )
-        )
-    body = ib.get()
-    assert body.body.bounds[0].extent.value == 2
-
-
-def test_cpu():
-    n = 1024
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        max_threads = 8
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        with ib.for_range(0, n, name="i") as i:
-            Cptr[i] = Aptr[i] + Bptr[i]
-        body = ib.get()
-        return body
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vector_add",
-        dtype=dtype,
-    )
-    s = te.create_schedule(C.op)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-        # build and invoke the kernel.
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-        # launch the kernel.
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_target("llvm")
-
-
-@tvm.testing.requires_gpu
-def test_gpu():
-    n = te.size_var("n")
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    idxd = tvm.tir.indexdiv
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        max_threads = 32
-        ib = tvm.tir.ir_builder.create()
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(bx, "thread_extent", idxd(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        idx = bx.var * max_threads + tx.var
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        with ib.if_scope(ib.likely(idx < n)):
-            Cptr[idx] = Aptr[idx] + Bptr[idx]
-        body = ib.get()
-        return body
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vector_add",
-        dtype=dtype,
-    )
-    s = te.create_schedule(C.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    def check_target(target):
-        n = 1024
-        if not tvm.testing.device_enabled(target):
-            return
-        # build and invoke the kernel.
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-        # launch the kernel.
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
-
-    check_target("opencl")
-    check_target("cuda")
-
-
-def test_while_vectorize():
-    """Test while loop + vectorized inner loop"""
-
-    n = 64
-    num_iter = 10
-
-    def test_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-        n = C.shape[0]
-        A = ib.buffer_ptr(A)
-        B = ib.buffer_ptr(B)
-        C = ib.buffer_ptr(C)
-        i = ib.allocate("int32", (1,), name="i", scope="local")
-        i[0] = 0
-
-        with ib.for_range(0, n) as j:
-            C[j] = 0.0
-
-        with ib.while_loop(i[0] < num_iter):
-            with ib.for_range(0, n, kind="vectorize") as j:
-                C[j] += A[j] + B[j]
-            i[0] += 1
-
-        return ib.get()
-
-    def check_target(target, ir):
-        dtype = "float32"
-        A = te.placeholder((n,), name="A", dtype=dtype)
-        B = te.placeholder((n,), name="B", dtype=dtype)
-
-        C = te.extern(
-            (n,),
-            [A, B],
-            lambda ins, outs: ir(ins[0], ins[1], outs[0]),
-            name="while_vectorize",
-            dtype=dtype,
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [A, B, C], target)
-
-        dev = tvm.device(target, 0)
-        a_np = np.random.uniform(size=n).astype(A.dtype)
-        b_np = np.random.uniform(size=n).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(a, b, c)
-        ref = num_iter * (a_np + b_np)
-        tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-    check_target("llvm", test_ir)
-
-
-def test_while_collatz():
-    """Test while loop + if"""
-
-    def collatz_ref(n):
-        a = n
-        i = 0
-        while a > 1:
-            if a % 2 == 1:
-                a = 3 * a + 1
-            else:
-                a = a >> 1
-            i += 1
-        return i
-
-    def collatz(ib, n, C):
-        i = ib.allocate("int32", (1,), name="i", scope="local")
-        a = ib.allocate("int32", (1,), name="a", scope="local")
-        i[0] = 0
-        a[0] = n
-        with ib.while_loop(a[0] > 1):
-            with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1):
-                a[0] = 3 * a[0] + 1
-            with ib.else_scope():
-                a[0] = a[0] >> 1
-            i[0] += 1
-
-        C[n] = i[0]
-
-    def collatz_ir_cpu(C):
-        ib = tvm.tir.ir_builder.create()
-        n = C.shape[0]
-        C = ib.buffer_ptr(C)
-
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            collatz(ib, i, C)
-
-        body = ib.get()
-
-        return body
-
-    n = 30
-
-    def check_target(target, ir):
-        C = te.extern(
-            (n,),
-            [],
-            lambda ins, outs: ir(outs[0]),
-            name="collatz",
-            dtype="int32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [C], target)
-
-        dev = tvm.device(target, 0)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(c)
-        ref = np.array([collatz_ref(i) for i in range(n)])
-        tvm.testing.assert_allclose(c.numpy(), ref)
-
-    check_target("llvm", collatz_ir_cpu)
-
-
-def test_while_mandel():
-    n = 160
-    shape = (n * 2, n)
-    t = 300
-
-    def mandel_ref():
-        def complex_sqr(z):
-            return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2])
-
-        pixels = np.zeros(shape)
-
-        for i in range(pixels.shape[0]):
-            for j in range(pixels.shape[1]):
-                c = np.array([-0.8, np.cos(t) * 0.2])
-                z = np.array([i / n - 1, j / n - 0.5]) * 2
-                iterations = 0
-
-                while np.linalg.norm(z) < 20 and iterations < 50:
-                    z = complex_sqr(z) + c
-                    iterations += 1
-
-                pixels[i, j] = 1 - iterations * 0.02
-
-        return pixels
-
-    def mandel(ib, i, j, pixels):
-        z = ib.allocate("float32", (2,), name="z", scope="local")
-        tmp = ib.allocate("float32", (1,), name="tmp", scope="local")
-        iterations = ib.allocate("int32", (1,), name="iterations", scope="local")
-
-        z[0] = (i / float(n) - 1) * 2
-        z[1] = (j / float(n) - 0.5) * 2
-        iterations[0] = 0
-        c = [-0.8, float(np.cos(t)) * 0.2]
-
-        def norm(z):
-            return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1])
-
-        with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)):
-            tmp[0] = z[0]
-            z[0] = z[0] * z[0] - z[1] * z[1] + c[0]
-            z[1] = z[1] * tmp[0] * 2 + c[1]
-            iterations[0] += 1
-
-        pixels[i, j] = 1 - iterations[0] * 0.02
-
-    def mandel_ir_cpu(C):
-        ib = tvm.tir.ir_builder.create()
-        ny = C.shape[0]
-        nx = C.shape[1]
-        C = ib.buffer_ptr(C)
-
-        with ib.for_range(0, ny, name="i", kind="parallel") as i:
-            with ib.for_range(0, nx, name="j") as j:
-                mandel(ib, i, j, C)
-
-        body = ib.get()
-
-        return body
-
-    def mandel_ir_gpu(C):
-        ib = tvm.tir.ir_builder.create()
-        ny = C.shape[0]
-        nx = C.shape[1]
-        C = ib.buffer_ptr(C)
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ty = te.thread_axis("threadIdx.y")
-
-        max_threads = 16
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads))
-        ib.scope_attr(ty, "thread_extent", max_threads)
-
-        tidx = bx * max_threads + tx
-        tidy = by * max_threads + ty
-
-        with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)):
-            mandel(ib, tidy, tidx, C)
-
-        body = ib.get()
-
-        return body
-
-    ref = mandel_ref()
-
-    def check_target(target, ir):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        C = te.extern(
-            shape,
-            [],
-            lambda ins, outs: ir(outs[0]),
-            name="mandel_ir",
-            dtype="float32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [C], target)
-
-        dev = tvm.device(target, 0)
-        c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev)
-        func(c)
-        tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
-
-    check_target("llvm", mandel_ir_cpu)
-    check_target("npvtx", mandel_ir_gpu)
-    check_target("cuda", mandel_ir_gpu)
-    check_target("vulkan", mandel_ir_gpu)
-
-
-def test_while_binary_search():
-    def binary_search(ib, n, i, Aptr, Bptr, Cptr):
-        lo = ib.allocate("int32", (1,), name="lo", scope="local")
-        hi = ib.allocate("int32", (1,), name="hi", scope="local")
-
-        lo[0] = 0
-        hi[0] = n
-        v = Bptr[i]
-
-        with ib.while_loop(lo[0] < hi[0]):
-            mid = lo[0] + (hi[0] - lo[0] >> 1)
-            with ib.if_scope(Aptr[mid] < v):
-                lo[0] = mid + 1
-            with ib.else_scope():
-                hi[0] = mid
-
-        Cptr[i] = lo[0]
-
-    def searchsorted_ir_cpu(A, B, C, n):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        with ib.for_range(0, n, name="i", kind="parallel") as i:
-            binary_search(ib, n, i, Aptr, Bptr, Cptr)
-
-        body = ib.get()
-
-        return body
-
-    def searchsorted_ir_gpu(A, B, C, n):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        bx = te.thread_axis("blockIdx.x")
-        tx = te.thread_axis("threadIdx.x")
-        max_threads = 32
-        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
-        ib.scope_attr(tx, "thread_extent", max_threads)
-        tid = bx * max_threads + tx
-
-        with ib.if_scope(tid < n):
-            binary_search(ib, n, tid, Aptr, Bptr, Cptr)
-
-        body = ib.get()
-
-        return body
-
-    n = 1024
-    dtype = "float32"
-    A = te.placeholder((n,), name="A", dtype=dtype)
-    B = te.placeholder((n,), name="B", dtype=dtype)
-
-    def check_target(target, ir):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        C = te.extern(
-            A.shape,
-            [A, B],
-            lambda ins, outs: ir(ins[0], ins[1], outs[0], n),
-            name="searchsorted_ir",
-            dtype="int32",
-        )
-        s = te.create_schedule(C.op)
-
-        with tvm.transform.PassContext(opt_level=3):
-            func = tvm.build(s, [A, B, C], target)
-
-        dev = tvm.device(target, 0)
-        a_np = np.random.uniform(size=n).astype(A.dtype)
-        b_np = np.random.uniform(size=n).astype(B.dtype)
-        a_np = np.sort(a_np)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
-        func(a, b, c)
-        ref = np.searchsorted(a_np, b_np)
-        tvm.testing.assert_allclose(c.numpy(), ref)
-
-    check_target("llvm", searchsorted_ir_cpu)
-    check_target("cuda", searchsorted_ir_gpu)
-    check_target("nvptx", searchsorted_ir_gpu)
-    check_target("vulkan", searchsorted_ir_gpu)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared():
-    n = te.size_var("n")
-    dtype = "float32"
-    A = te.placeholder((n,), name="A")
-
-    def test_device_ir(A, B):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        temp = ib.allocate(dtype, (n,), scope="shared.dyn")  # n is symbolic size
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-
-        temp[tx] = Aptr[tx]
-        depth = tvm.tir.log2(cast(n, "float32"))
-
-        with ib.for_range(0, cast(tvm.tir.ceil(depth), n.dtype)) as i:
-            ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
-            d = n >> (i + 1)
-            with ib.if_scope(tx < d):
-                temp[tx] += temp[tx + d]
-
-        Bptr[0] = temp[0]
-        return ib.get()
-
-    B = te.extern(
-        (1,),
-        [A],
-        lambda ins, outs: test_device_ir(ins[0], outs[0]),
-        name="reduce",
-        dtype=dtype,
-    )
-    s = te.create_schedule(B.op)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        freduce = tvm.build(s, [A, B], target)
-        dev = tvm.device(target, 0)
-
-        for n in [512, 1024]:
-            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-            b = tvm.nd.array(np.zeros(1, dtype=B.dtype), dev)
-            freduce(a, b)
-            tvm.testing.assert_allclose(b.numpy()[0], np.sum(a.numpy()), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-if __name__ == "__main__":
-    test_prefetch()
-    test_if()
-    test_for()
-    test_cpu()
-    test_gpu()
-    test_while_vectorize()
-    test_while_collatz()
-    test_while_mandel()
-    test_while_binary_search()
-    test_dyn_shared()
diff --git a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
index cb7151f875e3..006ebf6a1a0d 100644
--- a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
+++ b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py
@@ -569,7 +569,6 @@ def expected(a: T.handle) -> None:
 
 
 class TestAnnotatedOpaqueAccess(BaseCompactTest):
-
     is_lower_order_free = False
 
     @T.prim_func
@@ -1154,7 +1153,6 @@ def expected(
 
 
 class TestNonStrictCompactionForPaddedMatmul(BaseCompactTest):
-
     is_strict_mode = False
 
     @T.prim_func
@@ -1231,7 +1229,6 @@ def expected(
 
 
 class TestNotCompactAliasBuffer(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1251,7 +1248,6 @@ def before():
 
 
 class TestNotCompactBufferWithDifferentDtype(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1268,7 +1264,6 @@ def before():
 
 
 class TestNonBoolCondition(BaseCompactTest):
-
     # it is not testcase on block form
     is_lower_order_free = False
 
@@ -1289,15 +1284,6 @@ def expected():
                 A[i - 1] = A[i - 1] + 1
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.CompactBufferAllocation()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # CompactBufferAllocation should do nothing on TE
-
-
 class TestCompactSymbolicBound0:
     """Test symbolic bound that get compacted to constant"""
 
diff --git a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
index f920a46ba57e..63a57eeffe29 100644
--- a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
+++ b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py
@@ -74,15 +74,6 @@ def test_elementwise():
     _check(elementwise_func, substituted_elementwise_func)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.ConvertBlocksToOpaque()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # ConvertBlocksToOpaque should do nothing on TE
-
-
 class TestErrorIfPredicateUsesBlockVariables(tvm.testing.CompareBeforeAfter):
     transform = tvm.tir.transform.ConvertBlocksToOpaque()
     check_well_formed = False
diff --git a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
index 20f91b639497..b215398622cc 100644
--- a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
+++ b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py
@@ -259,19 +259,6 @@ def expected(input_A: T.Buffer(10, "bool"), input_B: T.Buffer(10, "bool")) -> No
             B[i0] = T.cast(T.cast(A[i0], "bool"), "int8")
 
 
-class TestLowerTE(BaseCompare):
-    """FlattenBuffer should do nothing on TE-based functions"""
-
-    def before(self):
-        x = te.placeholder((1,))
-        y = te.compute((1,), lambda i: x[i] + 2)
-        s = te.create_schedule(y.op)
-        mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-        return mod["main"]
-
-    expected = before
-
-
 class TestFlattenInsideBlock(BaseCompare):
     """Flattening access inside a block flattens the accessed region."""
 
diff --git a/tests/python/tir-transform/test_tir_transform_hoist_if.py b/tests/python/tir-transform/test_tir_transform_hoist_if.py
index 04f3f9771c64..6695913a3c2c 100644
--- a/tests/python/tir-transform/test_tir_transform_hoist_if.py
+++ b/tests/python/tir-transform/test_tir_transform_hoist_if.py
@@ -515,34 +515,6 @@ def test_no_hoisting_7():
     tvm.ir.assert_structural_equal(new_stmt, stmt)
 
 
-def test_hoisting_block_scope_1():
-    n = te.size_var("n")
-    m = te.size_var("m")
-    A = te.placeholder((n, m), name="A")
-    k = te.reduce_axis((0, m), "k")
-    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
-    s = te.create_schedule(B.op)
-    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
-    BF = s.rfactor(B, ki)
-    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
-    s[B.op].bind(xo, te.thread_axis("blockIdx.x"))
-    s[B.op].bind(xi, te.thread_axis("threadIdx.y"))
-    s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x"))
-    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B], "main", None)
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.RemoveNoOp()(mod)
-    stmt = mod["main"].body
-    new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    tvm.ir.assert_structural_equal(new_stmt, stmt)
-
-    with tvm.transform.PassContext(
-        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
-    ):
-        new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    assert not tvm.ir.structural_equal(new_stmt, stmt)
-
-
 def test_hoisting_block_scope_2():
     ib = tvm.tir.ir_builder.create()
     dshape = (32, 64)
@@ -617,37 +589,6 @@ def test_hoisting_block_scope_3():
     assert not tvm.ir.structural_equal(new_stmt, stmt)
 
 
-def test_hoisting_block_scope_4():
-    nn = 1024
-    n = tvm.runtime.convert(nn)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    AA = te.compute((n,), lambda *i: A(*i), name="A")
-    BB = te.compute((n,), lambda *i: B(*i), name="B")
-    T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T")
-    C = te.compute(A.shape, lambda *i: T(*i), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], factor=4)
-    xo1, xo2 = s[C].split(xo, factor=13)
-    s[C].parallel(xo2)
-    s[C].pragma(xo1, "parallel_launch_point")
-    s[C].pragma(xo2, "parallel_stride_pattern")
-    s[C].pragma(xo2, "parallel_barrier_when_finish")
-    s[C].vectorize(xi)
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None)
-    mod = tvm.tir.transform.Simplify()(mod)
-
-    stmt = mod["main"].body
-    new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    tvm.ir.assert_structural_equal(new_stmt, stmt)
-
-    with tvm.transform.PassContext(
-        config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}}
-    ):
-        new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body
-    assert not tvm.ir.structural_equal(new_stmt, stmt)
-
-
 def test_hoisting_block_scope_5():
     ib = tvm.tir.ir_builder.create()
     data = ib.pointer("float32", name="data", scope="global")
diff --git a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py b/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py
deleted file mode 100644
index aa0448c3c682..000000000000
--- a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_copy2d():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute((m, l), lambda i, j: A[i, j], name="B")
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None)
-    mod = tvm.IRModule.from_expr(func)
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        assert dst.strides[0] == l
-        assert dst.strides[1].value == 1
-        assert src.strides[0] == l
-        assert tuple(src.shape) == (m, l)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_copy_pad():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute(
-        (m + 2, l),
-        lambda i, j: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i < m + 1), A[i - 1, j], 1.0),
-        name="B",
-    )
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, 0)
-        assert pad_before[0].value == 1
-        assert pad_before[1].value == 0
-        assert pad_after[0].value == 1
-        assert pad_after[1].value == 0
-        assert pad_value.value == 1.0
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_single_point_test():
-    A = te.placeholder((1,), name="A")
-    B = te.compute((1,), lambda i: A[i], name="B")
-    s = te.create_schedule(B.op)
-    s[B].pragma(B.op.axis[0], "memcpy")
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, 0)
-        tvm.testing.assert_prim_expr_equal(dst.elem_offset, 0)
-        tvm.testing.assert_prim_expr_equal(src.strides[0], 1)
-        tvm.testing.assert_prim_expr_equal(dst.strides[0], 1)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-def test_copy_pad_split():
-    m = 4 * 3
-    A = te.placeholder((m,), name="A")
-    Apad = te.compute(
-        (m + 2,), lambda i: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i <= m), A[i - 1], 0.0), "Apad"
-    )
-    B = te.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
-    s = te.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=4)
-    s[Apad].compute_at(s[B], xo)
-    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
-
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod._move())
-    mod = tvm.tir.transform.Simplify()(mod._move())
-
-    def cb(src, dst, pad_before, pad_after, pad_value):
-        assert dst.elem_offset.value == 0
-        tvm.testing.assert_prim_expr_equal(src.elem_offset, tvm.te.max(xo * 4, 1) - 1)
-
-        rpad_before = tvm.te.max(1 - xo * 4, 0)
-        rpad_after = tvm.te.max(xo * 4 - 7, 0)
-        tvm.testing.assert_prim_expr_equal(pad_before[0], rpad_before)
-        tvm.testing.assert_prim_expr_equal(pad_after[0], rpad_after)
-        tvm.testing.assert_prim_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
-        return tvm.tir.Evaluate(0)
-
-    stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body
-
-
-if __name__ == "__main__":
-    test_copy2d()
-    test_copy_pad()
-    test_copy_pad_split()
-    test_single_point_test()
diff --git a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
index c1c8141f70a7..3d8f85bf79dd 100644
--- a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
+++ b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py
@@ -19,186 +19,9 @@
 import tvm
 import tvm.script
 from tvm import te, topi
-from tvm.driver.build_module import get_binds
 from tvm.script import tir as T
 
 
-def _tile_nd(s, tensor, tile):
-    outer_indices = []
-    inner_indices = []
-    for i, size in enumerate(tile):
-        outer, inner = s[tensor].split(tensor.op.axis[i], size)
-        outer_indices.append(outer)
-        inner_indices.append(inner)
-
-    s[tensor].reorder(*outer_indices, *inner_indices)
-    return outer_indices, inner_indices
-
-
-@tvm.tir.transform.prim_func_pass(opt_level=0)
-def remove_rolling_buffer_attr(func, mod, ctx):
-    def unwrap(node):
-        if isinstance(node, tvm.tir.AttrStmt) and node.attr_key == "rolling_buffer_scope":
-            return node.body
-        else:
-            return node
-
-    return func.with_body(
-        tvm.tir.stmt_functor.ir_transform(
-            func.body, None, postorder=unwrap, only_enable=["tir.AttrStmt"]
-        )
-    )
-
-
-@tvm.tir.transform.prim_func_pass(opt_level=0)
-def verify_no_rolling_buffer_attr(func, mod, ctx):
-    def verify(node):
-        if isinstance(node, tvm.tir.AttrStmt):
-            assert node.attr_key != "rolling_buffer_scope", "Failed to lower rolling buffers"
-
-    tvm.tir.stmt_functor.post_order_visit(func.body, verify)
-
-    return func
-
-
-def _verify_schedule(sch, inputs, output):
-    user_pass_lists = [
-        [(0, remove_rolling_buffer_attr), (0, verify_no_rolling_buffer_attr)],
-        [(0, tvm.tir.transform.InjectRollingBuffer()), (0, verify_no_rolling_buffer_attr)],
-    ]
-    built_funcs = []
-    for user_pass_list in user_pass_lists:
-        with tvm.transform.PassContext(config={"tir.add_lower_pass": user_pass_list}):
-            built_funcs.append(tvm.build(sch, inputs + [output]))
-
-    outputs = []
-    ctx = tvm.cpu(0)
-    input_data = []
-    for tensor in inputs:
-        shape = [i.value for i in tensor.shape]
-        input_data.append(
-            tvm.nd.array(np.random.randint(low=-100, high=100, size=shape).astype("int8"), ctx)
-        )
-    shape = [i.value for i in output.shape]
-    out = tvm.nd.array(np.zeros(shape, dtype="int8"), ctx)
-    for func in built_funcs:
-        func(*input_data, out)
-        outputs.append(out.numpy())
-
-    np.testing.assert_equal(outputs[0], outputs[1])
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_tile_shapes(tile_shape):
-    A = te.placeholder((1, 12, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_b.op])
-    oi, ii = _tile_nd(sch, pool_b, tile_shape)
-    sch[pool_a].compute_at(sch[pool_b], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_b)
-
-
-def test_implied_split():
-    A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_b.op])
-    n, h, w, c = pool_b.op.axis
-    oi, ii = sch[pool_b].split(w, 4)
-    sch[pool_a].compute_at(sch[pool_b], oi)
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_b)
-
-
-@pytest.mark.parametrize("kernel_shape", [(1, 1), (3, 3)])
-def test_upscale(kernel_shape):
-    output_shape = (1, 24, 24, 16)
-    input_shape = (
-        output_shape[0],
-        output_shape[1] // 2 + 2 * (kernel_shape[0] - 1),
-        output_shape[2] // 2 + 2 * (kernel_shape[1] - 1),
-        output_shape[3],
-    )
-    A = te.placeholder(input_shape, name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(
-        pool_a, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC"
-    )
-    upscale = te.compute((1, 24, 24, 16), lambda nn, hh, ww, cc: pool_b[nn, hh // 2, ww // 2, cc])
-
-    sch = tvm.te.create_schedule([upscale.op])
-    oi, ii = _tile_nd(sch, upscale, (1, 5, 5, 16))
-    sch[pool_b].compute_at(sch[upscale], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[upscale], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], upscale)
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_3_tiled_poolings(tile_shape):
-    A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, tile_shape)
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_c)
-
-
-@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)])
-def test_tiled_added_poolings(tile_shape):
-    A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8")
-    B = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(B, (5, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    add = topi.add(pool_a, pool_b)
-    pool_c = topi.nn.pool2d(add, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, tile_shape)
-    sch[add].compute_at(sch[pool_c], oi[-1])
-    sch[add].rolling_buffer()
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A, B], pool_c)
-
-
-@pytest.mark.parametrize("make_rolling", [(0, 0), (1, 0), (0, 1), (1, 1)])
-def test_mixed_buffers(make_rolling):
-    A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8")
-    pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-    pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC")
-
-    sch = tvm.te.create_schedule([pool_c.op])
-    oi, ii = _tile_nd(sch, pool_c, (1, 4, 8, 16))
-    sch[pool_b].compute_at(sch[pool_c], oi[-1])
-    if make_rolling[0]:
-        sch[pool_b].rolling_buffer()
-    sch[pool_a].compute_at(sch[pool_c], oi[-1])
-    if make_rolling[1]:
-        sch[pool_a].rolling_buffer()
-
-    _verify_schedule(sch, [A], pool_c)
-
-
 @tvm.script.ir_module
 class PreRollingBuffer:
     @T.prim_func
diff --git a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py b/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py
deleted file mode 100644
index 3078572bb508..000000000000
--- a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py
+++ /dev/null
@@ -1,608 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import tvm
-import tvm.testing
-from tvm import te, tir
-
-import pytest
-import numpy as np
-
-
-def collect_visit(stmt, f):
-    ret = []
-    tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x)))
-    return ret
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_llvm(index_a, index_b):
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name="C")
-    s = te.create_schedule(C.op)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
-    print(stmt)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd")
-    dev = tvm.device(tgt.kind.name, 0)
-    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev)
-    fadd(a, b, c)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_llvm():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd")
-    dev = tvm.device(tgt.kind.name, 0)
-    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev)
-    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev)
-    fadd(a, b, c)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b):
-    n = tvm.runtime.convert(nn)
-    a = te.placeholder((n), name="a")
-    b = te.placeholder((n), name="b")
-    c = te.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name="c")
-    s = te.create_schedule(c.op)
-    xo, xi = s[c].split(c.op.axis[0], factor=8)
-    s[c].parallel(xo)
-    s[c].vectorize(xi)
-    tgt = "llvm"
-    tgt_host = "llvm"
-    stmt = tvm.lower(s, [a, b, c], simple_mode=True)
-    tgt = tvm.target.Target(tgt, tgt_host)
-    f = tvm.build(s, [a, b, c], target=tgt, name="myaddvec")
-    dev = tvm.cpu(0)
-    n = nn
-    a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev)
-    c = tvm.nd.array(np.zeros(n, dtype=c.dtype), dev)
-    f(a, b, c)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_vectorize_llvm():
-    n = 512
-    lanes = 2
-    A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes)
-    B = te.compute((n,), lambda i: A[i], name="B")
-    C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C")
-    s = te.create_schedule(C.op)
-    xo, xi = s[C].split(C.op.axis[0], nparts=2)
-    _, xi = s[C].split(xi, factor=2)
-    s[C].parallel(xo)
-    s[C].vectorize(xi)
-    s[B].compute_at(s[C], xo)
-    xo, xi = s[B].split(B.op.axis[0], factor=2)
-    s[B].vectorize(xi)
-    # build and invoke the kernel.
-    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
-    f = tvm.build(s, [A, C], "llvm")
-    dev = tvm.cpu(0)
-    # launch the kernel.
-    a = tvm.nd.empty((n,), A.dtype).copyfrom(
-        np.random.uniform(size=[n] + ([] if lanes == 1 else [lanes]))
-    )
-    c = tvm.nd.empty((n,), C.dtype, dev)
-    f(a, c)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_loop_partition_basic_llvm():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-def test_in_bounds_const_loop_partition_ir():
-    def check_attr_stmt(x):
-        if (
-            isinstance(x, tvm.tir.AttrStmt)
-            and x.attr_key == "buffer_bound"
-            and tvm.ir.structural_equal(x.value.args, [n])
-        ):
-            return True
-        return False
-
-    def check_branch_stmt(x):
-        if isinstance(x, tvm.tir.IfThenElse):
-            return True
-        return False
-
-    def assert_bound_instrumentation(stmt, f, nums):
-        count = 0
-        for i in collect_visit(stmt, f):
-            if i is True:
-                count = count + 1
-        assert count == nums
-
-    def collect_branch_stmt(x):
-        if isinstance(x, tvm.tir.IfThenElse):
-            branch_collector.append(x)
-
-    n = tir.const(21)
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        mod = tvm.driver.lower(s, [A, B, T], name="main")
-
-    stmt = mod["main"].body
-    # after instrumentation
-    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
-    assert_bound_instrumentation(stmt, check_branch_stmt, 2)
-
-    branch_collector = list()
-    collect_visit(stmt, collect_branch_stmt)
-    assert len(branch_collector) == 2
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_const_loop_partition_llvm():
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        n = 21
-        A = te.placeholder((n,), name="A")
-        B = te.placeholder((n,), name="B")
-
-        T = te.compute((n,), lambda i: A[i] + B[i])
-        s = te.create_schedule(T.op)
-        xo, xi = s[T].split(T.op.axis[0], factor=4)
-        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-        dev = tvm.cpu(0)
-
-        f = tvm.build(s, [A, B, T], "llvm")
-        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev)
-        t = tvm.nd.empty((n,), T.dtype, dev)
-        f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-            "tir.LoopPartition": {"partition_const_loop": True},
-        }
-    ):
-        n = 21
-        A = te.placeholder((n,), name="A")
-        B = te.placeholder((n,), name="B")
-
-        T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b])
-        s = te.create_schedule(T.op)
-        xo, xi = s[T].split(T.op.axis[0], factor=4)
-        lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-        dev = tvm.cpu(0)
-
-        f = tvm.build(s, [A, B, T], "llvm")
-        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev)
-        t = tvm.nd.empty((n,), T.dtype, dev)
-        f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_conv_llvm(loop_tiling=False):
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    if loop_tiling:
-        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [data, kernel, conv], "llvm")
-    data_input = tvm.nd.array(
-        np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev
-    )
-    kernel_input = tvm.nd.array(
-        np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype(
-            "float32"
-        ),
-        dev,
-    )
-    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev)
-    f(data_input, kernel_input, conv_out)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False):
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[
-                n + data_offsets[0],
-                ic + data_offsets[1],
-                oh * HSTR + kh + data_offsets[2],
-                ow * WSTR + kw + data_offsets[3],
-            ]
-            * kernel[
-                kh + kernel_offsets[0],
-                kw + kernel_offsets[1],
-                ic + kernel_offsets[2],
-                oc + kernel_offsets[3],
-            ],
-            axis=[ic, kh, kw],
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    if loop_tiling:
-        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [data, kernel, conv], "llvm")
-    data_input = tvm.nd.array(
-        np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev
-    )
-    kernel_input = tvm.nd.array(
-        np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype(
-            "float32"
-        ),
-        dev,
-    )
-    conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev)
-    f(data_input, kernel_input, conv_out)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes1D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((k,), name="B")
-
-    T = te.compute((m,), lambda i: A[i] * B[i])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((k,), name="B")
-
-    T = te.compute((m,), lambda i: A[i] * B[i])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), dev)
-    t = tvm.nd.empty((c_shape,), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes2D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((k, k), name="B")
-
-    T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32, 32), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n), name="A")
-    B = te.placeholder((k, k), name="B")
-
-    T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), dev)
-    t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-def test_in_bounds_tensors_with_same_shapes3D_llvm():
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.placeholder((k, k, k), name="B")
-
-    T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), dev)
-    t = tvm.nd.empty((32, 32, 32), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape):
-    n = te.size_var("n")
-    k = te.size_var("k")
-    m = te.size_var("m")
-    A = te.placeholder((n, n, n), name="A")
-    B = te.placeholder((k, k, k), name="B")
-
-    T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p])
-    s = te.create_schedule(T.op)
-    lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False)
-
-    dev = tvm.cpu(0)
-
-    f = tvm.build(s, [A, B, T], "llvm")
-    a = tvm.nd.array(
-        np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), dev
-    )
-    b = tvm.nd.array(
-        np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), dev
-    )
-    t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, dev)
-    f(a, b, t)
-
-
-@tvm.testing.requires_llvm
-@pytest.mark.xfail
-def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm():
-    n = 64
-    A = te.placeholder((n,), name="A")
-    scale = te.placeholder((), name="scale")
-    k = te.reduce_axis((0, n), name="k")
-    C = te.compute((), lambda: te.sum(A[k + k + k] * scale, axis=k), name="C")
-    D = te.compute((), lambda: C + 1)
-    s = te.create_schedule(D.op)
-    stmt = tvm.lower(s, [A, scale, D], simple_mode=True)
-
-    # build and invoke the kernel.
-    f = tvm.build(s, [A, scale, D], "llvm")
-    dev = tvm.cpu(0)
-    # launch the kernel.
-    a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev)
-    sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev)
-    d = tvm.nd.empty((), D.dtype, dev)
-    f(a, sc, d)
-    d_np = np.sum(a.numpy()) * sc.numpy() + 1
-    tvm.testing.assert_allclose(d.numpy(), d_np)
-
-
-if __name__ == "__main__":
-    with tvm.transform.PassContext(
-        config={
-            "tir.instrument_bound_checkers": True,
-        }
-    ):
-        # zero scale
-        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
-        # in bound
-        test_in_bounds_llvm()
-        # upper bound
-        test_out_of_bounds_llvm(1, 0)
-        test_out_of_bounds_llvm(0, 1)
-        test_out_of_bounds_llvm(1, 1)
-        test_out_of_bounds_llvm(10000, 0)
-        test_out_of_bounds_llvm(0, 10000)
-        test_out_of_bounds_llvm(10000, 10000)
-        # lower bound
-        test_out_of_bounds_llvm(-1, 0)
-        test_out_of_bounds_llvm(0, -1)
-        test_out_of_bounds_llvm(-1, -1)
-        test_out_of_bounds_llvm(-10000, 0)
-        test_out_of_bounds_llvm(0, -10000)
-        test_out_of_bounds_llvm(-10000, -10000)
-        # vectorize in bound
-        test_in_bounds_vectorize_llvm()
-        # vectorization upper bound
-        test_out_of_bounds_vectorize_llvm(1024, 1000, 0)
-        test_out_of_bounds_vectorize_llvm(1024, 0, 10000)
-        # vectorization lower bound
-        test_out_of_bounds_vectorize_llvm(1024, -1000, 0)
-        test_out_of_bounds_vectorize_llvm(1024, 0, -10000)
-        test_in_bounds_const_loop_partition_llvm()
-        test_out_of_bounds_const_loop_partition_llvm(1, 0)
-        test_out_of_bounds_const_loop_partition_llvm(0, 1)
-        test_out_of_bounds_const_loop_partition_llvm(-1, 0)
-        test_out_of_bounds_const_loop_partition_llvm(0, -1)
-        test_in_bounds_loop_partition_basic_llvm()
-        test_out_of_bounds_loop_partition_basic_llvm(32, 0)
-        test_out_of_bounds_loop_partition_basic_llvm(0, 32)
-        test_out_of_bounds_loop_partition_basic_llvm(-32, 0)
-        test_out_of_bounds_loop_partition_basic_llvm(0, -32)
-        # conv
-        test_in_bounds_conv_llvm()
-        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0])
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1])
-        # loop tiling
-        test_in_bounds_conv_llvm(True)
-        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0], True)
-        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1], True)
-        # tensors with diff shapes basic operation such as mul
-        test_out_of_bounds_tensors_with_diff_shapes1D_llvm(32, 64, 64)
-        test_out_of_bounds_tensors_with_diff_shapes1D_llvm(64, 32, 64)
-        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([64, 64], [32, 32], [64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([32, 32], [64, 64], [64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([64, 64, 64], [32, 32, 32], [64, 64, 64])
-        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([32, 32, 32], [64, 64, 64], [64, 64, 64])
-        # check tensors with the same shapes
-        test_in_bounds_tensors_with_same_shapes1D_llvm()
-        test_in_bounds_tensors_with_same_shapes2D_llvm()
-        test_in_bounds_tensors_with_same_shapes3D_llvm()
-        # ir tests
-        test_in_bounds_const_loop_partition_ir()
diff --git a/tests/python/tir-transform/test_tir_transform_loop_partition.py b/tests/python/tir-transform/test_tir_transform_loop_partition.py
index 5f24d1666fe6..bec4129ffcbf 100644
--- a/tests/python/tir-transform/test_tir_transform_loop_partition.py
+++ b/tests/python/tir-transform/test_tir_transform_loop_partition.py
@@ -29,74 +29,6 @@ def collect_visit(stmt, f):
     return ret
 
 
-def test_basic():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([n], stmt).with_attr("global_symbol", "main"))
-    mod = tvm.tir.transform.LoopPartition()(mod)
-    stmt = tvm.tir.transform.Simplify()(mod)["main"]
-
-    assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-    assert any(collect_visit(stmt.body.body[1], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_const_loop():
-    n = 21
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_no_unroll_loop():
-    n = 21
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    xo, xi = s[T].split(T.op.axis[0], factor=4)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(
-        config={
-            "tir.LoopPartition": {
-                "partition_const_loop": True,
-                "no_unroll_loop_with_extent_one": True,
-            }
-        }
-    ):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
-        stmt = tvm.tir.transform.RemoveNoOp()(mod)["main"].body
-
-    assert sum(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.For))) == 4
-
-
 def test_multi_loop():
     ib = tvm.tir.ir_builder.create()
     m = te.size_var("m")
@@ -141,52 +73,6 @@ def test_multi_if():
     assert not any(collect_visit(stmt.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
 
 
-def test_thread_axis():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    B = te.compute((m, l), lambda i, j: A[i, j] + 3, name="B")
-    s = te.create_schedule(B.op)
-
-    s[B].set_scope("shared")
-    num_thread = 16
-    xo, xi = s[B].split(B.op.axis[0], 32)
-    xi0, xi1 = s[B].split(xi, nparts=num_thread)
-    s[B].bind(xi0, te.thread_axis("threadIdx.x"))
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    mod = tvm.tir.transform.LoopPartition()(mod)
-    stmt = tvm.tir.transform.Simplify()(mod)["main"]
-
-    assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_vectorize():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    bias = te.size_var("bias", dtype="float32")
-    scale = te.size_var("scale", dtype="float32")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name="C")
-    # schedule
-    s = te.create_schedule(C.op)
-    # create iter var and assign them tags.
-    num_thread = 32
-    bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4)
-    tx, x = s[C].split(x, nparts=num_thread)
-    _, x = s[C].split(x, factor=4)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    s[C].vectorize(x)
-    stmt = tvm.lower(s, [A, B], name="main")["main"]
-    body = stmt.body.body.body.body
-    assert x.var.name not in str(body.condition)
-    assert any(collect_visit(body.then_case, lambda x: isinstance(x, tvm.tir.Ramp)))
-
-
 def test_condition():
     ib = tvm.tir.ir_builder.create()
     m = te.size_var("m")
@@ -219,24 +105,6 @@ def test_condition_EQ():
     assert not any(collect_visit(stmt[0], lambda x: isinstance(x, tvm.tir.Select)))
 
 
-def test_thread_axis2():
-    n = tvm.runtime.convert(4096)
-    m = te.size_var("m")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    num_thread = 32
-    bx, x = s[C].split(C.op.axis[0], factor=32)
-    tx, x = s[C].split(x, nparts=num_thread)
-    _, x = s[C].split(x, factor=m)
-    s[C].bind(bx, te.thread_axis("blockIdx.x"))
-    s[C].bind(tx, te.thread_axis("threadIdx.x"))
-    stmt = tvm.lower(s, [A, B], name="main")["main"]
-    for_body = stmt.body.body.body.body[0]
-    assert "threadIdx" not in str(for_body.extent)
-
-
 def test_everything_during_deduction():
     m = te.size_var("m")
     n = te.size_var("n")
@@ -255,55 +123,6 @@ def test_everything_during_deduction():
     assert isinstance(stmt.body.body, tvm.tir.IfThenElse)
 
 
-def test_single_likely():
-    n = 60
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-
-    T = te.compute((n,), lambda i: A[i] + B[i])
-    s = te.create_schedule(T.op)
-    x = T.op.axis[0]
-    xo, xi = s[T].split(x, factor=16)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_multi_likely():
-    n = 94
-    m = 62
-    A = te.placeholder((n, m), name="A")
-    B = te.placeholder((n, m), name="B")
-
-    T = te.compute((n, m), lambda i, j: A[i, j] + B[i, j])
-    s = te.create_schedule(T.op)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    x, y = T.op.axis
-    xo, xi = s[T].split(x, factor=16)
-    yo, yi = s[T].split(y, factor=16)
-    s[T].reorder(xo, yo, xi, yi)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
 def test_oneD_pool():
     m = te.size_var("m")
     ib = tvm.tir.ir_builder.create()
@@ -415,135 +234,6 @@ def test_cce_loop_3():
     assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
 
 
-def test_conv_tiling():
-    HSTR = WSTR = 1
-    in_channel = 128
-    kernel_height = kernel_width = 3
-    out_channel = 64
-    batch_size = 1
-    in_height = in_width = 64
-    out_height = out_width = in_height - kernel_height + 1
-    data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data")
-    kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel")
-    ic = te.reduce_axis((0, in_channel), name="ic")
-    kh = te.reduce_axis((0, kernel_height), name="kh")
-    kw = te.reduce_axis((0, kernel_width), name="kw")
-    conv = te.compute(
-        (batch_size, out_channel, out_height, out_width),
-        lambda n, oc, oh, ow: te.sum(
-            data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]
-        ),
-        name="conv2d",
-    )
-    s = te.create_schedule(conv.op)
-
-    n, oc, oh, ow = conv.op.axis
-    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main"))
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        stmt = tvm.tir.transform.Simplify()(mod)["main"].body
-
-    assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-
-def test_multilevel_splitting_with_indivisble_factors():
-    from tvm import topi
-
-    A = te.placeholder((130,), dtype="float32")
-    B = topi.nn.relu(A)
-    s = te.create_schedule(B.op)
-    (y,) = s[B].op.axis
-    (yo, yi) = s[B].split(y, factor=8)
-    (yoo, yoi) = s[B].split(yo, factor=16)
-    s[B].reorder(yoo, yoi, yi)
-    s[B].unroll(yi)
-
-    ## But this does the right thing.
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        lowered_body = tvm.lower(s, [A, B], name="x")["x"].body
-
-        def visit_stmt(op):
-            return isinstance(op, tvm.tir.Max)
-
-        num_max = collect_visit(lowered_body, visit_stmt)
-        assert num_max.count(True) == 10
-
-
-def test_double_splitting_with_indivisible_factors():
-    m = 48
-    dtype = "float32"
-    A = te.placeholder((m,), name="A", dtype=dtype)
-    C = te.compute((m,), lambda i: A[i], name="C")
-    D = te.compute((m,), lambda i: C[i], name="D")
-
-    s = te.create_schedule(D.op)
-    co, ci = s[C].split(C.op.axis[0], factor=10)
-    do, di = s[D].split(D.op.axis[0], 32)
-    s[C].compute_at(s[D], do)
-
-    target = "llvm"
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False)
-        func = tvm.build(f, target=target)
-
-    top_produce = f["fadd1"].body
-    assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse)))
-
-    # check functional correctness of generated code
-    dev = tvm.device(target, 0)
-    a = tvm.nd.array(
-        numpy.ones(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    c = tvm.nd.array(
-        numpy.zeros(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    d = tvm.nd.array(
-        numpy.zeros(
-            m,
-        ).astype(dtype),
-        dev,
-    )
-    func(a, c, d)
-    tvm.testing.assert_allclose(c.numpy(), a.numpy(), rtol=1e-5)
-    tvm.testing.assert_allclose(d.numpy(), a.numpy(), rtol=1e-5)
-
-
-def test_simple_rfactor():
-    K = 16 * 4 + 4
-    k = te.reduce_axis((0, K), "k")
-
-    A = te.placeholder((1, K), name="A")
-
-    B = te.compute((1,), lambda b: te.sum(A[b, k], axis=k), name="B")
-
-    s = te.create_schedule(B.op)
-    ko, _ = s[B].split(s[B].op.reduce_axis[0], 16)
-    BF = s.rfactor(B, ko, 0)
-
-    s.normalize()
-    bounds = tvm.te.schedule.InferBound(s)
-    stmt1 = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    mod1 = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt1).with_attr("global_symbol", "main"))
-    stmt1 = tvm.tir.transform.Simplify()(mod1)["main"].body
-
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod2 = tvm.tir.transform.LoopPartition()(mod1)
-        stmt2 = tvm.tir.transform.Simplify()(mod2)["main"].body
-
-    # make sure loop partition actually did something
-    assert not tvm.ir.structural_equal(stmt1.body, stmt2.body)
-
-
 @T.prim_func
 def partitioned_concat(
     A: T.Buffer((16,), "float32"), B: T.Buffer((16,), "float32"), C: T.Buffer((32,), "float32")
@@ -555,21 +245,6 @@ def partitioned_concat(
         C[i + 16] = B[i + 16]
 
 
-def test_explicit_partition_hint():
-    A = te.placeholder((16,), name="A")
-    B = te.placeholder((16,), name="B")
-    C = te.compute((32,), lambda i: te.if_then_else(i < 16, A[i], B[i]), name="C")
-    s = te.create_schedule(C.op)
-    s.normalize()
-    s[C].pragma(s[C].op.axis[0], "loop_partition_hint", True)
-    mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None)
-    with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}):
-        mod = tvm.tir.transform.StorageFlatten(64)(mod)
-        mod = tvm.tir.transform.LoopPartition()(mod)
-        mod = tvm.tir.transform.Simplify()(mod)
-    tvm.ir.assert_structural_equal(mod["main"], partitioned_concat)
-
-
 def partition_from_scheduled_tir(prim_func, pass_cfg, do_flatten=True):
     with tvm.transform.PassContext(config=pass_cfg):
         mod = IRModule.from_expr(prim_func.with_attr("global_symbol", "main"))
diff --git a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
index 35b4d55ea51d..63700853b36a 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
@@ -1897,21 +1897,6 @@ def test_no_thread_broadcast_rewrite():
     _check(no_thread_broadcast, lowered_no_thread_broadcast)
 
 
-def test_lower_te():
-    a = te.placeholder((32, 2, 2))
-    k1 = te.reduce_axis((0, 2), "k1")
-    k2 = te.reduce_axis((0, 2), "k2")
-    b = te.compute((32,), lambda i: te.sum(a[i, k1, k2], axis=[k1, k2]))
-    s = te.create_schedule(b.op)
-    s[b].bind(k1, te.thread_axis("threadIdx.x"))
-    s[b].bind(k2, te.thread_axis("threadIdx.y"))
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b])
-    mod = tvm.tir.transform.LowerCrossThreadReduction()(orig_mod)
-    tvm.ir.assert_structural_equal(
-        mod, orig_mod
-    )  # LowerCrossThreadReduction should do nothing on TE
-
-
 def test_layer_norm_tuple_sum():
     _check(layer_norm_tuple_sum, lowered_layer_norm_tuple_sum)
 
diff --git a/tests/python/tir-transform/test_tir_transform_lower_init_block.py b/tests/python/tir-transform/test_tir_transform_lower_init_block.py
index 3ada747f6915..d05b8bc71f46 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_init_block.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_init_block.py
@@ -105,15 +105,6 @@ def test_lower_match_buffer():
     tvm.ir.assert_structural_equal(mod, BranchWithMatchBuffer, True)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.LowerInitBlock()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # LowerInitBlock should do nothing on TE
-
-
 if __name__ == "__main__":
     test_lower_reduction()
     test_lower_match_buffer()
diff --git a/tests/python/tir-transform/test_tir_transform_lower_intrin.py b/tests/python/tir-transform/test_tir_transform_lower_intrin.py
index 0764daac461a..3eb642fb51b3 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_intrin.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_intrin.py
@@ -47,9 +47,7 @@ def make_binds(i):
         return x
 
     C = te.compute((n,), make_binds)
-    s = te.create_schedule([C.op])
-
-    f = tvm.build(s, [A, B, C], "llvm")
+    f = tvm.build(te.create_prim_func([A, B, C]), "llvm")
     a = tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype))
     b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype))
     c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype))
diff --git a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
index ae44d2127595..dbaafb617aad 100644
--- a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
+++ b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py
@@ -349,15 +349,6 @@ def test_symbolic_strided_buffer():
     _check(compacted_symbolic_strided_buffer_func, transformed_symbolic_strided_buffer_func)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.LowerOpaqueBlock()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # LowerOpaqueBlock should do nothing on TE
-
-
 def test_annotated_loops():
     mod = tvm.IRModule.from_expr(annotated_loops.with_attr("global_symbol", "main"))
     mod = tvm.tir.transform.LowerOpaqueBlock()(mod)
diff --git a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py b/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py
deleted file mode 100644
index 99ccc5556585..000000000000
--- a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import pytest
-import tvm
-import tvm.testing
-from tvm import te, tir
-from tvm.contrib.nvcc import have_fp16
-
-
-def _run_passes(mod):
-    cuda_target = tvm.target.Target("cuda", host="llvm")
-    assert cuda_target.thread_warp_size == 32
-    mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", cuda_target))(mod)
-    mod = tvm.tir.transform.AnnotateDeviceRegions()(mod)
-    mod = tvm.tir.transform.SplitHostDevice()(mod)
-    mod = tvm.tir.transform.LowerWarpMemory()(mod)
-    return mod
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_local_scope():
-    m = 128
-    A = te.placeholder((m,), name="A")
-    B = te.compute((m,), lambda i: A[i] + 3, name="B")
-
-    s = te.create_schedule(B.op)
-    AA = s.cache_read(A, "warp", [B])
-    xo, xi = s[B].split(B.op.axis[0], 64)
-    xi0, xi1 = s[B].split(xi, factor=32)
-    tx = te.thread_axis("threadIdx.x")
-    s[B].bind(xi1, tx)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[AA].compute_at(s[B], xo)
-    xo, xi = s[AA].split(s[AA].op.axis[0], 32)
-    s[AA].bind(xi, tx)
-
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(s, [A, B], name="f")
-
-    mod = _run_passes(mod)
-    fdevice = mod["f_kernel"]
-
-    allocate = fdevice
-    while not isinstance(allocate, tir.Allocate):
-        allocate = allocate.body
-
-    assert allocate.buffer_var.type_annotation.storage_scope == "local"
-    assert allocate.extents[0].value == 2
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_correct_indices():
-    n = 32
-    A = te.placeholder((2, n, n), name="A", dtype="float32")
-    C = te.compute((2, n, n), lambda x, i, j: A(x, i, (j + 1) % n), name="C")
-
-    s = te.create_schedule(C.op)
-    bk_x = te.thread_axis("blockIdx.x")
-    th_y = te.thread_axis("threadIdx.y")
-    th_x = te.thread_axis("threadIdx.x")
-    B = s.cache_read(A, "warp", [C])
-    cx, ci, cj = C.op.axis
-    bx, bi, bj = B.op.axis
-    s[C].bind(cj, th_x)
-    s[C].bind(cx, bk_x)
-    s[B].compute_at(s[C], cx)
-    s[B].bind(bi, th_y)
-    s[B].bind(bj, th_x)
-
-    bounds = tvm.te.schedule.InferBound(s)
-    ir = tvm.te.schedule.ScheduleOps(s, bounds)
-    inner_func = ir.body.body.body
-    store_A_warp = inner_func.seq[0].body.body
-    indices = list(store_A_warp.indices)
-
-    # A.warp is actually many buffers, one for each warp, although they are all called A.warp
-    # 1. If we are accessing from different threads within a same warp (different
-    #    threadIdx.x), we need to distinguish between each elements using threadIdx.x,
-    #    so threadIdx.x is one if the indices.
-    # 2. If we are accessing from different warps (different threadIdx.y), we are actually
-    #    assessing different buffers, so there is no need to distinguish from elements,
-    #    and therefore threadIdx.y is NOT a index.
-    idx_names = map(lambda x: x.name, filter(lambda x: type(x) is tvm.tir.expr.Var, indices))
-    assert "threadIdx.x" in idx_names
-    assert "threadIdx.y" not in idx_names
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_end_to_end():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        m = 128
-        A = te.placeholder((m,), name="A", dtype=dtype)
-        B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name="B")
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert cuda_target.thread_warp_size == 32
-        with cuda_target:
-            s = te.create_schedule(B.op)
-            AA = s.cache_read(A, "warp", [B])
-            xo, xi = s[B].split(B.op.axis[0], 64)
-            xi0, xi1 = s[B].split(xi, factor=32)
-            tx = te.thread_axis("threadIdx.x")
-            s[B].bind(xi1, tx)
-            s[B].bind(xo, te.thread_axis("blockIdx.x"))
-            s[AA].compute_at(s[B], xo)
-            xo, xi = s[AA].split(s[AA].op.axis[0], 32)
-            s[AA].bind(xi, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], "cuda")
-            A_np = np.array(list(range(m)), dtype=dtype)
-            B_np = np.array(
-                list(range(1, 32))
-                + [0]
-                + list(range(33, 64))
-                + [32]
-                + list(range(65, 96))
-                + [64]
-                + list(range(97, 128))
-                + [96],
-                dtype=dtype,
-            )
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
-            func(A_nd, B_nd)
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_half_a_warp():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        n, m = 16, 16
-        A = te.placeholder(
-            (
-                n,
-                m,
-            ),
-            name="A",
-            dtype=dtype,
-        )
-        B = te.compute(
-            (
-                n,
-                m,
-            ),
-            lambda j, i: A[j, (i + 1) % m],
-            name="B",
-        )
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert cuda_target.thread_warp_size == 2 * m
-        with cuda_target:
-            s = te.create_schedule(B.op)
-            tx = te.thread_axis("threadIdx.x")
-            ty = te.thread_axis("threadIdx.y")
-            bx = te.thread_axis("blockIdx.x")
-
-            AA = s.cache_read(A, "warp", [B])
-            y, x = B.op.axis
-            z, y = s[B].split(y, nparts=2)
-            s[B].bind(x, tx)
-            s[B].bind(y, ty)
-            s[B].bind(z, bx)
-            s[AA].compute_at(s[B], y)
-            _, x = AA.op.axis
-            s[AA].bind(x, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], "cuda")
-            A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype)
-            B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype)
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
-            func(A_nd, B_nd)
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_cuda_2_buffers():
-    def check_cuda(dtype):
-        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
-            print("Skip because gpu does not have fp16 support")
-            return
-
-        m = 32
-        A = te.placeholder((m,), name="A", dtype=dtype)
-        B = te.placeholder((m,), name="B", dtype=dtype)
-        C = te.compute((m,), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name="C")
-
-        cuda_target = tvm.target.Target("cuda", host="llvm")
-        assert m <= cuda_target.thread_warp_size
-        with cuda_target:
-            s = te.create_schedule(C.op)
-            tx = te.thread_axis("threadIdx.x")
-            bx = te.thread_axis("blockIdx.x")
-
-            AA = s.cache_read(A, "warp", [C])
-            BB = s.cache_read(B, "warp", [C])
-            xo, xi = s[C].split(C.op.axis[0], nparts=1)
-            s[C].bind(xi, tx)
-            s[C].bind(xo, bx)
-            s[AA].compute_at(s[C], xo)
-            s[BB].compute_at(s[C], xo)
-            xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1)
-            s[AA].bind(xo, bx)
-            s[AA].bind(xi, tx)
-            xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1)
-            s[BB].bind(xo, bx)
-            s[BB].bind(xi, tx)
-
-            dev = tvm.cuda(0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B, C], "cuda")
-            AB_np = np.array(list(range(m)), dtype=dtype)
-            C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2
-            A_nd = tvm.nd.array(AB_np, dev)
-            B_nd = tvm.nd.array(AB_np, dev)
-            C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev)
-            func(A_nd, B_nd, C_nd)
-            tvm.testing.assert_allclose(C_nd.numpy(), C_np, rtol=1e-3)
-
-    check_cuda("float32")
-    check_cuda("float16")
-
-
-@tvm.testing.requires_gpu
-def test_lower_warp_memory_roundup():
-    def check(device, m):
-        A = te.placeholder((m,), name="A")
-        B = te.compute((m,), lambda i: A[i] + 1, name="B")
-
-        with tvm.target.Target(device):
-            s = te.create_schedule(B.op)
-            xo, xi = s[B].split(B.op.axis[0], factor=32)
-            tx = te.thread_axis("threadIdx.x")
-            s[B].bind(xo, te.thread_axis("blockIdx.x"))
-            s[B].bind(xi, tx)
-
-            AA = s.cache_read(A, "warp", [B])
-            _, yi = s[AA].split(s[AA].op.axis[0], factor=32)
-            s[AA].bind(yi, tx)
-            s[AA].compute_at(s[B], xo)
-
-            dev = tvm.device(device, 0)
-            # building with the CSE pass disabled as otherwise it would do some commoning
-            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-                func = tvm.build(s, [A, B], device)
-            A_np = np.random.uniform(size=(m,)).astype(A.dtype)
-            B_np = np.zeros(shape=(m,)).astype(B.dtype)
-            A_nd = tvm.nd.array(A_np, dev)
-            B_nd = tvm.nd.array(B_np, dev)
-            func(A_nd, B_nd)
-            B_np = A_np + 1
-            tvm.testing.assert_allclose(B_nd.numpy(), B_np)
-
-    for device in ["cuda", "rocm"]:
-        if not tvm.testing.device_enabled(device):
-            print("skip because", device, "is not enabled..")
-            continue
-        check(device, m=31)
-        check(device, m=32)
-        check(device, m=33)
-        check(device, m=63)
-        check(device, m=64)
-        check(device, m=65)
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_same_thread():
-    m = n = 128
-    A = te.placeholder((m, n), name="A")
-    k = te.reduce_axis((0, n), name="k")
-    B = te.compute((m,), lambda i: te.sum(A[i, k], axis=[k]))
-
-    s = te.create_schedule(B.op)
-    BB = s.cache_write(B, "warp")
-    tx = te.thread_axis("threadIdx.x")
-    xo, xi = s[B].split(B.op.axis[0], factor=32)
-    s[B].bind(xi, tx)
-    s[B].bind(xo, te.thread_axis("blockIdx.x"))
-    s[BB].compute_at(s[B], xo)
-    xo, xi = s[BB].split(s[BB].op.axis[0], factor=32)
-    s[BB].bind(xi, tx)
-
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(s, [A, B], name="f")
-
-    mod = _run_passes(mod)
-    fdevice = mod["f_kernel"]
-    assert "tvm_warp_shuffle" not in fdevice.script()
-
-
-@tvm.testing.requires_cuda
-def test_lower_warp_memory_divide_by_factor():
-    ib = tvm.tir.ir_builder.IRBuilder()
-    bx = te.thread_axis("blockIdx.x")
-    tx = te.thread_axis("threadIdx.x")
-
-    with ib.new_scope():
-        ib.scope_attr(bx, "thread_extent", 32)
-        ib.scope_attr(tx, "thread_extent", 32)
-        t = ib.allocate("float32", 16, name="t", scope="warp")
-        n = ib.allocate("float32", 16, name="n", scope="local")
-        n[0] = t[0]
-
-    stmt = ib.get()
-    func = tvm.tir.PrimFunc([], stmt)
-    func = func.with_attr("from_legacy_te_schedule", True)
-    # lowering with the CSE pass disabled as otherwise it would do some commoning
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
-        mod = tvm.lower(func, name="f")
-    with pytest.raises(tvm.error.TVMError, match="Divide by zero") as cm:
-        _run_passes(mod)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_make_packed_api.py b/tests/python/tir-transform/test_tir_transform_make_packed_api.py
index f783ab2fcef1..8605d5185d90 100644
--- a/tests/python/tir-transform/test_tir_transform_make_packed_api.py
+++ b/tests/python/tir-transform/test_tir_transform_make_packed_api.py
@@ -21,32 +21,6 @@
 import tvm.testing
 from tvm import te, tir
 from tvm.script import tir as T, ir as I
-from tvm.driver.build_module import schedule_to_module
-
-
-def test_makeapi():
-    """Not yet working, mock design"""
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
-    s = te.create_schedule(C.op)
-
-    mod = schedule_to_module(s, [n, A, B, C])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.Apply(
-        lambda f: f.with_attr(
-            {
-                "target": tvm.target.Target("llvm", host="llvm"),
-                "global_symbol": "main",
-            }
-        )
-    )(mod)
-
-    before = mod
-    after = tvm.tir.transform.MakePackedAPI()(before)
-    f = after["main"]
-    assert len(f.params) == 6
 
 
 def _find_assignment(stmt, var_name):
diff --git a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
index 9bb0aaf6e8e8..ee78dab2cbfe 100644
--- a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
+++ b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py
@@ -19,314 +19,10 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.topi.math import cast
 from tvm.script import tir as T
 
 
-def run_passes(sch, args):
-    mod = schedule_to_module(sch, args)
-    return tvm.transform.Sequential(
-        [
-            tvm.tir.transform.StorageFlatten(64),
-            tvm.tir.transform.Simplify(),
-            tvm.tir.transform.VectorizeLoop(),
-            tvm.tir.transform.StorageRewrite(),
-            tvm.tir.transform.MergeSharedMemoryAllocations(),
-        ]
-    )(mod)
-
-
-def verify_single_allocation(stmt, alloc_size=None):
-    num_alloc = [0]
-    alloc_extents = []
-
-    def verify(n):
-        if (
-            isinstance(n, tvm.tir.Allocate)
-            and n.buffer_var.type_annotation.storage_scope == "shared.dyn"
-        ):
-            num_alloc[0] += 1
-            alloc_extents.append(n.extents[0])
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-    if alloc_size:
-        assert alloc_extents[0] == alloc_size
-
-
-@tvm.testing.requires_gpu
-def test_matmul_dyn_shared():
-    n = 1024
-    block = 16
-    A = te.placeholder((n, n), name="A", dtype="float16")
-    B = te.placeholder((n, n), name="B", dtype="float16")
-
-    def syncthread():
-        return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))
-
-    def test_matmul_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ty = te.thread_axis("threadIdx.y")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", block)
-        ib.scope_attr(ty, "thread_extent", block)
-        ib.scope_attr(bx, "thread_extent", n // block)
-        ib.scope_attr(by, "thread_extent", n // block)
-
-        A_sh = ib.allocate(A.dtype, (block, block), scope="shared.dyn", name="A_sh")  # fp16
-        B_sh = ib.allocate(B.dtype, (block, block), scope="shared.dyn", name="B_sh")  # fp16
-        # Create a dynamic shared memory for the accumulation.
-        # This is for testing merging dynamic shared memory alloctions with different data type.
-        # In practice, there is no need to allocate a shared memory for C.
-        C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local")
-        C_sh = ib.allocate(C.dtype, (block, block), scope="shared.dyn", name="C_sh")  # fp32
-
-        A_ptr = ib.buffer_ptr(A)
-        B_ptr = ib.buffer_ptr(B)
-        C_ptr = ib.buffer_ptr(C)
-
-        C_local[0] = 0.0
-
-        with ib.for_range(0, n // block, name="i") as i:
-            A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx]
-            B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx]
-            ib.emit(syncthread())
-
-            with ib.for_range(0, block, name="k") as k:
-                C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32")
-            ib.emit(syncthread())
-
-        C_sh[ty, tx] = C_local[0]
-        C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx]
-
-        return ib.get()
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]),
-        name="matmul",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-    mod = run_passes(s, [A, B, C])
-    # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16
-    expected_alloc_size = block * block * 4
-    verify_single_allocation(mod["main"].body, expected_alloc_size)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fmatmul = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        size = (n, n)
-        a_np = np.random.uniform(size=size).astype(A.dtype)
-        b_np = np.random.uniform(size=size).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev)
-        fmatmul(a, b, c)
-        np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32"))
-        tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared_vectorized_store():
-    """Test vectorized store into dynamic shared memory"""
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A", dtype="float16")
-    B = te.placeholder((n,), name="B", dtype="float32")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        values_per_thread = 4
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", tvm.tir.indexdiv(n, values_per_thread))
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # fp16
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # fp32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        with ib.for_range(0, values_per_thread, kind="vectorize") as i:
-            A_sh[tx * values_per_thread + i] = Aptr[tx * values_per_thread + i]
-            B_sh[tx * values_per_thread + i] = Bptr[tx * values_per_thread + i]
-
-        with ib.for_range(0, values_per_thread) as i:
-            Cptr[tx * values_per_thread + i] = (
-                cast(A_sh[tx * values_per_thread + i], "float32") + B_sh[tx * values_per_thread + i]
-            )
-
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        for n in [512, 1024]:
-            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-            b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-            c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-            fadd(a, b, c)
-            tvm.testing.assert_allclose(
-                c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4
-            )
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_dyn_shared_reuse_and_merge():
-    n = 64
-    A = te.placeholder((n,), name="A", dtype="float32")
-    B = te.placeholder((n,), name="B", dtype="float32")
-    C = te.placeholder((te.size_var("n_dyn"),), name="C", dtype="float32")
-
-    def test_device_ir(A, B, C, D):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn", name="A_sh")
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn", name="B_sh")
-        C_sh = ib.allocate(C.dtype, (C.shape[0],), scope="shared.dyn", name="C_sh")
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-        Dptr = ib.buffer_ptr(D)
-
-        A_sh[tx] = Aptr[tx]
-        Dptr[tx] = A_sh[tx]
-
-        B_sh[tx] = Bptr[tx]
-        Dptr[tx] += B_sh[tx]
-
-        C_sh[tx] = Cptr[tx]  # C cannot reuse other buffers since it size is dynamic
-        Dptr[tx] += C_sh[tx]
-
-        return ib.get()
-
-    D = te.extern(
-        (n,),
-        [A, B, C],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], ins[2], outs[0]),
-        name="vadd",
-        dtype="float32",
-    )
-    s = te.create_schedule(D.op)
-
-    mod = run_passes(s, [A, B, C, D])
-    # merged allocation
-    # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]);
-    verify_single_allocation(mod["main"].body)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C, D], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev)
-        d = tvm.nd.array(np.zeros((n,), dtype=D.dtype), dev)
-        fadd(a, b, c, d)
-        tvm.testing.assert_allclose(d.numpy(), a.numpy() + b.numpy() + c.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
-def test_dyn_shared_more_dtype():
-    """Test vectorized store into dynamic shared memory"""
-    n = 512
-    A = te.placeholder((n,), name="A", dtype="int8")
-    B = te.placeholder((n,), name="B", dtype="int16")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # i8
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # i16
-        C_sh = ib.allocate(C.dtype, (n,), scope="shared.dyn")  # i32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        A_sh[tx] = Aptr[tx]
-        B_sh[tx] = Bptr[tx]
-
-        C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32")
-        Cptr[tx] = C_sh[tx]
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="int32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body, n * 4)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda", "nvptx"]:
-        check_target(target)
-
-
 class TestMatmul(tvm.testing.CompareBeforeAfter):
     """Shared allocations should be merged, preserving DeclBuffer if present
 
diff --git a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py
deleted file mode 100644
index be32514a720c..000000000000
--- a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import te
-from tvm.driver.build_module import schedule_to_module
-from tvm.topi.math import cast
-from tvm.script import tir as T
-
-
-def run_passes(sch, args):
-    mod = schedule_to_module(sch, args)
-    with tvm.transform.PassContext(config={"tir.merge_static_smem": True}):
-        return tvm.transform.Sequential(
-            [
-                tvm.tir.transform.StorageFlatten(64),
-                tvm.tir.transform.Simplify(),
-                tvm.tir.transform.VectorizeLoop(),
-                tvm.tir.transform.StorageRewrite(),
-                tvm.tir.transform.MergeSharedMemoryAllocations(),
-            ]
-        )(mod)
-
-
-def verify_single_allocation(stmt, alloc_size=None):
-    num_alloc = [0]
-    alloc_extents = []
-
-    def verify(n):
-        if (
-            isinstance(n, tvm.tir.Allocate)
-            and n.buffer_var.type_annotation.storage_scope == "shared"
-        ):
-            num_alloc[0] += 1
-            alloc_extents.append(n.extents[0])
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-    if alloc_size:
-        assert alloc_extents[0] == alloc_size
-
-
-@tvm.testing.requires_gpu
-def test_matmul_shared():
-    n = 1024
-    block = 16
-    A = te.placeholder((n, n), name="A", dtype="float16")
-    B = te.placeholder((n, n), name="B", dtype="float16")
-
-    def syncthread():
-        return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))
-
-    def test_matmul_ir(A, B, C):
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ty = te.thread_axis("threadIdx.y")
-        bx = te.thread_axis("blockIdx.x")
-        by = te.thread_axis("blockIdx.y")
-        ib.scope_attr(tx, "thread_extent", block)
-        ib.scope_attr(ty, "thread_extent", block)
-        ib.scope_attr(bx, "thread_extent", n // block)
-        ib.scope_attr(by, "thread_extent", n // block)
-
-        A_sh = ib.allocate(A.dtype, (block, block), scope="shared", name="A_sh")  # fp16
-        B_sh = ib.allocate(B.dtype, (block, block), scope="shared", name="B_sh")  # fp16
-        # Create a shared memory for the accumulation.
-        # This is for testing merging shared memory alloctions with different data type.
-        # In practice, there is no need to allocate a shared memory for C.
-        C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local")
-        C_sh = ib.allocate(C.dtype, (block, block), scope="shared", name="C_sh")  # fp32
-
-        A_ptr = ib.buffer_ptr(A)
-        B_ptr = ib.buffer_ptr(B)
-        C_ptr = ib.buffer_ptr(C)
-
-        C_local[0] = 0.0
-
-        with ib.for_range(0, n // block, name="i") as i:
-            A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx]
-            B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx]
-            ib.emit(syncthread())
-
-            with ib.for_range(0, block, name="k") as k:
-                C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32")
-            ib.emit(syncthread())
-
-        C_sh[ty, tx] = C_local[0]
-        C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx]
-
-        return ib.get()
-
-    C = te.extern(
-        A.shape,
-        [A, B],
-        lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]),
-        name="matmul",
-        dtype="float32",
-    )
-    s = te.create_schedule(C.op)
-    mod = run_passes(s, [A, B, C])
-    # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16
-    expected_alloc_size = block * block * 4
-    verify_single_allocation(mod["main"].body, expected_alloc_size)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fmatmul = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        size = (n, n)
-        a_np = np.random.uniform(size=size).astype(A.dtype)
-        b_np = np.random.uniform(size=size).astype(B.dtype)
-        a = tvm.nd.array(a_np, dev)
-        b = tvm.nd.array(b_np, dev)
-        c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev)
-        fmatmul(a, b, c)
-        np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32"))
-        tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-@tvm.testing.requires_gpu
-def test_shared_more_dtype():
-    """Test vectorized store into shared memory"""
-    n = 512
-    A = te.placeholder((n,), name="A", dtype="int8")
-    B = te.placeholder((n,), name="B", dtype="int16")
-
-    def test_device_ir(A, B, C):
-        n = A.shape[0]
-        ib = tvm.tir.ir_builder.create()
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", n)
-
-        A_sh = ib.allocate(A.dtype, (n,), scope="shared")  # i8
-        B_sh = ib.allocate(B.dtype, (n,), scope="shared")  # i16
-        C_sh = ib.allocate(C.dtype, (n,), scope="shared")  # i32
-
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-        Cptr = ib.buffer_ptr(C)
-
-        A_sh[tx] = Aptr[tx]
-        B_sh[tx] = Bptr[tx]
-
-        C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32")
-        Cptr[tx] = C_sh[tx]
-        return ib.get()
-
-    C = te.extern(
-        (n,),
-        [A, B],
-        lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
-        name="vadd",
-        dtype="int32",
-    )
-    s = te.create_schedule(C.op)
-
-    mod = run_passes(s, [A, B, C])
-    verify_single_allocation(mod["main"].body, n * 4)
-
-    def check_target(target):
-        if not tvm.testing.device_enabled(target):
-            return
-
-        fadd = tvm.build(s, [A, B, C], target)
-        dev = tvm.device(target, 0)
-
-        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
-        c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev)
-        fadd(a, b, c)
-        tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4)
-
-    for target in ["cuda"]:
-        check_target(target)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
index 5ebdbe986082..93c680c846c5 100644
--- a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
+++ b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py
@@ -16,7 +16,6 @@
 # under the License.
 import tvm
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 from tvm.tir import const
 import tvm.testing
@@ -163,27 +162,6 @@ def check(m, lanes, target_bits, target_dtype):
     check(const(2**16, dtype="int32"), 2, target_bits=16, target_dtype="int32")
 
 
-def test_reduce():
-    def check(m, target_bits, target_dtype):
-        A = te.placeholder((m,), name="A", dtype="float32")
-        k = te.reduce_axis((0, m), "k")
-        B = te.compute((), lambda *idx: te.sum(A[k], axis=k), name="B")
-        s = te.create_schedule(B.op)
-        stmt = lower_sch(s, [A, B], target_bits)
-        assert stmt[1].loop_var.dtype == target_dtype
-
-    # i32 -> i32
-    check(const(64, dtype="int32"), 32, "int32")
-    # i64 -> i32
-    check(const(64, dtype="int64"), 32, "int32")
-    # i32 -> i16
-    check(const(64, dtype="int32"), 16, "int16")
-    check(const(2**16, dtype="int32"), 16, "int32")
-    # symbolic
-    check(te.var("n", dtype="int32"), 32, "int32")
-    check(te.var("n", dtype="int64"), 32, "int64")
-
-
 def test_slice():
     def check(m, n, target_bits, target_dtype):
         # The index may overflow in B, while not in A
@@ -208,25 +186,6 @@ def check(m, n, target_bits, target_dtype):
     )
 
 
-def test_ramp_dtype_consistency():
-    """
-    for (i :int64, (int64)0, (int64)4) {
-        A[ramp(i*(int64)2, (int64)1, 2)] = cast(int64, 2 ** 31 - 1) * i;
-    }
-    The infer result:
-        base:   int64 -> int64 (since i is involved in another int64 expr)
-        stride: int64 -> int32
-
-    Thus ramp should still use int64 for both stride and base after rewrite.
-    """
-    n = tvm.tir.IntImm("int64", 4)
-    m = tvm.tir.IntImm("int64", 2)
-    A = te.compute((n, m), lambda i, j: tvm.tir.Cast("int64", 2**31 - 1) * i, name="A")
-    s = te.create_schedule(A.op)
-    s[A].vectorize(A.op.axis[1])
-    lower_sch(s, [A], 32, extra_passes=[tvm.tir.transform.VectorizeLoop()])
-
-
 def test_condition():
     @T.prim_func
     def before(A: T.Buffer((128,), "float32"), B: T.Buffer((130,), "float32")):
diff --git a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
index 1a1e780a7272..8500f114610c 100644
--- a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
+++ b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py
@@ -236,17 +236,6 @@ def test_opaque_access():
     _check(opaque_access, transformed_opaque_access)
 
 
-def test_lower_te():
-    x = te.placeholder((1,))
-    y = te.compute((1,), lambda i: x[i] + 2)
-    s = te.create_schedule(y.op)
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y])
-    mod = tvm.tir.transform.PlanAndUpdateBufferAllocationLocation()(orig_mod)
-    tvm.ir.assert_structural_equal(
-        mod, orig_mod
-    )  # PlanAndUpdateBufferAllocationLocation should do nothing on TE
-
-
 def test_loop_carried_dependency():
     """The buffer allocation should be above opaque iter var's loop scopes
     such that buffer accesses with loop carried dependencies are covered,
diff --git a/tests/python/tir-transform/test_tir_transform_simplify.py b/tests/python/tir-transform/test_tir_transform_simplify.py
index 0b2d5f16d833..bbd69d01cbb4 100644
--- a/tests/python/tir-transform/test_tir_transform_simplify.py
+++ b/tests/python/tir-transform/test_tir_transform_simplify.py
@@ -73,69 +73,6 @@ def test_if_likely():
     assert not isinstance(body.body.body.then_case, tvm.tir.IfThenElse)
 
 
-def test_basic_likely_elimination():
-    n = te.size_var("n")
-    X = te.placeholder(shape=(n,), name="x")
-    W = te.placeholder(shape=(n + 1,), dtype="int32", name="w")
-
-    def f(i):
-        start = W[i]
-        extent = W[i + 1] - W[i]
-        rv = te.reduce_axis((0, extent))
-        return te.sum(X[rv + start], axis=rv)
-
-    Y = te.compute(X.shape, f, name="y")
-    s = te.create_schedule([Y.op])
-    stmt = tvm.lower(s, [X, W, Y], simple_mode=True)
-    assert "if" not in str(stmt)
-
-
-def test_complex_likely_elimination():
-    def cumsum(X):
-        """
-        Y[i] = sum(X[:i])
-        """
-        (m,) = X.shape
-        s_state = te.placeholder((m + 1,), dtype="int32", name="state")
-        s_init = te.compute((1,), lambda _: tvm.tir.const(0, "int32"))
-        s_update = te.compute((m + 1,), lambda l: s_state[l - 1] + X[l - 1])
-        return tvm.te.scan(s_init, s_update, s_state, inputs=[X], name="cumsum")
-
-    def sparse_lengths_sum(data, indices, lengths):
-        oshape = list(data.shape)
-        oshape[0] = lengths.shape[0]
-        length_offsets = cumsum(lengths)
-
-        def sls(n, d):
-            gg = te.reduce_axis((0, lengths[n]))
-            indices_idx = length_offsets[n] + gg
-            data_idx = indices[indices_idx]
-            data_val = data[data_idx, d]
-            return te.sum(data_val, axis=gg)
-
-        return te.compute(oshape, sls)
-
-    m, n, d, i, l = (
-        te.size_var("m"),
-        te.size_var("n"),
-        te.size_var("d"),
-        te.size_var("i"),
-        te.size_var("l"),
-    )
-    data_ph = te.placeholder((m, d * 32), name="data")
-    indices_ph = te.placeholder((i,), name="indices", dtype="int32")
-    lengths_ph = te.placeholder((n,), name="lengths", dtype="int32")
-    Y = sparse_lengths_sum(data_ph, indices_ph, lengths_ph)
-    s = te.create_schedule([Y.op])
-    (n, d) = s[Y].op.axis
-    (do, di) = s[Y].split(d, factor=32)
-    (gg,) = s[Y].op.reduce_axis
-    s[Y].reorder(n, do, gg, di)
-    s[Y].vectorize(di)
-    stmt = tvm.lower(s, [data_ph, indices_ph, lengths_ph, Y], simple_mode=True)
-    assert "if" not in str(stmt)
-
-
 class BaseBeforeAfter(tvm.testing.CompareBeforeAfter):
     transitively_prove_inequalities = False
     convert_boolean_to_and_of_ors = False
@@ -668,7 +605,6 @@ def expected(self, test_case):
         priors = analyzer.canonical_simplify(priors)
 
         if provable:
-
             # well formed checker complains of undefined variables in condition
             @T.prim_func(check_well_formed=False)
             def func(A: T.Buffer(1, "bool")):
diff --git a/tests/python/tir-transform/test_tir_transform_split_host_device.py b/tests/python/tir-transform/test_tir_transform_split_host_device.py
index 2d0d8a68d83e..a7ea6d8cdd46 100644
--- a/tests/python/tir-transform/test_tir_transform_split_host_device.py
+++ b/tests/python/tir-transform/test_tir_transform_split_host_device.py
@@ -21,45 +21,6 @@
 from tvm.script import tir as T
 
 
-@tvm.testing.requires_cuda
-def test_split_host_device_func_attr():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
-    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
-    s[A1].compute_at(s[A2], xo)
-    s[A1].set_scope("shared")
-
-    mod = tvm.lower(s, [A, A2])
-
-    cuda_target = tvm.target.Target("cuda", host="llvm")
-    mod = tvm.tir.transform.Apply(
-        lambda f: f.with_attr({"global_symbol": "test", "target": cuda_target})
-    )(mod)
-
-    mod = tvm.ir.transform.Sequential(
-        [
-            tvm.tir.transform.AnnotateDeviceRegions(),
-            tvm.tir.transform.SplitHostDevice(),
-            tvm.tir.transform.MakePackedAPI(),
-            tvm.tir.transform.LowerDeviceKernelLaunch(),
-        ]
-    )(mod)
-
-    fdevice = mod["test_kernel"]
-
-    assert fdevice.attrs["global_symbol"] == "test_kernel"
-    assert fdevice.attrs["calling_conv"].value == 2
-    assert str(fdevice.attrs["target"]) == str(tvm.target.Target("cuda"))
-    assert fdevice.attrs["tir.is_global_func"].value
-
-
 def test_ssa_across_entire_module():
     """The host and device functions should not share TIR vars
 
diff --git a/tests/python/tir-transform/test_tir_transform_storage_flatten.py b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
index 4a81ab93c763..2c97cc53af67 100644
--- a/tests/python/tir-transform/test_tir_transform_storage_flatten.py
+++ b/tests/python/tir-transform/test_tir_transform_storage_flatten.py
@@ -17,72 +17,9 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 
 
-def test_flatten2():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], 8)
-    s[A1].compute_at(s[A2], xo)
-    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="A")
-    A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name="A2")
-
-    mod = schedule_to_module(s, [Ab, A2b], binds={A: Ab, A2: A2b})
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-
-def test_flatten_prefetch():
-    A = te.placeholder((25, 100, 4), name="A")
-    _A = tvm.tir.decl_buffer(A.shape, A.dtype, name="A")
-    i = te.size_var("i")
-    j = te.size_var("j")
-    region = [tvm.ir.Range.from_min_extent(i[0], i[1]) for i in [(i, 2), (j, 8), (0, 4)]]
-    stmt = tvm.tir.Prefetch(_A, region)
-
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([_A], stmt, {A: _A})
-
-    mod = tvm.IRModule.from_expr(func)
-    mod = tvm.transform.Sequential(
-        [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()]
-    )(mod)
-    stmt = mod["main"].body
-    assert stmt.extent.value == 2
-    assert isinstance(stmt.body, tvm.tir.For)
-    assert stmt.body.extent.value == 2
-
-    def assert_flat_loads(stmt):
-        if isinstance(stmt, tvm.tir.BufferLoad):
-            assert len(stmt.indices) == 1, "All prefetch indices should be flattened"
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, assert_flat_loads)
-
-
-def test_flatten_storage_align():
-    m = 8
-    l = 16
-    A = te.placeholder((m, l), name="A")
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    s[A1].storage_align(A1.op.axis[0], 2, 1)
-
-    mod = schedule_to_module(s, [A, A2])
-    mod = tvm.transform.Sequential(
-        [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()]
-    )(mod)
-
-    stmt = mod["main"].body
-    assert stmt.extents[0].value == 17 * 8
-
-
 def test_flatten_double_buffer():
     @tvm.script.ir_module
     class ModFromScript:
diff --git a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
index 68149e7d64bb..ab91c6c7b330 100644
--- a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
+++ b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py
@@ -21,39 +21,9 @@
 import tvm
 import tvm.testing
 from tvm import te
-from tvm.driver.build_module import schedule_to_module
 from tvm.script import tir as T
 
 
-def test_storage_share():
-    m = te.var("m")
-    l = te.var("l")
-    A = te.placeholder((m, l), name="A")
-    num_stage = 5
-    B = A
-    for t in range(num_stage):
-        B = te.compute((m, l), lambda i, j: B[i, j] + (t + 1), name="A%d" % t)
-
-    s = te.create_schedule(B.op)
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
 def register_mem(scope_tb, max_bits):
     # Register mem
     @tvm.register_func("tvm.info.mem.%s" % scope_tb)
@@ -163,103 +133,6 @@ def verify(n):
     dtype_test(dtype_list, length)
 
 
-def test_inplace_rule():
-    m = 10
-    A = te.placeholder((m,), name="A")
-    A0 = te.compute((m,), lambda i: A[i], name="A0")
-    A1 = te.compute((m,), lambda i: A[i] + 1, name="A1")
-    AA = te.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name="AA")
-    B = te.compute((m,), lambda i: AA[i] + 1, name="B")
-    s = te.create_schedule(B.op)
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 2
-
-
-def test_storage_combine():
-    n = 8
-    A = te.placeholder((4,), name="A")
-    num_stage = 5
-    B = A
-    stages = []
-    for t in range(num_stage):
-        B = te.compute((n,), lambda i: B[i] + B[0] + (t + 1), name="A%d" % t)
-        stages.append(B)
-
-    s = te.create_schedule(B.op)
-    for S in stages[:-1]:
-        s[S].set_scope("global:tag")
-
-    mod = schedule_to_module(s, [A, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-            assert n.extents[0].value == 16
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
-def test_storage_combine_with_vectorization():
-    n = 1024
-    A = te.placeholder((n,), name="A")
-    B = te.placeholder((n,), name="B")
-    C = te.compute((n,), lambda i: A[i] + B[i], name="C")
-    s = te.create_schedule(C.op)
-    AA = s.cache_read(A, "global:tag", readers=[C])
-    BB = s.cache_read(B, "global:tag", readers=[C])
-    CC = s.cache_write(C, "global:tag")
-    s[CC].vectorize(s[CC].op.axis[0])
-    mod = schedule_to_module(s, [A, B, C])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.VectorizeLoop()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    mod = tvm.tir.transform.Simplify()(mod)
-    stmt = mod["main"].body
-    num_alloc = [0]
-
-    def verify(v):
-        # find add op
-        if (
-            isinstance(v, tvm.tir.Add)
-            and isinstance(v.a, tvm.tir.BufferLoad)
-            and isinstance(v.b, tvm.tir.BufferLoad)
-        ):
-            lhs_ramp = v.a.indices[0]
-            rhs_ramp = v.b.indices[0]
-            # these two ramp load should not overlap
-            assert lhs_ramp.lanes == n
-            assert rhs_ramp.lanes == n
-            assert lhs_ramp.base >= rhs_ramp.base + n or rhs_ramp.base >= lhs_ramp.base + n
-        elif isinstance(v, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 1
-
-
 def test_address_of():
     # In this test, the storage rewrite pass is allowed to
     # combine buffers B and D, but not C
@@ -313,40 +186,6 @@ def verify(n):
     assert total_alloc[0] == 16
 
 
-def test_storage_share_gpu():
-    m = te.var("m")
-    A = [te.placeholder((m), name="A")]
-    num_stage = 5
-    for t in range(num_stage):
-        A.append(te.compute((m,), lambda i: A[-1][i] + (t + 1), name="A%d_s" % t))
-        A.append(te.compute((m,), lambda i: A[-1][i], name="A%d" % t))
-    s = te.create_schedule(A[-1].op)
-    for t in range(num_stage):
-        x = A[2 * t + 2].op.axis[0]
-        bx, tx = s[A[2 * t + 2]].split(x, factor=32)
-        s[A[2 * t + 2]].bind(bx, te.thread_axis("blockIdx.x"))
-        s[A[2 * t + 2]].bind(tx, te.thread_axis("threadIdx.x"))
-        s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx)
-        s[A[2 * t + 1]].set_scope("shared")
-
-    mod = schedule_to_module(s, [A[0], A[-1]])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    alloc_stats = {"global": 0, "shared": 0}
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            scope = n.buffer_var.type_annotation.storage_scope
-            alloc_stats[scope] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert alloc_stats["global"] == 2
-    assert alloc_stats["shared"] == num_stage
-
-
 def test_parallel_alloc():
     ib = tvm.tir.ir_builder.create()
     n = te.var("n")
@@ -443,125 +282,6 @@ def get_mod(kind="serial"):
     assert isinstance(body.body.body, tvm.tir.Allocate)  # A
 
 
-def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024):
-    # Test Buffer
-    register_mem(scope_tb, max_bits)
-    m = 10
-    A = te.placeholder((m,), name="A")
-    C = te.placeholder((m,), name="C")
-    D = te.placeholder((m,), name="D")
-    A0 = te.compute((m,), lambda i: A[i] + C[i], name="A0")
-    A1 = te.compute((m,), lambda i: D[i] * D[i], name="A1")
-    A2 = te.compute((m,), lambda i: A0[i] + A1[i], name="A2")
-    B = te.compute((m,), lambda i: A2[i], name="B")
-    s = te.create_schedule(B.op)
-    A0L = s.cache_read(A0, scope_tb, [A2])
-    A1L = s.cache_read(A1, scope_tb, [A2])
-    A2L = s.cache_read(A2, scope_tb, [B])
-    mod = schedule_to_module(s, [A, B, C, D])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    num_alloc = [0]
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            num_alloc[0] += 1
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-    assert num_alloc[0] == 2
-
-
-def test_exceed_mem():
-    max_bits = 639
-    # The critical max_num_bits is between 639 and 640
-    loc = -1
-    try:
-        test_inplace_rule2("local_TEM", max_bits)
-    except Exception as e:
-        estr = str(e)
-        loc = estr.find("Allocation exceed bound of memory")
-        assert loc != -1
-
-
-def test_inplace_rule3():
-    # Test Buffer
-    scope_tb = "local_TB3"
-    max_bits = 1024 * 1024 * 1024
-
-    register_mem(scope_tb, max_bits)
-    m = 10
-    B0 = te.placeholder((m,), name="B0")
-    B1 = te.placeholder((m,), name="B1")
-    B2 = te.placeholder((m,), name="B2")
-    B3 = te.placeholder((m,), name="B3")
-    B4 = te.placeholder((m,), name="B4")
-    B5 = te.placeholder((m,), name="B5")
-
-    B6 = te.compute((m,), lambda i: B1[i] * B5[i], name="B6")
-    B7 = te.compute((m,), lambda i: B2[i] * B4[i], name="B7")
-    B8 = te.compute((m,), lambda i: B6[i] - B7[i], name="B8")
-
-    B9 = te.compute((m,), lambda i: B2[i] * B3[i], name="B9")
-    B10 = te.compute((m,), lambda i: B0[i] * B5[i], name="B10")
-    B11 = te.compute((m,), lambda i: B9[i] - B10[i], name="B11")
-
-    B12 = te.compute((m,), lambda i: B0[i] * B4[i], name="B12")
-    B13 = te.compute((m,), lambda i: B1[i] * B3[i], name="B13")
-    B14 = te.compute((m,), lambda i: B12[i] - B13[i], name="B14")
-
-    B = te.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name="B")
-    s = te.create_schedule(B.op)
-
-    B1L = s.cache_read(B1, scope_tb, [B6, B13])
-    B5L = s.cache_read(B5, scope_tb, [B6, B10])
-    B2L = s.cache_read(B2, scope_tb, [B7, B9])
-    B4L = s.cache_read(B4, scope_tb, [B7, B12])
-    B3L = s.cache_read(B3, scope_tb, [B9, B13])
-    B0L = s.cache_read(B0, scope_tb, [B10, B12])
-
-    B8L = s.cache_write(B8, scope_tb)
-    B11L = s.cache_write(B11, scope_tb)
-    B14L = s.cache_write(B14, scope_tb)
-    B6L = s.cache_write(B6, scope_tb)
-    B7L = s.cache_write(B7, scope_tb)
-    B9L = s.cache_write(B9, scope_tb)
-    B10L = s.cache_write(B10, scope_tb)
-    B12L = s.cache_write(B12, scope_tb)
-    B13L = s.cache_write(B13, scope_tb)
-
-    s[B12].compute_inline()
-    s[B13].compute_inline()
-    s[B8].compute_inline()
-    s[B11].compute_inline()
-    s[B14].compute_inline()
-    s[B6].compute_inline()
-    s[B7].compute_inline()
-    s[B9].compute_inline()
-    s[B10].compute_inline()
-
-    s = s.normalize()
-    mod = schedule_to_module(s, [B0, B1, B2, B3, B4, B5, B])
-    mod = tvm.tir.transform.StorageFlatten(64)(mod)
-
-    mod = tvm.tir.transform.Simplify()(mod)
-    mod = tvm.tir.transform.StorageRewrite()(mod)
-    stmt = mod["main"].body
-
-    # verify only have one allocations.
-    # verify inplace folding works
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            assert n.extents[0].value == 70
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-
-
 def test_alloc_seq_type():
     ib = tvm.tir.ir_builder.create()
     n = te.var("n")
@@ -665,46 +385,6 @@ def verify(n):
     assert num_alloc[0] == 1
 
 
-def test_replace_dataflow():
-    shape = (255,)
-    A = te.placeholder(shape, name="A")
-    B = te.compute(shape, lambda i: A[i] + A[i], name="B")
-    C = te.compute(shape, lambda i: A[i] + B[i], name="C")
-    D = te.compute(shape, lambda i: A[i] + C[i], name="D")
-    E = te.compute(shape, lambda i: A[i] + D[i], name="E")
-
-    s = te.create_schedule(E.op)
-    s.cache_read(A, "local", [B, C, D, E])
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-
-
-def test_large_input():
-    @te.hybrid.script
-    def compute(a, b):
-        n = 16384
-        c = output_tensor((n, n), "int32")
-        for i in range(n):
-            for j in range(n):
-                c[i, j] = a[i, j] - b[i, j]
-        return c
-
-    n = 16384
-    shape = (n, n)
-    a = te.placeholder(shape, name="a", dtype="int32")
-    b = te.placeholder(shape, name="b", dtype="int32")
-    c = te.compute(shape, lambda i, j: compute(a, b)[i, j])
-    c = te.compute(shape, lambda i, j: 1 + c[i, j])
-    s = te.create_schedule(c.op)
-    stmt = tvm.lower(s, [a, b, c])["main"].body
-
-    def verify(n):
-        if isinstance(n, tvm.tir.Allocate):
-            assert n.extents[0].value == 268435456
-
-    tvm.tir.stmt_functor.post_order_visit(stmt, verify)
-
-
 def test_access_in_let_value():
     @T.prim_func
     def func(A: T.Buffer((8,), "float32")):
diff --git a/tests/python/tir-transform/test_tir_transform_thread_sync.py b/tests/python/tir-transform/test_tir_transform_thread_sync.py
index 5c43d8d96aa1..4ca33424c1d5 100644
--- a/tests/python/tir-transform/test_tir_transform_thread_sync.py
+++ b/tests/python/tir-transform/test_tir_transform_thread_sync.py
@@ -35,67 +35,6 @@ def run_passes(func: tvm.tir.PrimFunc):
     return tvm.tir.transform.ThreadSync("shared")(mod)
 
 
-@tvm.testing.requires_cuda
-def test_thread_storage_sync():
-    m = te.size_var("m")
-    l = te.size_var("l")
-    A = te.placeholder((m, l), name="A")
-
-    A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
-    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
-
-    s = te.create_schedule(A2.op)
-    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
-    s[A2].bind(xo, te.thread_axis("blockIdx.x"))
-    s[A1].compute_at(s[A2], xo)
-    s[A1].set_scope("shared")
-
-    bounds = tvm.te.schedule.InferBound(s)
-    assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
-
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None)
-    mod = run_passes(func)
-    f = mod["test_kernel"]
-    body_list = tvm.tir.stmt_list(f.body.body.body.body.body.body)
-    assert body_list[1].value.op.same_as(tvm.ir.Op.get("tir.tvm_storage_sync"))
-
-
-@tvm.testing.requires_cuda
-def test_sync_else_branch():
-    def ir(A, B):
-        ib = tvm.tir.ir_builder.create()
-        Aptr = ib.buffer_ptr(A)
-        Bptr = ib.buffer_ptr(B)
-
-        tx = te.thread_axis("threadIdx.x")
-        ib.scope_attr(tx, "thread_extent", 1)
-
-        local = ib.allocate(A.dtype, (8,), name="buf_local", scope="local")
-        shared = ib.allocate(A.dtype, (8,), name="buf_shared", scope="shared")
-
-        with ib.for_range(0, 8) as i:
-            with ib.if_scope(Aptr[i] < 0):
-                local[i] = Aptr[i]
-            with ib.else_scope():
-                shared[i] = Aptr[i]
-
-        with ib.for_range(0, 8) as i:
-            with ib.if_scope(Aptr[i] < 0):
-                Bptr[i] = local[i]
-            with ib.else_scope():
-                Bptr[i] = shared[i]
-
-        return ib.get()
-
-    A = tvm.tir.decl_buffer((8,), "float32")
-    B = tvm.tir.decl_buffer((8,), "float32")
-    stmt = ir(A, B)
-    func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None)
-    mod = run_passes(func)
-    assert "T.tvm_storage_sync" in str(mod)
-
-
 @tvm.testing.requires_cuda
 def test_sync_read_thread_id_independent_location():
     @T.prim_func
diff --git a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
index 9ee86433128d..a419dc3f9976 100644
--- a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
+++ b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py
@@ -313,16 +313,5 @@ def unified_inner_binding_with_annotation(
     _check(inner_binding_with_annotation, unified_inner_binding_with_annotation)
 
 
-def test_lower_te():
-    a = te.placeholder((32, 2, 2))
-    b = te.compute((32, 2, 2), lambda i, j, k: a[i, j, k] * 2.0)
-    s = te.create_schedule(b.op)
-    s[b].bind(b.op.axis[1], te.thread_axis("threadIdx.x"))
-    s[b].bind(b.op.axis[2], te.thread_axis("threadIdx.x"))
-    orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b])
-    mod = tvm.tir.transform.UnifyThreadBinding()(orig_mod)
-    tvm.ir.assert_structural_equal(mod, orig_mod)  # UnifyThreadBinding should do nothing on TE
-
-
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/tir-transform/test_tir_transform_unroll_loop.py b/tests/python/tir-transform/test_tir_transform_unroll_loop.py
index a05a085eeb64..37dc64a9e79c 100644
--- a/tests/python/tir-transform/test_tir_transform_unroll_loop.py
+++ b/tests/python/tir-transform/test_tir_transform_unroll_loop.py
@@ -94,23 +94,6 @@ def test_unroll_fake_loop():
         assert isinstance(ret[0], tvm.tir.BufferStore)
 
 
-def test_unroll_single_count_loops():
-    n = te.size_var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute((n,), lambda *i: A(*i), name="B")
-    s = te.create_schedule(B.op)
-    s = s.normalize()
-    dom_map = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-    # all parameters to UnrolLoops are default values except for
-    # auto_unroll_max_extent which has been set to 1 (default:0)
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
-
-    with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 1}}):
-        ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
-        assert ret == stmt
-
-
 def test_unroll_allocations():
     @tvm.script.ir_module
     class before:
@@ -179,5 +162,4 @@ def main(B: T.Buffer((64,), "float32")):
     test_unroll_local_access()
     test_unroll_loop()
     test_unroll_fake_loop()
-    test_unroll_single_count_loops()
     test_unroll_allocations()
diff --git a/tests/python/tir-transform/test_tir_transform_vectorize.py b/tests/python/tir-transform/test_tir_transform_vectorize.py
index 9659d896aed8..c5569c829ad5 100644
--- a/tests/python/tir-transform/test_tir_transform_vectorize.py
+++ b/tests/python/tir-transform/test_tir_transform_vectorize.py
@@ -197,16 +197,6 @@ def main(a: T.handle, n: T.int32, x: T.int32):
         tvm.ir.assert_structural_equal(mod, After)
 
 
-def test_vectorize_with_if_cond_int64():
-    m = te.size_var("m", dtype="int64")
-    A = te.placeholder((m,), name="A", dtype="float32")
-    B = te.compute((m,), lambda i: te.if_then_else(i < 2, A[i], A[i] * 2), name="B")
-    s = te.create_schedule(B.op)
-    x, y = s[B].split(B.op.axis[0], factor=4)
-    s[B].vectorize(y)
-    f = tvm.build(s, [A, B], "llvm")
-
-
 @pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)])
 def test_vectorize_let(extent, target):
     @I.ir_module
@@ -371,10 +361,9 @@ def test_ir(A, B, C):
         name="while_vectorize",
         dtype=dtype,
     )
-    s = te.create_schedule(C.op)
 
     try:
-        tvm.lower(s, [A, B, C], "llvm")
+        tvm.build(te.create_prim_func([A, B, C]), target="llvm")
         assert False
     except tvm.error.TVMError as e:
         error_msg = str(e).split("\n")[-1]
@@ -382,14 +371,6 @@ def test_ir(A, B, C):
         assert expected in error_msg
 
 
-def test_vectorize_dtype_mismatch():
-    n = tvm.tir.IntImm("int64", 4)
-    A = te.compute((n,), lambda i: tvm.tir.IntImm("int64", 2**31 - 1) + i, name="A")
-    s = te.create_schedule(A.op)
-    s[A].vectorize(A.op.axis[0])
-    tvm.lower(s, [A], "llvm", simple_mode=True)
-
-
 @pytest.mark.parametrize(
     "extent, vec_str, target",
     [(16, "float32x16", simple_target), (T.vscale() * 8, "float32xvscalex8", sve_target)],
@@ -815,7 +796,7 @@ def main(A: T.Buffer((25,), "float32"), B: T.Buffer((25,), "float32")):
     with tvm.target.Target(target):
         mod = tvm.tir.transform.VectorizeLoop()(Before)
         tvm.ir.assert_structural_equal(mod, After)
-        mod = tvm.build(mod, target)
+        mod = tvm.build(mod, target=target)
 
 
 @pytest.mark.parametrize(
@@ -843,7 +824,7 @@ def main(A: T.Buffer((25,), "int32"), B: T.Buffer((25,), "float32")):
     with pytest.raises(Exception) as e_info:
         with tvm.target.Target(target):
             mod = tvm.tir.transform.VectorizeLoop()(Before)
-            ex = tvm.build(mod, target)
+            ex = tvm.build(mod, target=target)
     tvm.ir.assert_structural_equal(mod, After)
     assert "Intrinsic does not support vectors" in e_info.value.args[0]
 
diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py
index 6a87c1bbe556..e831afd9d3f8 100644
--- a/web/tests/python/webgpu_rpc_test.py
+++ b/web/tests/python/webgpu_rpc_test.py
@@ -24,7 +24,6 @@
 from tvm import te
 from tvm import rpc
 from tvm.contrib import utils, tvmjs
-from tvm.relay.backend import Runtime
 import numpy as np
 
 proxy_host = "127.0.0.1"
@@ -48,7 +47,7 @@ def test_rpc():
     sch.bind(i0, "blockIdx.x")
     sch.bind(i1, "threadIdx.x")
 
-    fadd = tvm.build(sch.mod, target=target, runtime=runtime)
+    fadd = tvm.build(sch.mod.with_attr("system_lib_prefix", ""), target=target)
     temp = utils.tempdir()
 
     wasm_path = temp.relpath("addone_gpu.wasm")
diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py
deleted file mode 100644
index f7011cef4723..000000000000
--- a/web/tests/python/websock_rpc_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Simple testcode to test Javascript RPC
-
-To use it, start a rpc proxy with "python -m tvm.exec.rpc_proxy".
-Connect javascript end to the websocket port and connect to the RPC.
-"""
-
-import tvm
-from tvm import te
-from tvm import rpc
-from tvm.contrib import utils, tvmjs
-from tvm.relay.backend import Runtime
-import numpy as np
-
-proxy_host = "127.0.0.1"
-proxy_port = 9090
-
-
-def test_rpc():
-    if not tvm.runtime.enabled("rpc"):
-        return
-    # generate the wasm library
-    runtime = Runtime("cpp", {"system-lib": True})
-    target = "llvm -mtriple=wasm32-unknown-unknown-wasm"
-    if not tvm.runtime.enabled(target):
-        raise RuntimeError("Target %s is not enbaled" % target)
-    n = te.var("n")
-    A = te.placeholder((n,), name="A")
-    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
-    s = te.create_schedule(B.op)
-
-    fadd = tvm.build(s, [A, B], target, runtime=runtime, name="addone")
-    temp = utils.tempdir()
-
-    wasm_path = temp.relpath("addone.wasm")
-    fadd.export_library(wasm_path, fcompile=tvmjs.create_tvmjs_wasm)
-
-    wasm_binary = open(wasm_path, "rb").read()
-
-    remote = rpc.connect(
-        proxy_host,
-        proxy_port,
-        key="wasm",
-        session_constructor_args=["rpc.WasmSession", wasm_binary],
-    )
-
-    def check(remote):
-        # basic function checks.
-        faddone = remote.get_function("testing.asyncAddOne")
-        fecho = remote.get_function("testing.echo")
-        assert faddone(100) == 101
-        assert fecho(1, 2, 3) == 1
-        assert fecho(1, 2, 3) == 1
-        assert fecho(100, 2, 3) == 100
-        assert fecho("xyz") == "xyz"
-        assert bytes(fecho(bytearray(b"123"))) == b"123"
-        # run the generated library.
-        f1 = remote.system_lib()
-        dev = remote.cpu(0)
-        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev)
-        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
-        # invoke the function
-        addone = f1.get_function("addone")
-        addone(a, b)
-
-        # time evaluator
-        time_f = f1.time_evaluator("addone", dev, number=100, repeat=10)
-        time_f(a, b)
-        cost = time_f(a, b).mean
-        print("%g secs/op" % cost)
-        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
-
-    check(remote)
-
-
-test_rpc()

From 5423ae11b940b66479496b83c9a4fb33b99ec799 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Feb 2025 19:37:13 -0500
Subject: [PATCH 2/5] Simplify CI to focus on UT

The main rationale is that we should only have very few
target-dependent unit tests (UT) in tests/python/codegen, and possibly
a new category in the future for op-level integration if needed.
---
 ci/jenkins/generated/arm_jenkinsfile.groovy   | 273 +---------
 ci/jenkins/generated/cpu_jenkinsfile.groovy   | 232 +-------
 .../generated/hexagon_jenkinsfile.groovy      | 509 +-----------------
 .../templates/arm_jenkinsfile.groovy.j2       |  22 -
 .../templates/cpu_jenkinsfile.groovy.j2       |  17 +-
 .../templates/hexagon_jenkinsfile.groovy.j2   |  19 -
 tests/scripts/ci.py                           |   1 -
 7 files changed, 23 insertions(+), 1050 deletions(-)

diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy
index 5e48cc65004b..03ea3a028040 100644
--- a/ci/jenkins/generated/arm_jenkinsfile.groovy
+++ b/ci/jenkins/generated/arm_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.162250
+// Generated at 2025-02-15T20:02:41.820729
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -545,274 +545,3 @@ def build() {
   }
 }
 build()
-
-
-
-def shard_run_integration_aarch64_1_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 1 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_2_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 2 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_3_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 3 of 4')
-  }
-}
-
-def shard_run_integration_aarch64_4_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_arm)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=arm',
-            'TEST_STEP_NAME=integration: aarch64',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=3',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: aarch64 4 of 4')
-  }
-}
-
-
-
-def test() {
-  stage('Test') {
-    environment {
-      SKIP_SLOW_TESTS = "${skip_slow_tests}"
-    }
-    parallel(
-    'integration: aarch64 1 of 4': {
-      try {
-      shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 2 of 4': {
-      try {
-      shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 3 of 4': {
-      try {
-      shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: aarch64 4 of 4': {
-      try {
-      shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    )
-  }
-}
-test()
diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy
index b54fdf51ca3c..627bb85862f3 100644
--- a/ci/jenkins/generated/cpu_jenkinsfile.groovy
+++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.181874
+// Generated at 2025-02-15T19:40:24.687837
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -553,158 +553,21 @@ build()
 
 
 
-def shard_run_integration_CPU_1_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 1 of 4')
-  }
-}
-
-def shard_run_integration_CPU_2_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 2 of 4')
-  }
-}
-
-def shard_run_integration_CPU_3_of_4(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_cpu)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('integration: CPU 3 of 4')
-  }
-}
 
-def shard_run_integration_CPU_4_of_4(node_type) {
+def shard_run_unittest_CPU_1_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
         // NOTE: if exception happens, it will be caught outside
         init_git()
         docker_init(ci_cpu)
         timeout(time: max_time, unit: 'MINUTES') {
           withEnv([
             'PLATFORM=cpu',
-            'TEST_STEP_NAME=integration: CPU',
-            'TVM_NUM_SHARDS=4',
-            'TVM_SHARD_INDEX=3',
+            'TEST_STEP_NAME=unittest: CPU',
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=0',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
                   script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
@@ -712,16 +575,14 @@ def shard_run_integration_CPU_4_of_4(node_type) {
                 )
 
               ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
+              cpp_unittest(ci_cpu)
+              python_unittest(ci_cpu)
           })
         }
         // only run upload if things are successful
         try {
           sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results",
+            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU --items build/pytest-results",
             label: 'Upload JUnits to S3',
           )
 
@@ -733,13 +594,11 @@ def shard_run_integration_CPU_4_of_4(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('integration: CPU 4 of 4')
+    Utils.markStageSkippedForConditional('unittest: CPU 1 of 2')
   }
 }
 
-
-
-def shard_run_unittest_CPU_1_of_1(node_type) {
+def shard_run_unittest_CPU_2_of_2(node_type) {
   echo 'Begin running on node_type ' + node_type
   if (!skip_ci && is_docs_only_build != 1) {
     node(node_type) {
@@ -751,8 +610,8 @@ def shard_run_unittest_CPU_1_of_1(node_type) {
           withEnv([
             'PLATFORM=cpu',
             'TEST_STEP_NAME=unittest: CPU',
-            'TVM_NUM_SHARDS=1',
-            'TVM_SHARD_INDEX=0',
+            'TVM_NUM_SHARDS=2',
+            'TVM_SHARD_INDEX=1',
             "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
             sh(
                   script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu",
@@ -779,7 +638,7 @@ def shard_run_unittest_CPU_1_of_1(node_type) {
     }
     echo 'End running on node_type ' + node_type
   } else {
-    Utils.markStageSkippedForConditional('unittest: CPU 1 of 1')
+    Utils.markStageSkippedForConditional('unittest: CPU 2 of 2')
   }
 }
 
@@ -790,60 +649,9 @@ def test() {
       SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
     parallel(
-    'integration: CPU 1 of 4': {
-      try {
-      shard_run_integration_CPU_1_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_1_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 2 of 4': {
-      try {
-      shard_run_integration_CPU_2_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_2_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 3 of 4': {
-      try {
-      shard_run_integration_CPU_3_of_4('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_3_of_4('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'integration: CPU 4 of 4': {
+    'unittest: CPU 1 of 2': {
       try {
-      shard_run_integration_CPU_4_of_4('CPU-SMALL-SPOT')
+      shard_run_unittest_CPU_1_of_2('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
         if (is_last_build()) {
           // retry if at last build
@@ -851,16 +659,16 @@ def test() {
           // and try again via on demand node
           echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
           currentBuild.result = 'SUCCESS'
-          shard_run_integration_CPU_4_of_4('CPU-SMALL')
+          shard_run_unittest_CPU_1_of_2('CPU-SMALL')
         } else {
           echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
           throw ex
         }
       }
     },
-    'unittest: CPU 1 of 1': {
+    'unittest: CPU 2 of 2': {
       try {
-      shard_run_unittest_CPU_1_of_1('CPU-SMALL-SPOT')
+      shard_run_unittest_CPU_2_of_2('CPU-SMALL-SPOT')
       } catch (Throwable ex) {
         if (is_last_build()) {
           // retry if at last build
@@ -868,7 +676,7 @@ def test() {
           // and try again via on demand node
           echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
           currentBuild.result = 'SUCCESS'
-          shard_run_unittest_CPU_1_of_1('CPU-SMALL')
+          shard_run_unittest_CPU_2_of_2('CPU-SMALL')
         } else {
           echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
           throw ex
diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
index da20f33bbb3d..a9014337a74a 100644
--- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy
+++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy
@@ -60,7 +60,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2025-02-15T10:14:10.056677
+// Generated at 2025-02-15T19:31:36.031215
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // These are set at runtime from data in ci/jenkins/docker-images.yml, update
@@ -552,519 +552,12 @@ build()
 
 
 
-
-def shard_run_test_Hexagon_1_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=0',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              cpp_unittest(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 1 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_2_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=1',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 2 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_3_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=2',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 3 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_4_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=3',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 4 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_5_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=4',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 5 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_6_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=5',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 6 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_7_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=6',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 7 of 8')
-  }
-}
-
-def shard_run_test_Hexagon_8_of_8(node_type) {
-  echo 'Begin running on node_type ' + node_type
-  if (!skip_ci && is_docs_only_build != 1) {
-    node(node_type) {
-      ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") {
-        // NOTE: if exception happens, it will be caught outside
-        init_git()
-        docker_init(ci_hexagon)
-        timeout(time: max_time, unit: 'MINUTES') {
-          withEnv([
-            'PLATFORM=hexagon',
-            'TEST_STEP_NAME=test: Hexagon',
-            'TVM_NUM_SHARDS=8',
-            'TVM_SHARD_INDEX=7',
-            "SKIP_SLOW_TESTS=${skip_slow_tests}"], {
-            sh(
-                  script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon",
-                  label: 'Download artifacts from S3',
-                )
-
-              ci_setup(ci_hexagon)
-              sh (
-                script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-                label: 'Run Hexagon tests',
-              )
-          })
-        }
-        // only run upload if things are successful
-        try {
-          sh(
-            script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results",
-            label: 'Upload JUnits to S3',
-          )
-
-          junit 'build/pytest-results/*.xml'
-        } catch (Exception e) {
-          echo 'Exception during JUnit upload: ' + e.toString()
-        }
-      }
-    }
-    echo 'End running on node_type ' + node_type
-  } else {
-    Utils.markStageSkippedForConditional('test: Hexagon 8 of 8')
-  }
-}
-
-
 def test() {
   stage('Test') {
     environment {
       SKIP_SLOW_TESTS = "${skip_slow_tests}"
     }
     parallel(
-    'test: Hexagon 1 of 8': {
-      try {
-      shard_run_test_Hexagon_1_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_1_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 2 of 8': {
-      try {
-      shard_run_test_Hexagon_2_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_2_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 3 of 8': {
-      try {
-      shard_run_test_Hexagon_3_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_3_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 4 of 8': {
-      try {
-      shard_run_test_Hexagon_4_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_4_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 5 of 8': {
-      try {
-      shard_run_test_Hexagon_5_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_5_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 6 of 8': {
-      try {
-      shard_run_test_Hexagon_6_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_6_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 7 of 8': {
-      try {
-      shard_run_test_Hexagon_7_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_7_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
-    'test: Hexagon 8 of 8': {
-      try {
-      shard_run_test_Hexagon_8_of_8('CPU-SMALL-SPOT')
-      } catch (Throwable ex) {
-        if (is_last_build()) {
-          // retry if at last build
-          // mark the current stage as success
-          // and try again via on demand node
-          echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand'
-          currentBuild.result = 'SUCCESS'
-          shard_run_test_Hexagon_8_of_8('CPU-SMALL')
-        } else {
-          echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build'
-          throw ex
-        }
-      }
-    },
     )
   }
 }
diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
index aa999408a7e2..0781bc92dbe5 100644
--- a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2
@@ -33,25 +33,3 @@
   make_cpp_tests(ci_arm, 'build')
   {{ m.upload_artifacts(tag='arm', filenames=tvm_lib + cpptest) }}
 {% endcall %}
-
-{% set test_method_names = [] %}
-
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: aarch64",
-  num_shards=4,
-  ws="tvm/ut-python-arm",
-  platform="arm",
-  docker_image="ci_arm",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='arm') }}
-  ci_setup(ci_arm)
-  python_unittest(ci_arm)
-  sh (
-    script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
-
-
-{{ m.invoke_tests(node="ARM-GRAVITON3", test_method_names=test_method_names) -}}
diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
index e34132c94111..c84b0c48a29f 100644
--- a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2
@@ -40,27 +40,12 @@
 
 {% set test_method_names = [] %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="integration: CPU",
-  num_shards=4,
-  ws="tvm/integration-python-cpu",
-  platform="cpu",
-  docker_image="ci_cpu",
-  test_method_names=test_method_names,
-) %}
-  {{ m.download_artifacts(tag='cpu') }}
-  ci_setup(ci_cpu)
-  sh (
-    script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-    label: 'Run CPU integration tests',
-  )
-{% endcall %}
 
 {% call(shard_index, num_shards) m.sharded_test_step(
   name="unittest: CPU",
   ws="tvm/ut-python-cpu",
   platform="cpu",
-  num_shards=1,
+  num_shards=2,
   docker_image="ci_cpu",
   test_method_names=test_method_names,
 ) %}
diff --git a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
index 91d3ce9ece42..b4177b332987 100644
--- a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
+++ b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2
@@ -41,23 +41,4 @@
 
 {% set test_method_names = [] %}
 
-{% call(shard_index, num_shards) m.sharded_test_step(
-  name="test: Hexagon",
-  ws="tvm/test-hexagon",
-  platform="hexagon",
-  docker_image="ci_hexagon",
-  test_method_names=test_method_names,
-  num_shards=8,
-) %}
-  {{ m.download_artifacts(tag='hexagon') }}
-  ci_setup(ci_hexagon)
-  {% if shard_index == 1 %}
-  cpp_unittest(ci_hexagon)
-  {% endif %}
-  sh (
-    script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh",
-    label: 'Run Hexagon tests',
-  )
-{% endcall %}
-
 {{ m.invoke_tests(node="CPU-SMALL", test_method_names=test_method_names) -}}
diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py
index 0bd97e4ee048..10d63129121f 100755
--- a/tests/scripts/ci.py
+++ b/tests/scripts/ci.py
@@ -683,7 +683,6 @@ def add_subparser(
                 "run full Python tests",
                 [
                     "./tests/scripts/task_python_unittest.sh",
-                    "./tests/scripts/task_python_arm_compute_library.sh",
                 ],
             ),
         },

From 3c670ba81c65e347e29d8d2d94f96c67b219a39a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Feb 2025 20:05:50 -0500
Subject: [PATCH 3/5] Re-enable wasm

---
 tests/scripts/task_web_wasm.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh
index 91bbbac52300..8a08c1ecb58d 100755
--- a/tests/scripts/task_web_wasm.sh
+++ b/tests/scripts/task_web_wasm.sh
@@ -25,9 +25,8 @@ cd web
 make clean
 npm install
 npm run lint
-# TODO(@tqchen, @siyuan): re-enable the following tests
-# npm run prepwasm
-# npm run bundle
-# npm run test
-# npm run typedoc
+npm run prepwasm
+npm run bundle
+npm run test
+npm run typedoc
 cd ..

From 048336ab3565cd0d7c1c5e70d6ee60ae7c7b83f9 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Sat, 15 Feb 2025 21:02:29 -0500
Subject: [PATCH 4/5] fix lint

---
 python/tvm/driver/build_module.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py
index 48909e9832e1..94006111ffa2 100644
--- a/python/tvm/driver/build_module.py
+++ b/python/tvm/driver/build_module.py
@@ -24,7 +24,6 @@
 
 
 from tvm.runtime import ndarray
-from tvm.ir import container
 from tvm.tir import PrimFunc
 from tvm.ir.module import IRModule
 from tvm.target import Target

From 24bbeb7e3f5ce450b0a69075144532ea567b8871 Mon Sep 17 00:00:00 2001
From: Siyuan Feng <hzfengsy@sjtu.edu.cn>
Date: Sun, 16 Feb 2025 13:12:40 +0800
Subject: [PATCH 5/5] remove hybrid,sparse autodoc and remove tests

---
 docs/reference/api/python/contrib.rst | 5 -----
 docs/reference/api/python/te.rst      | 8 --------
 tests/scripts/task_java_unittest.sh   | 4 ++--
 3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/docs/reference/api/python/contrib.rst b/docs/reference/api/python/contrib.rst
index 0eb3024c2d08..e85d3bec5caf 100644
--- a/docs/reference/api/python/contrib.rst
+++ b/docs/reference/api/python/contrib.rst
@@ -104,11 +104,6 @@ tvm.contrib.rocm
 .. automodule:: tvm.contrib.rocm
     :members:
 
-tvm.contrib.sparse
-~~~~~~~~~~~~~~~~~~
-.. automodule:: tvm.contrib.sparse
-    :members:
-
 
 tvm.contrib.spirv
 ~~~~~~~~~~~~~~~~~
diff --git a/docs/reference/api/python/te.rst b/docs/reference/api/python/te.rst
index 83e0042db1b9..363dae675d84 100644
--- a/docs/reference/api/python/te.rst
+++ b/docs/reference/api/python/te.rst
@@ -23,11 +23,3 @@ tvm.te
    :members:
    :imported-members:
    :autosummary:
-
-
-tvm.te.hybrid
--------------
-.. automodule:: tvm.te.hybrid
-   :members:
-   :imported-members:
-   :autosummary:
diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh
index a35b023ad0df..2eabac31cc28 100755
--- a/tests/scripts/task_java_unittest.sh
+++ b/tests/scripts/task_java_unittest.sh
@@ -35,8 +35,8 @@ cleanup()
 }
 trap cleanup 0
 
-python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
-python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
+# python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR"
+# python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR"
 
 # Skip the Java RPC Unittests, see https://github.com/apache/tvm/issues/13168
 # # start rpc proxy server